Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
5ed844b
1
Parent(s):
9e8c029
Added examples for structured summaries and groups. Adapted functions for structured summaries. Simplified front tab GUI
Browse files- Dockerfile +2 -2
- README.md +1 -1
- app.py +93 -67
- example_data/case_note_headers_specific.csv +7 -0
- example_data/{dummy_consultation_r_zero_shot_col_Response_text_Qwen_3_4B_topic_analysis.xlsx → combined_case_notes_col_Case_Note_Gemma_3_4B_structured_summaries.xlsx} +2 -2
- example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis_grouped.xlsx +3 -0
- example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis_zero_shot.xlsx +3 -0
- pyproject.toml +1 -1
- tools/combine_sheets_into_xlsx.py +121 -40
- tools/config.py +1 -1
- tools/dedup_summaries.py +1 -1
- tools/example_table_outputs.py +60 -27
- tools/llm_api_call.py +167 -69
- tools/prompts.py +5 -4
Dockerfile
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# This Dockerfile is optimised for AWS ECS using Python 3.11, and assumes CPU inference with OpenBLAS for local models.
|
| 2 |
# Stage 1: Build dependencies and download models
|
| 3 |
-
FROM public.ecr.aws/docker/library/python:3.11.13-slim-
|
| 4 |
|
| 5 |
# Install system dependencies.
|
| 6 |
RUN apt-get update && apt-get install -y \
|
|
@@ -30,7 +30,7 @@ RUN pip install --no-cache-dir --target=/install torch==2.7.1+cpu --extra-index-
|
|
| 30 |
RUN rm requirements_no_local.txt
|
| 31 |
|
| 32 |
# Stage 2: Final runtime image
|
| 33 |
-
FROM public.ecr.aws/docker/library/python:3.11.13-slim-
|
| 34 |
|
| 35 |
# Install system dependencies.
|
| 36 |
RUN apt-get update \
|
|
|
|
| 1 |
# This Dockerfile is optimised for AWS ECS using Python 3.11, and assumes CPU inference with OpenBLAS for local models.
|
| 2 |
# Stage 1: Build dependencies and download models
|
| 3 |
+
FROM public.ecr.aws/docker/library/python:3.11.13-slim-trixie AS builder
|
| 4 |
|
| 5 |
# Install system dependencies.
|
| 6 |
RUN apt-get update && apt-get install -y \
|
|
|
|
| 30 |
RUN rm requirements_no_local.txt
|
| 31 |
|
| 32 |
# Stage 2: Final runtime image
|
| 33 |
+
FROM public.ecr.aws/docker/library/python:3.11.13-slim-trixie
|
| 34 |
|
| 35 |
# Install system dependencies.
|
| 36 |
RUN apt-get update \
|
README.md
CHANGED
|
@@ -21,7 +21,7 @@ Basic use:
|
|
| 21 |
2. Select the relevant open text column from the dropdown.
|
| 22 |
3. If you have your own suggested (zero shot) topics, upload this (see examples folder for an example file)
|
| 23 |
4. Write a one sentence description of the consultation/context of the open text.
|
| 24 |
-
5. Click '
|
| 25 |
6. A summary xlsx file workbook will be created on the front page in the box 'Overall summary xlsx file'. This will combine all the results from the different processes into one workbook.
|
| 26 |
|
| 27 |
# Installation guide
|
|
|
|
| 21 |
2. Select the relevant open text column from the dropdown.
|
| 22 |
3. If you have your own suggested (zero shot) topics, upload this (see examples folder for an example file)
|
| 23 |
4. Write a one sentence description of the consultation/context of the open text.
|
| 24 |
+
5. Click 'Extract topics, deduplicate, and summarise'. This will run through the whole analysis process from topic extraction, to topic deduplication, to topic-level and overall summaries.
|
| 25 |
6. A summary xlsx file workbook will be created on the front page in the box 'Overall summary xlsx file'. This will combine all the results from the different processes into one workbook.
|
| 26 |
|
| 27 |
# Installation guide
|
app.py
CHANGED
|
@@ -10,7 +10,7 @@ from tools.dedup_summaries import sample_reference_table_summaries, summarise_ou
|
|
| 10 |
from tools.combine_sheets_into_xlsx import collect_output_csvs_and_create_excel_output
|
| 11 |
from tools.custom_csvlogger import CSVLogger_custom
|
| 12 |
from tools.auth import authenticate_user
|
| 13 |
-
from tools.example_table_outputs import dummy_consultation_table, case_notes_table, dummy_consultation_table_zero_shot
|
| 14 |
from tools.prompts import initial_table_prompt, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
|
| 15 |
# from tools.verify_titles import verify_titles
|
| 16 |
from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN, AZURE_API_KEY, LLM_TEMPERATURE
|
|
@@ -59,10 +59,13 @@ else: default_model_choice = "gemini-2.5-flash"
|
|
| 59 |
in_data_files = gr.File(height=FILE_INPUT_HEIGHT, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 60 |
in_colnames = gr.Dropdown(choices=[""], multiselect = False, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
|
| 61 |
context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
|
| 62 |
-
topic_extraction_output_files_xlsx = gr.File(label="Overall summary xlsx file", scale=1, interactive=False)
|
| 63 |
display_topic_table_markdown = gr.Markdown(value="", show_copy_button=True)
|
| 64 |
output_messages_textbox = gr.Textbox(value="", label="Output messages", scale=1, interactive=False, lines=4)
|
| 65 |
-
candidate_topics = gr.File(height=FILE_INPUT_HEIGHT, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General topic' and/or 'Subtopic' will allow for these columns to be suggested to the model. If a third column is present, it will be assumed to be a topic description.")
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
# Create the gradio interface
|
| 68 |
app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True)
|
|
@@ -164,64 +167,78 @@ with app:
|
|
| 164 |
NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** to check for harmful outputs, hallucinations, and accuracy.""")
|
| 165 |
|
| 166 |
if SHOW_EXAMPLES == "True":
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
-
with gr.Tab(label="
|
| 175 |
-
gr.Markdown("""### Choose a tabular data file (xlsx, csv, or parquet) of open text to extract topics from.""")
|
| 176 |
with gr.Row():
|
| 177 |
-
model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="
|
| 178 |
|
| 179 |
-
with gr.Accordion("Upload xlsx or
|
| 180 |
-
#in_data_files = gr.File(height=FILE_INPUT_HEIGHT, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 181 |
in_data_files.render()
|
| 182 |
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
in_colnames.render()
|
| 186 |
|
| 187 |
-
with gr.Accordion("Group analysis by
|
| 188 |
-
in_group_col
|
| 189 |
|
| 190 |
-
with gr.Accordion("
|
| 191 |
candidate_topics.render()
|
| 192 |
with gr.Row(equal_height=True):
|
| 193 |
-
force_zero_shot_radio = gr.Radio(label="Force responses into
|
| 194 |
-
force_single_topic_radio = gr.Radio(label="Ask the model to assign responses to only a single topic", value="No", choices=["Yes", "No"])
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative or Positive", choices=["Negative or Positive", "Negative, Neutral, or Positive", "Do not assess sentiment"])
|
| 201 |
|
| 202 |
|
| 203 |
if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
|
| 204 |
with gr.Accordion("Assign task to cost code", open = True, visible=True):
|
| 205 |
gr.Markdown("Please ensure that you have approval from your budget holder before using this app for redaction tasks that incur a cost.")
|
| 206 |
-
with gr.Row():
|
| 207 |
-
cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
|
| 208 |
with gr.Column():
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
| 210 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
|
| 211 |
|
| 212 |
-
all_in_one_btn = gr.Button("
|
| 213 |
-
extract_topics_btn = gr.Button("1. Extract topics", variant="secondary")
|
| 214 |
|
| 215 |
with gr.Row(equal_height=True):
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
topic_extraction_output_files = gr.File(label="Extract topics output files", scale=1, interactive=False)
|
| 220 |
-
topic_extraction_output_files_xlsx.render()
|
| 221 |
|
| 222 |
-
#display_topic_table_markdown = gr.Markdown(value="", show_copy_button=True)
|
| 223 |
display_topic_table_markdown.render()
|
| 224 |
-
|
| 225 |
|
| 226 |
data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
|
| 227 |
data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the topic extraction.",
|
|
@@ -232,17 +249,24 @@ with app:
|
|
| 232 |
with gr.Row():
|
| 233 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
| 234 |
|
| 235 |
-
with gr.Tab(label="
|
| 236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
-
with gr.Accordion("Modify existing topics", open = False):
|
| 239 |
modification_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 240 |
|
| 241 |
modifiable_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=(4, "fixed"), row_count = (1, "fixed"), visible=True, type="pandas")
|
| 242 |
|
| 243 |
save_modified_files_button = gr.Button(value="Save modified topic names")
|
| 244 |
|
| 245 |
-
with gr.Accordion("Deduplicate topics - upload reference data file and unique data files", open =
|
| 246 |
### DEDUPLICATION
|
| 247 |
deduplication_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 248 |
deduplication_input_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
|
|
@@ -252,35 +276,36 @@ with app:
|
|
| 252 |
merge_sentiment_drop = gr.Dropdown(label="Merge sentiment values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
|
| 253 |
deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
|
| 254 |
|
| 255 |
-
deduplicate_previous_data_btn = gr.Button("
|
| 256 |
|
|
|
|
| 257 |
### SUMMARISATION
|
| 258 |
summarisation_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 259 |
|
| 260 |
summarise_format_radio = gr.Radio(label="Choose summary type", value=two_para_summary_format_prompt, choices=[two_para_summary_format_prompt, single_para_summary_format_prompt])
|
| 261 |
|
| 262 |
-
summarise_previous_data_btn = gr.Button("
|
| 263 |
with gr.Row():
|
| 264 |
summary_output_files = gr.File(height=FILE_INPUT_HEIGHT, label="Summarised output files", interactive=False, scale=3)
|
| 265 |
summary_output_files_xlsx = gr.File(height=FILE_INPUT_HEIGHT, label="xlsx file summary", interactive=False, scale=1)
|
| 266 |
|
| 267 |
summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here", show_copy_button=True)
|
| 268 |
|
| 269 |
-
|
| 270 |
-
|
| 271 |
|
| 272 |
-
|
| 273 |
-
|
| 274 |
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
|
| 285 |
with gr.Tab(label="Topic table viewer", visible=False):
|
| 286 |
gr.Markdown("""### View a 'unique_topic_table' csv file in markdown format.""")
|
|
@@ -299,8 +324,9 @@ with app:
|
|
| 299 |
with gr.Tab(label="LLM and topic extraction settings"):
|
| 300 |
gr.Markdown("""Define settings that affect large language model output.""")
|
| 301 |
with gr.Accordion("Settings for LLM generation", open = True):
|
| 302 |
-
|
| 303 |
-
|
|
|
|
| 304 |
random_seed = gr.Number(value=LLM_SEED, label="Random seed for LLM generation", visible=False)
|
| 305 |
|
| 306 |
with gr.Accordion("AWS API keys", open = False):
|
|
@@ -403,7 +429,7 @@ with app:
|
|
| 403 |
force_zero_shot_radio,
|
| 404 |
in_excel_sheets,
|
| 405 |
force_single_topic_radio,
|
| 406 |
-
|
| 407 |
aws_access_key_textbox,
|
| 408 |
aws_secret_key_textbox,
|
| 409 |
hf_api_key_textbox,
|
|
@@ -435,7 +461,7 @@ with app:
|
|
| 435 |
logged_content_df],
|
| 436 |
api_name="extract_topics", show_progress_on=output_messages_textbox).\
|
| 437 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False, api_name="usage_logs").\
|
| 438 |
-
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state], outputs=[topic_extraction_output_files_xlsx, summary_xlsx_output_files_list])
|
| 439 |
|
| 440 |
###
|
| 441 |
# DEDUPLICATION AND SUMMARISATION FUNCTIONS
|
|
@@ -457,14 +483,14 @@ with app:
|
|
| 457 |
success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
|
| 458 |
success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, google_api_key_textbox, temperature_slide, working_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox, aws_access_key_textbox, aws_secret_key_textbox, model_name_map_state, hf_api_key_textbox, logged_content_df], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, output_messages_textbox, logged_content_df], api_name="summarise_topics", show_progress_on=[output_messages_textbox, summary_output_files]).\
|
| 459 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False).\
|
| 460 |
-
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_revised_summaries_state, master_unique_topics_df_revised_summaries_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state], outputs=[summary_output_files_xlsx, summary_xlsx_output_files_list])
|
| 461 |
|
| 462 |
# SUMMARISE WHOLE TABLE PAGE
|
| 463 |
overall_summarise_previous_data_btn.click(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
|
| 464 |
success(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 465 |
success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, google_api_key_textbox, temperature_slide, working_data_file_name_textbox, output_folder_state, in_colnames, context_textbox, aws_access_key_textbox, aws_secret_key_textbox, model_name_map_state, hf_api_key_textbox, logged_content_df], outputs=[overall_summary_output_files, overall_summarised_output_markdown, summarised_output_df, conversation_metadata_textbox, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, output_messages_textbox, logged_content_df], scroll_to_output=True, api_name="overall_summary", show_progress_on=[output_messages_textbox, overall_summary_output_files]).\
|
| 466 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False).\
|
| 467 |
-
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state], outputs=[overall_summary_output_files_xlsx, summary_xlsx_output_files_list])
|
| 468 |
|
| 469 |
|
| 470 |
# All in one button
|
|
@@ -504,7 +530,7 @@ with app:
|
|
| 504 |
force_zero_shot_radio,
|
| 505 |
in_excel_sheets,
|
| 506 |
force_single_topic_radio,
|
| 507 |
-
|
| 508 |
aws_access_key_textbox,
|
| 509 |
aws_secret_key_textbox,
|
| 510 |
hf_api_key_textbox,
|
|
@@ -559,7 +585,7 @@ with app:
|
|
| 559 |
show_progress_on=[output_messages_textbox], api_name="all_in_one_pipeline"
|
| 560 |
).\
|
| 561 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False).\
|
| 562 |
-
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_revised_summaries_state, master_unique_topics_df_revised_summaries_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state], outputs=[overall_summary_output_files_xlsx, summary_xlsx_output_files_list]).\
|
| 563 |
success(move_overall_summary_output_files_to_front_page, inputs=[summary_xlsx_output_files_list], outputs=[topic_extraction_output_files_xlsx])
|
| 564 |
|
| 565 |
###
|
|
@@ -590,7 +616,7 @@ with app:
|
|
| 590 |
success(fn=join_cols_onto_reference_df, inputs=[master_reference_df_state, file_data_state, join_colnames, reference_df_data_file_name_textbox], outputs=[master_reference_df_state_joined, out_join_files])
|
| 591 |
|
| 592 |
# Export to xlsx file
|
| 593 |
-
export_xlsx_btn.click(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state], outputs=[out_xlsx_files, summary_xlsx_output_files_list], api_name="export_xlsx")
|
| 594 |
|
| 595 |
# If relevant environment variable is set, load in the default cost code file from S3 or locally
|
| 596 |
if GET_COST_CODES == "True" and (COST_CODES_PATH or S3_COST_CODES_PATH):
|
|
|
|
| 10 |
from tools.combine_sheets_into_xlsx import collect_output_csvs_and_create_excel_output
|
| 11 |
from tools.custom_csvlogger import CSVLogger_custom
|
| 12 |
from tools.auth import authenticate_user
|
| 13 |
+
from tools.example_table_outputs import dummy_consultation_table, case_notes_table, dummy_consultation_table_zero_shot, case_notes_table_grouped, case_notes_table_structured_summary
|
| 14 |
from tools.prompts import initial_table_prompt, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
|
| 15 |
# from tools.verify_titles import verify_titles
|
| 16 |
from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN, AZURE_API_KEY, LLM_TEMPERATURE
|
|
|
|
| 59 |
in_data_files = gr.File(height=FILE_INPUT_HEIGHT, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 60 |
in_colnames = gr.Dropdown(choices=[""], multiselect = False, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
|
| 61 |
context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")
|
| 62 |
+
topic_extraction_output_files_xlsx = gr.File(label="Overall summary xlsx file", scale=1, interactive=False, file_count="multiple")
|
| 63 |
display_topic_table_markdown = gr.Markdown(value="", show_copy_button=True)
|
| 64 |
output_messages_textbox = gr.Textbox(value="", label="Output messages", scale=1, interactive=False, lines=4)
|
| 65 |
+
candidate_topics = gr.File(height=FILE_INPUT_HEIGHT, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General topic' and/or 'Subtopic' will allow for these columns to be suggested to the model. If a third column is present, it will be assumed to be a topic description.", file_count="single")
|
| 66 |
+
produce_structured_summary_radio = gr.Radio(label="Ask the model to produce structured summaries using the suggested topics as headers rather than extract topics", value="No", choices=["Yes", "No"])
|
| 67 |
+
in_group_col = gr.Dropdown(multiselect = False, label="Select the column to group results by", allow_custom_value=True, interactive=True)
|
| 68 |
+
batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query (batch size)", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=50)
|
| 69 |
|
| 70 |
# Create the gradio interface
|
| 71 |
app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True)
|
|
|
|
| 167 |
NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** to check for harmful outputs, hallucinations, and accuracy.""")
|
| 168 |
|
| 169 |
if SHOW_EXAMPLES == "True":
|
| 170 |
+
def show_info_box_on_click(
    in_data_files, in_colnames, context_textbox, original_data_file_name_textbox, topic_extraction_output_files_xlsx, display_topic_table_markdown, output_messages_textbox, candidate_topics, produce_structured_summary_radio, in_group_col, batch_size_number,
):
    """Pop a toast telling the user the example data has loaded.

    The parameters mirror the components listed in `gr.Examples(inputs=...)`;
    Gradio passes their values through when `run_on_click=True`, but none of
    them are needed here — the function exists only for the notification.
    """
    gr.Info(
        "Example data loaded. Now click on the 'All in one...' button below to run the full suite of topic extraction, deduplication, and summarisation."
    )
|
| 176 |
+
|
| 177 |
+
examples = gr.Examples(examples=\
|
| 178 |
+
|
| 179 |
+
[[["example_data/dummy_consultation_response.csv"], "Response text", "Consultation for the construction of flats on Main Street", "dummy_consultation_response.csv", ["example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis.xlsx"], dummy_consultation_table, "Example output from the dummy consultation dataset successfully loaded. Download the xlsx outputs to the right to see full outputs.", None, "No", None, 5],\
|
| 180 |
+
|
| 181 |
+
[["example_data/combined_case_notes.csv"], "Case Note", "Social Care case notes for young people", "combined_case_notes.csv", ["example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis.xlsx"], case_notes_table, "Example output from the case notes dataset successfully loaded. Download the xlsx outputs to the right to see full outputs.", None, "No", None, 5],\
|
| 182 |
+
|
| 183 |
+
[["example_data/dummy_consultation_response.csv"], "Response text", "Consultation for the construction of flats on Main Street", "dummy_consultation_response.csv", ["example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis_zero_shot.xlsx"], dummy_consultation_table_zero_shot, "Example output from the dummy consultation dataset with suggested topics successfully loaded. Download the xlsx outputs to the right to see full outputs.", "example_data/dummy_consultation_response_themes.csv", "No", None, 5],\
|
| 184 |
+
|
| 185 |
+
[["example_data/combined_case_notes.csv"], "Case Note", "Social Care case notes for young people", "combined_case_notes.csv", ["example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis_grouped.xlsx"], case_notes_table_grouped, "Example data from the case notes dataset with groups successfully loaded. Download the xlsx outputs to the right to see full outputs.", "example_data/case_note_headers_specific.csv", "No", "Client", 5],\
|
| 186 |
+
|
| 187 |
+
[["example_data/combined_case_notes.csv"], "Case Note", "Social Care case notes for young people", "combined_case_notes.csv", ["example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_structured_summaries.xlsx"], case_notes_table_structured_summary, "Example data from the case notes dataset for structured summaries successfully loaded. Download the xlsx outputs to the right to see full outputs.", "example_data/case_note_headers_specific.csv", "Yes", "Client", 50]],\
|
| 188 |
+
|
| 189 |
+
inputs=[in_data_files, in_colnames, context_textbox, original_data_file_name_textbox, topic_extraction_output_files_xlsx, display_topic_table_markdown, output_messages_textbox, candidate_topics, produce_structured_summary_radio, in_group_col, batch_size_number],
|
| 190 |
+
|
| 191 |
+
example_labels=["Main Street construction consultation", "Case notes for young people", "Main Street construction consultation with suggested topics", "Case notes grouped by person with suggested topics", "Case notes structured summary with suggested topics"],
|
| 192 |
+
|
| 193 |
+
label="Try topic extraction and summarisation with an example dataset",
|
| 194 |
+
|
| 195 |
+
fn=show_info_box_on_click,
|
| 196 |
+
run_on_click=True,
|
| 197 |
+
)
|
| 198 |
|
| 199 |
+
with gr.Tab(label="All in one topic extraction and summarisation"):
|
|
|
|
| 200 |
with gr.Row():
|
| 201 |
+
model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="Large language model for topic extraction and summarisation", multiselect=False)
|
| 202 |
|
| 203 |
+
with gr.Accordion("Upload xlsx, csv, or parquet file", open = True):
|
|
|
|
| 204 |
in_data_files.render()
|
| 205 |
|
| 206 |
+
in_excel_sheets = gr.Dropdown(multiselect = False, label="Select the Excel sheet of interest.", visible=False, allow_custom_value=True)
|
| 207 |
+
in_colnames.render()
|
|
|
|
| 208 |
|
| 209 |
+
with gr.Accordion("Group analysis by values in another column", open=False):
|
| 210 |
+
in_group_col.render()
|
| 211 |
|
| 212 |
+
with gr.Accordion("Provide list of suggested topics", open = False):
|
| 213 |
candidate_topics.render()
|
| 214 |
with gr.Row(equal_height=True):
|
| 215 |
+
force_zero_shot_radio = gr.Radio(label="Force responses into suggested topics", value="No", choices=["Yes", "No"])
|
| 216 |
+
force_single_topic_radio = gr.Radio(label="Ask the model to assign responses to only a single topic", value="No", choices=["Yes", "No"])
|
| 217 |
+
produce_structured_summary_radio.render()
|
| 218 |
+
|
| 219 |
+
with gr.Accordion("Response sentiment analysis", open = False):
|
| 220 |
+
sentiment_checkbox = gr.Radio(label="Response sentiment analysis", value="Negative or Positive", choices=["Negative or Positive", "Negative, Neutral, or Positive", "Do not assess sentiment"])
|
|
|
|
|
|
|
| 221 |
|
| 222 |
|
| 223 |
if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
|
| 224 |
with gr.Accordion("Assign task to cost code", open = True, visible=True):
|
| 225 |
gr.Markdown("Please ensure that you have approval from your budget holder before using this app for redaction tasks that incur a cost.")
|
| 226 |
+
with gr.Row(equal_height=True):
|
|
|
|
| 227 |
with gr.Column():
|
| 228 |
+
with gr.Accordion("Cost code table", open = False, visible=True):
|
| 229 |
+
cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
|
| 230 |
+
reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
|
| 231 |
+
with gr.Column():
|
| 232 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
|
| 233 |
|
| 234 |
+
all_in_one_btn = gr.Button("Extract topics, deduplicate, and summarise", variant="primary")
|
|
|
|
| 235 |
|
| 236 |
with gr.Row(equal_height=True):
|
| 237 |
+
output_messages_textbox.render()
|
| 238 |
+
|
| 239 |
+
topic_extraction_output_files_xlsx.render()
|
|
|
|
|
|
|
| 240 |
|
|
|
|
| 241 |
display_topic_table_markdown.render()
|
|
|
|
| 242 |
|
| 243 |
data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
|
| 244 |
data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the topic extraction.",
|
|
|
|
| 249 |
with gr.Row():
|
| 250 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
| 251 |
|
| 252 |
+
with gr.Tab(label="Advanced - Step by step topic extraction and summarisation"):
|
| 253 |
+
|
| 254 |
+
with gr.Accordion("1. Extract topics - go to first tab for file upload, model choice, and other settings before clicking this button", open = True):
|
| 255 |
+
context_textbox.render()
|
| 256 |
+
extract_topics_btn = gr.Button("1. Extract topics", variant="secondary")
|
| 257 |
+
topic_extraction_output_files = gr.File(label="Extract topics output files", scale=1, interactive=False)
|
| 258 |
+
|
| 259 |
+
with gr.Accordion("2. Modify topics from topic extraction", open = False):
|
| 260 |
+
gr.Markdown("""Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to modify topics, deduplicate topics, or summarise the outputs. If you want pivot table outputs, please load in the original data file along with the selected open text column on the first tab before deduplicating or summarising.""")
|
| 261 |
+
|
| 262 |
|
|
|
|
| 263 |
modification_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 264 |
|
| 265 |
modifiable_unique_topics_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=(4, "fixed"), row_count = (1, "fixed"), visible=True, type="pandas")
|
| 266 |
|
| 267 |
save_modified_files_button = gr.Button(value="Save modified topic names")
|
| 268 |
|
| 269 |
+
with gr.Accordion("3. Deduplicate topics - upload reference data file and unique data files", open = False):
|
| 270 |
### DEDUPLICATION
|
| 271 |
deduplication_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 272 |
deduplication_input_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
|
|
|
|
| 276 |
merge_sentiment_drop = gr.Dropdown(label="Merge sentiment values together for duplicate subtopics.", value="No", choices=["Yes", "No"])
|
| 277 |
deduplicate_score_threshold = gr.Number(label="Similarity threshold with which to determine duplicates.", value = 90, minimum=5, maximum=100, precision=0)
|
| 278 |
|
| 279 |
+
deduplicate_previous_data_btn = gr.Button("3. Deduplicate topics", variant="primary")
|
| 280 |
|
| 281 |
+
with gr.Accordion("4. Summarise topics", open = False):
|
| 282 |
### SUMMARISATION
|
| 283 |
summarisation_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 284 |
|
| 285 |
summarise_format_radio = gr.Radio(label="Choose summary type", value=two_para_summary_format_prompt, choices=[two_para_summary_format_prompt, single_para_summary_format_prompt])
|
| 286 |
|
| 287 |
+
summarise_previous_data_btn = gr.Button("4. Summarise topics", variant="primary")
|
| 288 |
with gr.Row():
|
| 289 |
summary_output_files = gr.File(height=FILE_INPUT_HEIGHT, label="Summarised output files", interactive=False, scale=3)
|
| 290 |
summary_output_files_xlsx = gr.File(height=FILE_INPUT_HEIGHT, label="xlsx file summary", interactive=False, scale=1)
|
| 291 |
|
| 292 |
summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here", show_copy_button=True)
|
| 293 |
|
| 294 |
+
with gr.Accordion("5. Create overall summary", open = False):
|
| 295 |
+
gr.Markdown("""### Create an overall summary from an existing topic summary table.""")
|
| 296 |
|
| 297 |
+
### SUMMARISATION
|
| 298 |
+
overall_summarisation_input_files = gr.File(height=FILE_INPUT_HEIGHT, label="Upload a '...unique_topic' file to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
|
| 299 |
|
| 300 |
+
overall_summarise_format_radio = gr.Radio(label="Choose summary type", value=two_para_summary_format_prompt, choices=[two_para_summary_format_prompt, single_para_summary_format_prompt], visible=False) # This is currently an invisible placeholder in case in future I want to add in overall summarisation customisation
|
| 301 |
+
|
| 302 |
+
overall_summarise_previous_data_btn = gr.Button("5. Create overall summary", variant="primary")
|
| 303 |
|
| 304 |
+
with gr.Row():
|
| 305 |
+
overall_summary_output_files = gr.File(height=FILE_INPUT_HEIGHT, label="Summarised output files", interactive=False, scale=3)
|
| 306 |
+
overall_summary_output_files_xlsx = gr.File(height=FILE_INPUT_HEIGHT, label="xlsx file summary", interactive=False, scale=1)
|
| 307 |
+
|
| 308 |
+
overall_summarised_output_markdown = gr.HTML(value="### Overall summary will appear here")
|
| 309 |
|
| 310 |
with gr.Tab(label="Topic table viewer", visible=False):
|
| 311 |
gr.Markdown("""### View a 'unique_topic_table' csv file in markdown format.""")
|
|
|
|
| 324 |
with gr.Tab(label="LLM and topic extraction settings"):
|
| 325 |
gr.Markdown("""Define settings that affect large language model output.""")
|
| 326 |
with gr.Accordion("Settings for LLM generation", open = True):
|
| 327 |
+
with gr.Row():
|
| 328 |
+
temperature_slide = gr.Slider(minimum=0.0, maximum=1.0, value=LLM_TEMPERATURE, label="Choose LLM temperature setting", precision=1, step=0.1)
|
| 329 |
+
batch_size_number.render()
|
| 330 |
random_seed = gr.Number(value=LLM_SEED, label="Random seed for LLM generation", visible=False)
|
| 331 |
|
| 332 |
with gr.Accordion("AWS API keys", open = False):
|
|
|
|
| 429 |
force_zero_shot_radio,
|
| 430 |
in_excel_sheets,
|
| 431 |
force_single_topic_radio,
|
| 432 |
+
produce_structured_summary_radio,
|
| 433 |
aws_access_key_textbox,
|
| 434 |
aws_secret_key_textbox,
|
| 435 |
hf_api_key_textbox,
|
|
|
|
| 461 |
logged_content_df],
|
| 462 |
api_name="extract_topics", show_progress_on=output_messages_textbox).\
|
| 463 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False, api_name="usage_logs").\
|
| 464 |
+
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state, produce_structured_summary_radio], outputs=[topic_extraction_output_files_xlsx, summary_xlsx_output_files_list])
|
| 465 |
|
| 466 |
###
|
| 467 |
# DEDUPLICATION AND SUMMARISATION FUNCTIONS
|
|
|
|
| 483 |
success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
|
| 484 |
success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, google_api_key_textbox, temperature_slide, working_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox, aws_access_key_textbox, aws_secret_key_textbox, model_name_map_state, hf_api_key_textbox, logged_content_df], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, output_messages_textbox, logged_content_df], api_name="summarise_topics", show_progress_on=[output_messages_textbox, summary_output_files]).\
|
| 485 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False).\
|
| 486 |
+
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_revised_summaries_state, master_unique_topics_df_revised_summaries_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state, produce_structured_summary_radio], outputs=[summary_output_files_xlsx, summary_xlsx_output_files_list])
|
| 487 |
|
| 488 |
# SUMMARISE WHOLE TABLE PAGE
|
| 489 |
overall_summarise_previous_data_btn.click(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
|
| 490 |
success(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 491 |
success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, google_api_key_textbox, temperature_slide, working_data_file_name_textbox, output_folder_state, in_colnames, context_textbox, aws_access_key_textbox, aws_secret_key_textbox, model_name_map_state, hf_api_key_textbox, logged_content_df], outputs=[overall_summary_output_files, overall_summarised_output_markdown, summarised_output_df, conversation_metadata_textbox, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, output_messages_textbox, logged_content_df], scroll_to_output=True, api_name="overall_summary", show_progress_on=[output_messages_textbox, overall_summary_output_files]).\
|
| 492 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False).\
|
| 493 |
+
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state, produce_structured_summary_radio], outputs=[overall_summary_output_files_xlsx, summary_xlsx_output_files_list])
|
| 494 |
|
| 495 |
|
| 496 |
# All in one button
|
|
|
|
| 530 |
force_zero_shot_radio,
|
| 531 |
in_excel_sheets,
|
| 532 |
force_single_topic_radio,
|
| 533 |
+
produce_structured_summary_radio,
|
| 534 |
aws_access_key_textbox,
|
| 535 |
aws_secret_key_textbox,
|
| 536 |
hf_api_key_textbox,
|
|
|
|
| 585 |
show_progress_on=[output_messages_textbox], api_name="all_in_one_pipeline"
|
| 586 |
).\
|
| 587 |
success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False).\
|
| 588 |
+
then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_revised_summaries_state, master_unique_topics_df_revised_summaries_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state, produce_structured_summary_radio], outputs=[overall_summary_output_files_xlsx, summary_xlsx_output_files_list]).\
|
| 589 |
success(move_overall_summary_output_files_to_front_page, inputs=[summary_xlsx_output_files_list], outputs=[topic_extraction_output_files_xlsx])
|
| 590 |
|
| 591 |
###
|
|
|
|
| 616 |
success(fn=join_cols_onto_reference_df, inputs=[master_reference_df_state, file_data_state, join_colnames, reference_df_data_file_name_textbox], outputs=[master_reference_df_state_joined, out_join_files])
|
| 617 |
|
| 618 |
# Export to xlsx file
|
| 619 |
+
export_xlsx_btn.click(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state, produce_structured_summary_radio], outputs=[out_xlsx_files, summary_xlsx_output_files_list], api_name="export_xlsx")
|
| 620 |
|
| 621 |
# If relevant environment variable is set, load in the default cost code file from S3 or locally
|
| 622 |
if GET_COST_CODES == "True" and (COST_CODES_PATH or S3_COST_CODES_PATH):
|
example_data/case_note_headers_specific.csv
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
General Topic,Subtopic
Mental health,Anger
Mental health,Social issues
Physical health,General
Physical health,Substance misuse
Behaviour at school,Behaviour at school
Trends over time,Trends over time
|
example_data/{dummy_consultation_r_zero_shot_col_Response_text_Qwen_3_4B_topic_analysis.xlsx → combined_case_notes_col_Case_Note_Gemma_3_4B_structured_summaries.xlsx}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:322a081b29d4fb40ccae7d47aa74fda772a002eda576ddc98d6acc86366cff11
|
| 3 |
+
size 13502
|
example_data/combined_case_notes_col_Case_Note_Gemma_3_4B_topic_analysis_grouped.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e1eaede9af75b6ab695b1cfc6c01ec875abf14521249ba7257bd4bb0afd7ee8
|
| 3 |
+
size 28673
|
example_data/dummy_consultation_r_col_Response_text_Gemma_3_4B_topic_analysis_zero_shot.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5f0e36143d8362391e3b11d1c20e3a2a1b7536b8f0c972e3d44644eb9ae4e82
|
| 3 |
+
size 27592
|
pyproject.toml
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
[project]
|
| 2 |
name = "Large language model topic modelling"
|
| 3 |
-
version = "0.
|
| 4 |
description = "Topic model open text data files with a large language model."
|
| 5 |
requires-python = ">=3.10"
|
|
|
|
| 1 |
[project]
|
| 2 |
name = "Large language model topic modelling"
|
| 3 |
+
version = "0.3.0"
|
| 4 |
description = "Topic model open text data files with a large language model."
|
| 5 |
requires-python = ">=3.10"
|
tools/combine_sheets_into_xlsx.py
CHANGED
|
@@ -93,7 +93,7 @@ def csvs_to_excel(
|
|
| 93 |
unique_reference_numbers:list=[]
|
| 94 |
):
|
| 95 |
if intro_text is None:
|
| 96 |
-
intro_text =
|
| 97 |
|
| 98 |
wb = Workbook()
|
| 99 |
# Remove default sheet
|
|
@@ -166,21 +166,47 @@ def csvs_to_excel(
|
|
| 166 |
###
|
| 167 |
# Run the functions
|
| 168 |
###
|
| 169 |
-
def collect_output_csvs_and_create_excel_output(in_data_files:List, chosen_cols:list[str], reference_data_file_name_textbox:str, in_group_col:str, model_choice:str, master_reference_df_state:pd.DataFrame, master_unique_topics_df_state:pd.DataFrame, summarised_output_df:pd.DataFrame, missing_df_state:pd.DataFrame, excel_sheets:str, usage_logs_location:str="", model_name_map:dict={}, output_folder:str=OUTPUT_FOLDER):
|
| 170 |
'''
|
| 171 |
-
Collect together output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
'''
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
if not chosen_cols:
|
| 175 |
raise Exception("Could not find chosen column")
|
| 176 |
|
| 177 |
today_date = datetime.today().strftime('%Y-%m-%d')
|
| 178 |
original_data_file_path = os.path.abspath(in_data_files[0])
|
| 179 |
|
| 180 |
-
csv_files =
|
| 181 |
-
sheet_names =
|
| 182 |
-
column_widths =
|
| 183 |
-
wrap_text_columns =
|
| 184 |
short_file_name = os.path.basename(reference_data_file_name_textbox)
|
| 185 |
reference_pivot_table = pd.DataFrame()
|
| 186 |
reference_table_csv_path = ""
|
|
@@ -191,21 +217,64 @@ def collect_output_csvs_and_create_excel_output(in_data_files:List, chosen_cols:
|
|
| 191 |
number_of_responses_with_topic_assignment = 0
|
| 192 |
|
| 193 |
if in_group_col: group = in_group_col
|
| 194 |
-
else: group = "All"
|
| 195 |
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
-
|
| 202 |
csv_files.append(overall_summary_csv_path)
|
| 203 |
sheet_names.append("Overall summary")
|
| 204 |
column_widths["Overall summary"] = {"A": 20, "B": 100}
|
| 205 |
wrap_text_columns["Overall summary"] = ['B']
|
| 206 |
|
| 207 |
-
file_output_list = []
|
| 208 |
-
|
| 209 |
if not master_reference_df_state.empty:
|
| 210 |
# Simplify table to just responses column and the Response reference number
|
| 211 |
file_data, file_name, num_batches = load_in_data_file(in_data_files, chosen_cols, 1, in_excel_sheets=excel_sheets)
|
|
@@ -234,50 +303,62 @@ def collect_output_csvs_and_create_excel_output(in_data_files:List, chosen_cols:
|
|
| 234 |
master_unique_topics_df_state.to_csv(unique_topic_table_csv_path, index = None)
|
| 235 |
|
| 236 |
if unique_topic_table_csv_path:
|
| 237 |
-
#unique_topic_table_csv_path = unique_topic_table_csv_path[0]
|
| 238 |
csv_files.append(unique_topic_table_csv_path)
|
| 239 |
sheet_names.append("Topic summary")
|
| 240 |
column_widths["Topic summary"] = {"A": 25, "B": 25, "C": 15, "D": 15, "F":100}
|
| 241 |
wrap_text_columns["Topic summary"] = ["B", "F"]
|
| 242 |
else:
|
| 243 |
-
|
|
|
|
| 244 |
if reference_table_csv_path:
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
| 249 |
else:
|
| 250 |
-
|
| 251 |
|
| 252 |
if reference_pivot_table_csv_path:
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
-
|
| 257 |
-
|
| 258 |
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
|
| 263 |
-
|
| 264 |
-
|
| 265 |
|
| 266 |
-
|
| 267 |
-
|
| 268 |
|
| 269 |
-
|
|
|
|
|
|
|
| 270 |
|
| 271 |
if not missing_df_state.empty:
|
| 272 |
missing_df_state_csv_path = output_folder + "missing_df_state_df_for_xlsx.csv"
|
| 273 |
missing_df_state.to_csv(missing_df_state_csv_path, index = None)
|
| 274 |
|
| 275 |
if missing_df_state_csv_path:
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
new_csv_files = csv_files.copy()
|
| 283 |
|
|
@@ -353,7 +434,7 @@ def collect_output_csvs_and_create_excel_output(in_data_files:List, chosen_cols:
|
|
| 353 |
|
| 354 |
# Save outputs for each batch. If master file created, label file as master
|
| 355 |
file_path_details = f"{file_name_cleaned}_col_{in_column_cleaned}_{model_choice_clean_short}"
|
| 356 |
-
output_xlsx_filename = output_folder + file_path_details + "_topic_analysis.xlsx"
|
| 357 |
|
| 358 |
xlsx_output_filename = csvs_to_excel(
|
| 359 |
csv_files = csv_files,
|
|
|
|
| 93 |
unique_reference_numbers:list=[]
|
| 94 |
):
|
| 95 |
if intro_text is None:
|
| 96 |
+
intro_text = list()
|
| 97 |
|
| 98 |
wb = Workbook()
|
| 99 |
# Remove default sheet
|
|
|
|
| 166 |
###
|
| 167 |
# Run the functions
|
| 168 |
###
|
| 169 |
+
def collect_output_csvs_and_create_excel_output(in_data_files:List, chosen_cols:list[str], reference_data_file_name_textbox:str, in_group_col:str, model_choice:str, master_reference_df_state:pd.DataFrame, master_unique_topics_df_state:pd.DataFrame, summarised_output_df:pd.DataFrame, missing_df_state:pd.DataFrame, excel_sheets:str, usage_logs_location:str="", model_name_map:dict={}, output_folder:str=OUTPUT_FOLDER, structured_summaries:str="No"):
|
| 170 |
'''
|
| 171 |
+
Collect together output CSVs from various output boxes and combine them into a single output Excel file.
|
| 172 |
+
|
| 173 |
+
Args:
|
| 174 |
+
in_data_files (List): A list of paths to the input data files.
|
| 175 |
+
chosen_cols (list[str]): A list of column names selected for analysis.
|
| 176 |
+
reference_data_file_name_textbox (str): The name of the reference data file.
|
| 177 |
+
in_group_col (str): The column used for grouping the data.
|
| 178 |
+
model_choice (str): The LLM model chosen for the analysis.
|
| 179 |
+
master_reference_df_state (pd.DataFrame): The master DataFrame containing reference data.
|
| 180 |
+
master_unique_topics_df_state (pd.DataFrame): The master DataFrame containing unique topics data.
|
| 181 |
+
summarised_output_df (pd.DataFrame): DataFrame containing the summarised output.
|
| 182 |
+
missing_df_state (pd.DataFrame): DataFrame containing information about missing data.
|
| 183 |
+
excel_sheets (str): Information regarding Excel sheets, typically sheet names or structure.
|
| 184 |
+
usage_logs_location (str, optional): Path to the usage logs CSV file. Defaults to "".
|
| 185 |
+
model_name_map (dict, optional): A dictionary mapping model choices to their display names. Defaults to {}.
|
| 186 |
+
output_folder (str, optional): The directory where the output Excel file will be saved. Defaults to OUTPUT_FOLDER.
|
| 187 |
+
structured_summaries (str, optional): Indicates whether structured summaries are being produced ("Yes" or "No"). Defaults to "No".
|
| 188 |
+
|
| 189 |
+
Returns:
|
| 190 |
+
tuple: A tuple containing:
|
| 191 |
+
- list: A list of paths to the generated Excel output files.
|
| 192 |
+
- list: A duplicate of the list of paths to the generated Excel output files (for UI compatibility).
|
| 193 |
'''
|
| 194 |
|
| 195 |
+
if structured_summaries == "Yes":
|
| 196 |
+
structured_summaries = True
|
| 197 |
+
else:
|
| 198 |
+
structured_summaries = False
|
| 199 |
+
|
| 200 |
if not chosen_cols:
|
| 201 |
raise Exception("Could not find chosen column")
|
| 202 |
|
| 203 |
today_date = datetime.today().strftime('%Y-%m-%d')
|
| 204 |
original_data_file_path = os.path.abspath(in_data_files[0])
|
| 205 |
|
| 206 |
+
csv_files = list()
|
| 207 |
+
sheet_names = list()
|
| 208 |
+
column_widths = dict()
|
| 209 |
+
wrap_text_columns = dict()
|
| 210 |
short_file_name = os.path.basename(reference_data_file_name_textbox)
|
| 211 |
reference_pivot_table = pd.DataFrame()
|
| 212 |
reference_table_csv_path = ""
|
|
|
|
| 217 |
number_of_responses_with_topic_assignment = 0
|
| 218 |
|
| 219 |
if in_group_col: group = in_group_col
|
| 220 |
+
else: group = "All"
|
| 221 |
|
| 222 |
+
overall_summary_csv_path = output_folder + "overall_summary_for_xlsx.csv"
|
| 223 |
+
|
| 224 |
+
if structured_summaries is True and not master_unique_topics_df_state.empty:
|
| 225 |
+
print("Producing overall summary based on structured summaries.")
|
| 226 |
+
# Create structured summary from master_unique_topics_df_state
|
| 227 |
+
structured_summary_data = list()
|
| 228 |
+
|
| 229 |
+
print("master_unique_topics_df_state:", master_unique_topics_df_state)
|
| 230 |
+
# Group by 'Group' column
|
| 231 |
+
for group_name, group_df in master_unique_topics_df_state.groupby('Group'):
|
| 232 |
+
group_summary = f"## {group_name}\n\n"
|
| 233 |
+
|
| 234 |
+
# Group by 'General topic' within each group
|
| 235 |
+
for general_topic, topic_df in group_df.groupby('General topic'):
|
| 236 |
+
group_summary += f"### {general_topic}\n\n"
|
| 237 |
+
|
| 238 |
+
# Add subtopics under each general topic
|
| 239 |
+
for _, row in topic_df.iterrows():
|
| 240 |
+
subtopic = row['Subtopic']
|
| 241 |
+
summary = row['Summary']
|
| 242 |
+
# sentiment = row.get('Sentiment', '')
|
| 243 |
+
# num_responses = row.get('Number of responses', '')
|
| 244 |
+
|
| 245 |
+
# Create subtopic entry
|
| 246 |
+
subtopic_entry = f"**{subtopic}**"
|
| 247 |
+
# if sentiment:
|
| 248 |
+
# subtopic_entry += f" ({sentiment})"
|
| 249 |
+
# if num_responses:
|
| 250 |
+
# subtopic_entry += f" - {num_responses} responses"
|
| 251 |
+
subtopic_entry += "\n\n"
|
| 252 |
+
|
| 253 |
+
if summary and pd.notna(summary):
|
| 254 |
+
subtopic_entry += f"{summary}\n\n"
|
| 255 |
+
|
| 256 |
+
group_summary += subtopic_entry
|
| 257 |
+
|
| 258 |
+
# Add to structured summary data
|
| 259 |
+
structured_summary_data.append({
|
| 260 |
+
'Group': group_name,
|
| 261 |
+
'Summary': group_summary.strip()
|
| 262 |
+
})
|
| 263 |
+
|
| 264 |
+
# Create DataFrame for structured summary
|
| 265 |
+
structured_summary_df = pd.DataFrame(structured_summary_data)
|
| 266 |
+
structured_summary_df.to_csv(overall_summary_csv_path, index=False)
|
| 267 |
+
else:
|
| 268 |
+
# Use original summarised_output_df
|
| 269 |
+
structured_summary_df = summarised_output_df
|
| 270 |
+
structured_summary_df.to_csv(overall_summary_csv_path, index = None)
|
| 271 |
|
| 272 |
+
if not structured_summary_df.empty:
|
| 273 |
csv_files.append(overall_summary_csv_path)
|
| 274 |
sheet_names.append("Overall summary")
|
| 275 |
column_widths["Overall summary"] = {"A": 20, "B": 100}
|
| 276 |
wrap_text_columns["Overall summary"] = ['B']
|
| 277 |
|
|
|
|
|
|
|
| 278 |
if not master_reference_df_state.empty:
|
| 279 |
# Simplify table to just responses column and the Response reference number
|
| 280 |
file_data, file_name, num_batches = load_in_data_file(in_data_files, chosen_cols, 1, in_excel_sheets=excel_sheets)
|
|
|
|
| 303 |
master_unique_topics_df_state.to_csv(unique_topic_table_csv_path, index = None)
|
| 304 |
|
| 305 |
if unique_topic_table_csv_path:
|
|
|
|
| 306 |
csv_files.append(unique_topic_table_csv_path)
|
| 307 |
sheet_names.append("Topic summary")
|
| 308 |
column_widths["Topic summary"] = {"A": 25, "B": 25, "C": 15, "D": 15, "F":100}
|
| 309 |
wrap_text_columns["Topic summary"] = ["B", "F"]
|
| 310 |
else:
|
| 311 |
+
print("Relevant unique topic files not found, excluding from xlsx output.")
|
| 312 |
+
|
| 313 |
if reference_table_csv_path:
|
| 314 |
+
if structured_summaries:
|
| 315 |
+
print("Structured summaries are being produced, excluding response level data from xlsx output.")
|
| 316 |
+
else:
|
| 317 |
+
csv_files.append(reference_table_csv_path)
|
| 318 |
+
sheet_names.append("Response level data")
|
| 319 |
+
column_widths["Response level data"] = {"A": 15, "B": 30, "C": 40, "H":100}
|
| 320 |
+
wrap_text_columns["Response level data"] = ["C", "G"]
|
| 321 |
else:
|
| 322 |
+
print("Relevant reference files not found, excluding from xlsx output.")
|
| 323 |
|
| 324 |
if reference_pivot_table_csv_path:
|
| 325 |
+
if structured_summaries:
|
| 326 |
+
print("Structured summaries are being produced, excluding topic response pivot table from xlsx output.")
|
| 327 |
+
else:
|
| 328 |
+
csv_files.append(reference_pivot_table_csv_path)
|
| 329 |
+
sheet_names.append("Topic response pivot table")
|
| 330 |
|
| 331 |
+
if reference_pivot_table.empty:
|
| 332 |
+
reference_pivot_table = pd.read_csv(reference_pivot_table_csv_path)
|
| 333 |
|
| 334 |
+
# Base widths and wrap
|
| 335 |
+
column_widths["Topic response pivot table"] = {"A": 25, "B": 100}
|
| 336 |
+
wrap_text_columns["Topic response pivot table"] = ["B"]
|
| 337 |
|
| 338 |
+
num_cols = len(reference_pivot_table.columns)
|
| 339 |
+
col_letters = [get_column_letter(i) for i in range(3, num_cols + 1)]
|
| 340 |
|
| 341 |
+
for col_letter in col_letters:
|
| 342 |
+
column_widths["Topic response pivot table"][col_letter] = 25
|
| 343 |
|
| 344 |
+
wrap_text_columns["Topic response pivot table"].extend(col_letters)
|
| 345 |
+
else:
|
| 346 |
+
print("Relevant reference pivot table files not found, excluding from xlsx output.")
|
| 347 |
|
| 348 |
if not missing_df_state.empty:
|
| 349 |
missing_df_state_csv_path = output_folder + "missing_df_state_df_for_xlsx.csv"
|
| 350 |
missing_df_state.to_csv(missing_df_state_csv_path, index = None)
|
| 351 |
|
| 352 |
if missing_df_state_csv_path:
|
| 353 |
+
if structured_summaries:
|
| 354 |
+
print("Structured summaries are being produced, excluding missing responses from xlsx output.")
|
| 355 |
+
else:
|
| 356 |
+
csv_files.append(missing_df_state_csv_path)
|
| 357 |
+
sheet_names.append("Missing responses")
|
| 358 |
+
column_widths["Missing responses"] = {"A": 25, "B": 30, "C": 50}
|
| 359 |
+
wrap_text_columns["Missing responses"] = ["C"]
|
| 360 |
+
else:
|
| 361 |
+
print("Relevant missing responses files not found, excluding from xlsx output.")
|
| 362 |
|
| 363 |
new_csv_files = csv_files.copy()
|
| 364 |
|
|
|
|
| 434 |
|
| 435 |
# Save outputs for each batch. If master file created, label file as master
|
| 436 |
file_path_details = f"{file_name_cleaned}_col_{in_column_cleaned}_{model_choice_clean_short}"
|
| 437 |
+
output_xlsx_filename = output_folder + file_path_details + ("_structured_summaries" if structured_summaries else "_topic_analysis") + ".xlsx"
|
| 438 |
|
| 439 |
xlsx_output_filename = csvs_to_excel(
|
| 440 |
csv_files = csv_files,
|
tools/config.py
CHANGED
|
@@ -429,7 +429,7 @@ S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_a
|
|
| 429 |
if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
|
| 430 |
else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
|
| 431 |
|
| 432 |
-
FILE_INPUT_HEIGHT = get_or_create_env_var('FILE_INPUT_HEIGHT', '
|
| 433 |
|
| 434 |
SHOW_EXAMPLES = get_or_create_env_var('SHOW_EXAMPLES', 'True')
|
| 435 |
|
|
|
|
| 429 |
if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
|
| 430 |
else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
|
| 431 |
|
| 432 |
+
FILE_INPUT_HEIGHT = int(get_or_create_env_var('FILE_INPUT_HEIGHT', '125'))
|
| 433 |
|
| 434 |
SHOW_EXAMPLES = get_or_create_env_var('SHOW_EXAMPLES', 'True')
|
| 435 |
|
tools/dedup_summaries.py
CHANGED
|
@@ -955,7 +955,7 @@ def overall_summary(topic_summary_df:pd.DataFrame,
|
|
| 955 |
tic = time.perf_counter()
|
| 956 |
|
| 957 |
if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1") & (not local_model):
|
| 958 |
-
progress(0.1, f"Using
|
| 959 |
local_model = get_model()
|
| 960 |
tokenizer = get_tokenizer()
|
| 961 |
assistant_model = get_assistant_model()
|
|
|
|
| 955 |
tic = time.perf_counter()
|
| 956 |
|
| 957 |
if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1") & (not local_model):
|
| 958 |
+
progress(0.1, f"Using model: {CHOSEN_LOCAL_MODEL_TYPE}")
|
| 959 |
local_model = get_model()
|
| 960 |
tokenizer = get_tokenizer()
|
| 961 |
assistant_model = get_assistant_model()
|
tools/example_table_outputs.py
CHANGED
|
@@ -16,32 +16,27 @@ dummy_consultation_table = """| General topic | Subtopic |
|
|
| 16 |
| Development proposal | Noise pollution | Neutral | All | 1 | Potential for increased noise pollution due to the development is a concern. |
|
| 17 |
| Economic impact | Economic decline | Negative | All | 1 | Worries about a negative impact on the local economy are expressed, suggesting potential harm. |"""
|
| 18 |
|
| 19 |
-
dummy_consultation_table_zero_shot = """| General topic
|
| 20 |
-
|
| 21 |
-
|
|
| 22 |
-
|
|
| 23 |
-
|
|
| 24 |
-
|
|
| 25 |
-
|
|
| 26 |
-
|
|
| 27 |
-
|
|
| 28 |
-
|
|
| 29 |
-
|
|
| 30 |
-
|
|
| 31 |
-
|
|
| 32 |
-
| Community
|
| 33 |
-
|
|
| 34 |
-
|
|
| 35 |
-
|
|
| 36 |
-
| Impact on local
|
| 37 |
-
| Impact on local
|
| 38 |
-
| Impact on local
|
| 39 |
-
|
|
| 40 |
-
| Impact on quality of life | Negative impact on local quality of life | Negative | All | 1 | Residents express concern that the development will degrade the overall quality of life due to<br>increased noise, congestion, or other disturbances. |
|
| 41 |
-
| Impact on the character of the area | Negative impact on local character | Negative | All | 1 | There is concern that the development will alter the unique character of the area, potentially<br>leading to a loss of authenticity and community identity. |
|
| 42 |
-
| Need for family housing | Provision of housing for families | Positive | All | 1 | The development will provide much-needed family housing, meeting a critical demand for affordable<br>and suitable homes for families. |
|
| 43 |
-
| Noise pollution | Noise pollution | Negative | All | 1 | The development will increase noise pollution in the area, raising concerns about quality of life<br>and community disturbance. |
|
| 44 |
-
| Parking | Parking | Positive | All | 1 | The development will provide much-needed parking spaces, addressing a key infrastructure need in the<br>area. | |"""
|
| 45 |
|
| 46 |
case_notes_table = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
|
| 47 |
|:------------------|:----------------------------|:------------|:--------|----------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
@@ -58,4 +53,42 @@ case_notes_table = """| General topic | Subtopic | Sentim
|
|
| 58 |
| School engagement | Academic performance | Negative | All | 2 | Analysis of the provided text reveals concerns regarding student engagement and academic<br>performance. specifically, jamie’s reduced involvement in class is flagged as a potential indicator<br>of negative consequences, with declining grades reported as a direct result. this suggests a<br>concerning downward trend in alex’s academic progress, highlighting a need for further investigation<br>into the underlying causes of this shift.<br>the combined observations point to a possible<br>correlation between decreased... |
|
| 59 |
| Substance use | Substance use (unspecified) | Negative | All | 2 | Concerns regarding ongoing substance use prompted discussion about the possibility of a short-term<br>residential treatment program. alex’s involvement highlighted a potential issue, as they reported<br>occasional substance use, though the specific substances involved were not detailed during the<br>consultation. this lack of specificity regarding the substances used raises a need for further<br>investigation into the nature and frequency of alex’s substance use.<br>the consultation focused on<br>assessing the ri... |
|
| 60 |
| Family dynamics | Stepfather relationship | Negative | All | 1 | Alex displayed sudden outbursts of anger when discussing his new stepfather, indicating significant<br>distress related to this family change. |
|
| 61 |
-
| School engagement | Academic performance | Positive | All | 1 | Jamie's academic performance has slightly improved, indicating a potential positive change. |"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
| Development proposal | Noise pollution | Neutral | All | 1 | Potential for increased noise pollution due to the development is a concern. |
|
| 17 |
| Economic impact | Economic decline | Negative | All | 1 | Worries about a negative impact on the local economy are expressed, suggesting potential harm. |"""
|
| 18 |
|
| 19 |
+
dummy_consultation_table_zero_shot = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
|
| 20 |
+
|:---------------------------|:------------------------------------|:------------|:--------|----------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
| 21 |
+
| Planning & development | Impact on the character of the area | Negative | All | 10 | Residents overwhelmingly express strong objections to the proposed development, primarily focusing<br>on its incompatibility with the established character of the area. A central concern is the<br>development's height and design, which they believe clashes significantly with the existing<br>aesthetic and creates a sense of being overshadowed by taller structures, leading to a feeling of<br>crampedness. Many respondents specifically highlighted the potential for the development to<br>negatively impact Main Stre... |
|
| 22 |
+
| Environmental impact | Impact on the local environment | Negative | All | 8 | Several concerns have been raised regarding the potential negative impacts of a development on the<br>local environment. Multiple respondents expressed worry about the development’s possible detrimental<br>effects on the surrounding environment and quality of life, highlighting a significant area of<br>concern. These anxieties include potential damage to the environment and a general feeling of unease<br>about the development’s consequences.<br><br>Despite a single positive note regarding the provision<br>of green s... |
|
| 23 |
+
| Infrastructure & transport | Traffic congestion | Negative | All | 7 | Concerns regarding increased traffic congestion are prevalent in the dataset, largely stemming from<br>the anticipated impact of the proposed development. Specifically, Main Street is predicted to<br>experience heightened congestion due to the increased volume of traffic it will attract. Multiple<br>responses repeatedly highlight this anticipation as a key issue associated with the<br>project.<br><br>Despite the consistent apprehension about traffic congestion, no direct responses<br>offer specific solutions or miti... |
|
| 24 |
+
| Planning & development | Need for family housing | Positive | All | 7 | The proposed development is overwhelmingly viewed as a crucial solution to the need for family<br>housing within the community. Multiple sources highlight its significance in providing much-needed<br>homes, particularly for families, and specifically addressing the demand for affordable family<br>housing options. Several respondents emphasized the beneficial impact on local residents, with the<br>development also anticipated to create jobs and offer facilities geared towards young people<br>alongside housing. ... |
|
| 25 |
+
| Quality of life | Impact on quality of life | Negative | All | 7 | Analysis of the provided text reveals significant concerns regarding a proposed development's<br>potential negative impact on the quality of life within the area. Residents are particularly worried<br>that the development will overshadow existing buildings, creating a sense of crampedness and<br>diminishing their living experience. Furthermore, anxieties extend beyond immediate residential<br>impacts, encompassing broader concerns about the development’s effects on local businesses, schools,<br>and crucial inf... |
|
| 26 |
+
| Economic impact | Investment and job creation | Positive | All | 6 | The proposed development is overwhelmingly viewed positively, with significant anticipation for its<br>economic impact on the area. Residents and observers alike believe it will stimulate considerable<br>investment and generate numerous job opportunities, particularly for local residents. Furthermore,<br>the project is expected to revitalize the town center and provide crucial affordable housing,<br>potentially benefiting young people seeking to establish themselves in the<br>community.<br><br>Specifically, the deve... |
|
| 27 |
+
| Infrastructure & transport | Parking | Negative | All | 6 | Analysis of the '{column_name}' column reveals significant concerns regarding the potential impact<br>of a new development on Main Street. The primary issue identified is increased traffic congestion,<br>directly linked to the development’s activity. Furthermore, there is widespread apprehension that<br>the project will worsen existing parking problems, with multiple respondents explicitly stating a<br>lack of adequate parking provisions as a key worry. <br><br>Specifically, numerous individuals<br>expressed concern... |
|
| 28 |
+
| Community & local life | Amenities for the local community | Positive | All | 5 | The proposed development is anticipated to significantly benefit the local community, offering a<br>range of amenities and a positive contribution to the area. Specifically, the project will deliver<br>crucial green space alongside facilities designed to cater to the needs of young people and the<br>broader community.<br><br>Furthermore, the development is expected to address critical social needs<br>by providing much-needed community facilities and social housing, indicating a commitment to<br>supporting local resi... |
|
| 29 |
+
| Environmental impact | Impact on local wildlife | Neutral | All | 4 | No specific responses were provided, and the dataset contained no information relevant to the<br>specified consultation context. Consequently, a summary cannot be generated based on the provided<br>data. <br><br>Due to the absence of any textual data within the dataset, there is no content to<br>consolidate and summarize. |
|
| 30 |
+
| Improvement of main street | Improvement of main street | Positive | All | 4 | This development is being hailed as a positive step for the revitalization of Main Street, primarily<br>due to its anticipated improvement in the street’s appearance. Stakeholders view this initiative as<br>a crucial element in breathing new life into the area, suggesting a significant upgrade to the<br>existing landscape.<br><br>Specifically, the project aims to enhance the visual appeal of Main<br>Street, representing a tangible advancement in its overall attractiveness and desirability. The<br>development is wide... |
|
| 31 |
+
| Planning & development | Impact on views | Negative | All | 4 | A primary concern expressed regarding the proposed development is its potential negative impact on<br>existing views. Multiple respondents voiced worries about how the development might obstruct or<br>diminish the current vistas, alongside specific concerns about its effect on views from neighboring<br>properties. This suggests a significant sensitivity to the visual landscape and its value within the<br>community.<br><br>Furthermore, the potential aesthetic consequences of the development are<br>highlighted, with s... |
|
| 32 |
+
| Community & local life | Amenities for the local community | Negative | All | 2 | Residents are voicing significant concerns regarding a proposed development, primarily focusing on<br>its anticipated detrimental effects on local amenities. A key point of contention is the planned<br>removal of the existing cafe, which is being viewed as a substantial loss to the community’s social<br>fabric and a vital local resource.<br><br>The overall sentiment suggests a strong apprehension that<br>the development will diminish the quality of life for those living nearby, highlighting a desire to<br>preserve c... |
|
| 33 |
+
| Impact on local businesses | Impact on local businesses | Negative | All | 2 | A primary concern expressed relates to the potential detrimental effects of the development on local<br>businesses. There’s a clear worry that the project will negatively impact these businesses,<br>suggesting a potential loss of revenue, customer base, or even business closure. The repeated<br>emphasis on a “negative impact” highlights a significant apprehension regarding the economic<br>repercussions for the existing business community.<br><br>The sentiment underscores a desire to<br>mitigate potential harm and li... |
|
| 34 |
+
| Impact on local heritage | Impact on local heritage | Negative | All | 2 | There are growing concerns regarding the potential negative impact of the development on the local<br>heritage. While specific details and references haven’t been explicitly stated, the underlying<br>sentiment suggests a worry about the development’s effects on historically significant elements<br>within the area. This implies a recognition that the proposed project could, perhaps inadvertently,<br>threaten or diminish the cultural value and character of the local environment.<br><br>The presence<br>of these concern... |
|
| 35 |
+
| Environmental impact | Impact on local wildlife | Negative | All | 1 | Concerns regarding the negative impact of the development on local wildlife. |
|
| 36 |
+
| Impact on local heritage | Impact on local heritage | Neutral | All | 1 | No specific responses mention this topic. |
|
| 37 |
+
| Impact on local schools | Impact on local schools | Negative | All | 1 | Concerns about the negative impact on the local schools. |
|
| 38 |
+
| Impact on local schools | Impact on local schools | Neutral | All | 1 | No specific responses mention this topic. |
|
| 39 |
+
| Infrastructure & transport | Parking | Positive | All | 1 | The development is expected to provide much-needed parking spaces. |"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
case_notes_table = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
|
| 42 |
|:------------------|:----------------------------|:------------|:--------|----------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
|
|
| 53 |
| School engagement | Academic performance | Negative | All | 2 | Analysis of the provided text reveals concerns regarding student engagement and academic<br>performance. specifically, jamie’s reduced involvement in class is flagged as a potential indicator<br>of negative consequences, with declining grades reported as a direct result. this suggests a<br>concerning downward trend in alex’s academic progress, highlighting a need for further investigation<br>into the underlying causes of this shift.<br>the combined observations point to a possible<br>correlation between decreased... |
|
| 54 |
| Substance use | Substance use (unspecified) | Negative | All | 2 | Concerns regarding ongoing substance use prompted discussion about the possibility of a short-term<br>residential treatment program. alex’s involvement highlighted a potential issue, as they reported<br>occasional substance use, though the specific substances involved were not detailed during the<br>consultation. this lack of specificity regarding the substances used raises a need for further<br>investigation into the nature and frequency of alex’s substance use.<br>the consultation focused on<br>assessing the ri... |
|
| 55 |
| Family dynamics | Stepfather relationship | Negative | All | 1 | Alex displayed sudden outbursts of anger when discussing his new stepfather, indicating significant<br>distress related to this family change. |
|
| 56 |
+
| School engagement | Academic performance | Positive | All | 1 | Jamie's academic performance has slightly improved, indicating a potential positive change. |"""
|
| 57 |
+
|
| 58 |
+
case_notes_table_grouped = """| General topic | Subtopic | Sentiment | Group | Number of responses | Revised summary |
|
| 59 |
+
|:--------------------|:---------------------------|:------------|:---------|----------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
| 60 |
+
| Trends over time | Trends over time | Negative | Alex D. | 7 | Alex’s case note reveals a troubling deterioration in his well-being marked by a gradual escalation<br>of issues. Initially, the record details an incident involving a physical altercation, which quickly<br>spiraled into increasingly concerning behaviours at home, specifically escalating aggression. Over<br>subsequent meetings, observations consistently pointed towards heightened agitation and expressions<br>of hopelessness, indicating a worsening emotional state and a significant decline in his overall<br>con... |
|
| 61 |
+
| Physical health | Substance misuse | Negative | Alex D. | 6 | Alex’s substance use remains a significant concern, necessitating continued vigilance and support<br>despite recent positive developments in group therapy. While Alex has acknowledged instances of<br>substance use, the details surrounding these occurrences have not been shared, raising questions<br>about the extent and nature of the problem. Concerns were specifically noted regarding potential<br>substance abuse, highlighting a need for further investigation and assessment.<br><br>Ongoing<br>monitoring is crucial to... |
|
| 62 |
+
| Behaviour at school | Behaviour at school | Negative | Alex D. | 3 | A recent case note details a troubling incident involving a physical altercation at school,<br>alongside concerning admissions from Alex regarding alcohol use. This event has sparked worries<br>about potential behavioural issues within the school setting, suggesting a need for further<br>investigation and support. Alex’s demeanor was notably problematic, characterized by sullen behavior<br>and a deliberate avoidance of eye contact, indicating a possible struggle with emotional<br>regulation.<br><br>Furthermore, Alex... |
|
| 63 |
+
| Mental health | Anger | Negative | Alex D. | 3 | Alex exhibits a pronounced anger issue, characterized by frustration and a tendency to blame others<br>for triggering his aggressive behavior. He demonstrated this significantly when discussing his<br>personal life, particularly relating to his new stepfather, suggesting a volatile emotional response<br>to this change. The observed outbursts highlight a need for immediate intervention to manage his<br>escalating anger.<br><br>Further investigation reveals that Alex’s anger is closely linked to his<br>home environmen... |
|
| 64 |
+
| Mental health | Self-harm | Negative | Alex D. | 3 | The analysis reveals significant concerns regarding Alex’s mental health, centering around potential<br>self-harm behaviors. Indications suggest a possible diagnosis of Oppositional Defiant Disorder<br>alongside a co-occurring substance use disorder, warranting a comprehensive treatment plan. Alex<br>demonstrated visible signs of self-harm and openly confessed to experiencing thoughts of self-harm,<br>highlighting a critical need for immediate intervention.<br><br>Following this disclosure, an<br>immediate referral ... |
|
| 65 |
+
| Mental health | Social issues | Negative | Alex D. | 3 | Alex exhibits a pattern of blaming others for his problematic behavior, indicating underlying<br>challenges in social interaction and conflict resolution. This behavior appears to be contributing<br>to further instability in his life. Specifically, his mother voiced concerns regarding his new<br>social circle and increasingly frequent late-night activities, suggesting she perceives these<br>relationships and outings as potentially risky.<br><br>The mother’s observations highlight a<br>potential area of concern for A... |
|
| 66 |
+
| Mental health | Depression | Negative | Jamie L. | 6 | Jamie is currently experiencing concerning symptoms indicative of depression, as noted by both<br>Jamie’s behavior and parental observations. Specifically, he demonstrates limited social<br>interaction, struggles with his mood, and has difficulty engaging with his schoolwork. These<br>difficulties appear persistent, with parents reporting ongoing struggles despite occasional positive<br>moments. <br><br>Further assessment suggests a more pronounced picture, with indications of moderate<br>depression characterized by... |
|
| 67 |
+
| Mental health | Social isolation | Negative | Jamie L. | 4 | Jamie is experiencing significant social isolation, which is negatively affecting both his academic<br>performance and his general well-being. He has expressed feelings of loneliness and difficulty<br>sleeping, strongly suggesting a core social issue is contributing to his distress. Current efforts<br>are focused on promoting increased social interaction to address these challenges.<br><br>The report<br>highlights the urgency of this situation, emphasizing the need for intervention to mitigate Jamie’s<br>isolation a... |
|
| 68 |
+
| Mental health | Medication | Neutral | Jamie L. | 3 | Consideration is being given to medication as a potential intervention alongside therapy to manage<br>depressive symptoms. Initial feedback on the antidepressant is positive. |
|
| 69 |
+
| Mental health | Withdrawal & sadness | Negative | Jamie L. | 3 | Jamie is experiencing a significant downturn in his emotional state, characterized by withdrawal,<br>sadness, and a pervasive sense of emptiness and hopelessness. These negative feelings appear to be<br>triggered by recent reports of tardiness and decreased participation, suggesting a possible link<br>between his behavior and external pressures or expectations. The combination of these symptoms<br>points to a low mood and a feeling of struggle, indicating a potentially serious situation requiring<br>attention.... |
|
| 70 |
+
| Mental health | Low self-worth | Negative | Jamie L. | 2 | Parents are increasingly concerned about Jamie’s well-being due to observed difficulties and a<br>potential lack of self-worth. These concerns are primarily fueled by Jamie’s own statements, where<br>he articulated feelings of low self-esteem and a significant struggle to find<br>motivation.<br><br>Further investigation revealed a direct link between Jamie’s emotional state and<br>recent family financial hardships. The pressures of these struggles appear to have deeply impacted<br>his self-perception and ability to ... |
|
| 71 |
+
| Trends over time | Increasing withdrawal | Negative | Jamie L. | 2 | A significant and worrying trend is emerging regarding withdrawal, necessitating continuous<br>observation and targeted intervention strategies. Specifically, Jamie is exhibiting a noticeable<br>decline in engagement with family activities, representing a key indicator of this broader issue.<br>This withdrawal suggests a potential underlying problem requiring careful assessment and proactive<br>support.<br><br>The observed pattern of withdrawal highlights the importance of sustained monitoring<br>to understand its p... |
|
| 72 |
+
| Behaviour at school | Attendance issues | Negative | Jamie L. | 1 | Jamie’s consistent tardiness was a concern leading to a meeting. |
|
| 73 |
+
| Behaviour at school | Reduced participation | Negative | Jamie L. | 1 | Jamie’s decreased participation in class was noted. |
|
| 74 |
+
| Behaviour at school | Social engagement | Negative | Jamie L. | 1 | Jamie's withdrawal from family activities and hobbies was highlighted. |
|
| 75 |
+
| Behaviour at school | Social engagement | Positive | Jamie L. | 1 | Encouraging Jamie to join school clubs and groups is a strategy to foster social connection and<br>improve his social engagement. |
|
| 76 |
+
| Family & social | Family communication | Negative | Jamie L. | 1 | Parents expressed concerns about Jamie’s withdrawal and lack of communication within the family. |
|
| 77 |
+
| Family & social | Family communication | Neutral | Jamie L. | 1 | Parents are actively involved in Jamie's care and are communicating their observations to the care<br>team. |
|
| 78 |
+
| Family & social | Family financial struggles | Negative | Jamie L. | 1 | Jamie's low motivation is attributed to recent family financial difficulties. |"""
|
| 79 |
+
|
| 80 |
+
case_notes_table_structured_summary = """| Main heading | Subheading | Summary | Group |
|
| 81 |
+
|:--------------------|:--------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------|
|
| 82 |
+
| Behaviour at school | Behaviour at school | Several cases involved disruptions at school, including increased absences, declining grades, and a<br>physical altercation. Alex displayed sullenness, avoidance, and agitation, sometimes reacting with<br>frustration. A key theme was isolation and a lack of connection with peers and school staff. | Alex D. |
|
| 83 |
+
| Mental health | Anger | Anger was a prominent feature across multiple cases, particularly when discussing home life and<br>family dynamics. Outbursts of anger were observed, especially related to a new stepfather, and Alex<br>displayed defensiveness when questioned about his actions. | Alex D. |
|
| 84 |
+
| Mental health | Social issues | Alex experienced feelings of isolation and difficulty connecting with others. He had a new group of<br>friends and engaged in late-night outings, which raised concerns about potential risky behaviours<br>and social influences. | Alex D. |
|
| 85 |
+
| Physical health | General | Signs of self-harm were present on Alex’s arms, indicating a heightened level of distress and<br>potentially a need for immediate support. He displayed visible agitation and defensive behaviour<br>during questioning. | Alex D. |
|
| 86 |
+
| Physical health | Substance misuse | Substance use was a recurring concern, with Alex admitting to occasional substance use and his<br>mother reporting potential signs of abuse. Alcohol use was noted in several instances, leading to<br>recommendations for assessment and potential intervention. | Alex D. |
|
| 87 |
+
| Trends over time | Trends over time | There was a gradual escalation of concerning behaviours over time. Early interventions focused on<br>initial meetings and observation, progressing to more intensive interventions like referrals to<br>mental health professionals, residential treatment programs, and family counseling. | Alex D. |
|
| 88 |
+
| Behaviour at school | Behaviour at school | Jamie exhibited concerning behaviours at school, including consistent tardiness and decreased<br>participation in class. This was accompanied by withdrawn behaviour and signs of sadness, suggesting<br>a need for immediate intervention to address potential underlying issues impacting his academic<br>performance. | Jamie L. |
|
| 89 |
+
| Mental health | Anger | There is no direct indication of anger in Jamie's case notes. | Jamie L. |
|
| 90 |
+
| Mental health | Mental health | Jamie displayed concerning signs of mental health difficulties, including feelings of emptiness,<br>hopelessness, low self-worth, and isolation. He reported difficulty sleeping and a lack of<br>motivation. The need for a comprehensive mental health assessment was highlighted to fully<br>understand the nature and severity of his condition. | Jamie L. |
|
| 91 |
+
| Mental health | Social issues | Jamie experienced significant social difficulties, including limited social interactions, feelings<br>of isolation, and a lack of engagement with family activities and hobbies. He spends a lot of time<br>alone in his room. Recommendations focused on fostering connection through school clubs and family<br>therapy were made. | Jamie L. |
|
| 92 |
+
| Physical health | General | While no direct physical health concerns were explicitly stated, Jamie's emotional state and<br>associated symptoms (difficulty sleeping) warrant consideration of his overall well-being and<br>potential physical manifestations of his mental health challenges. | Jamie L. |
|
| 93 |
+
| Physical health | Substance misuse | There is no indication of substance misuse in the provided case notes. | Jamie L. |
|
| 94 |
+
| Trends over time | Trends over time | Jamie’s case demonstrates fluctuating progress. Initial feedback indicated slight improvements in<br>mood on some days, but overall he continues to struggle. A shift occurred with the commencement of<br>antidepressant medication, showing initial positive feedback in terms of mood and energy levels,<br>requiring continued monitoring and adjustment. | Jamie L. |"""
|
tools/llm_api_call.py
CHANGED
|
@@ -324,7 +324,7 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 324 |
batch_basic_response_df:pd.DataFrame,
|
| 325 |
model_name_map:dict,
|
| 326 |
group_name:str = "All",
|
| 327 |
-
|
| 328 |
first_run: bool = False,
|
| 329 |
return_logs: bool = False,
|
| 330 |
output_folder:str=OUTPUT_FOLDER) -> Tuple:
|
|
@@ -349,7 +349,7 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 349 |
- batch_basic_response_df (pd.DataFrame): The dataframe that contains the response data.
|
| 350 |
- model_name_map (dict): The dictionary that maps the model choice to the model name.
|
| 351 |
- group_name (str, optional): The name of the current group.
|
| 352 |
-
-
|
| 353 |
- first_run (bool): A boolean indicating if this is the first run through this function in this process. Defaults to False.
|
| 354 |
- output_folder (str): The name of the folder where output files are saved.
|
| 355 |
"""
|
|
@@ -405,11 +405,14 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 405 |
else:
|
| 406 |
# Something went wrong with the table output, so add empty columns
|
| 407 |
print("Table output has wrong number of columns, adding with blank values")
|
| 408 |
-
#
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
|
|
|
|
|
|
|
|
|
| 413 |
if "Sentiment" not in topic_with_response_df.columns:
|
| 414 |
topic_with_response_df["Sentiment"] = "Not assessed"
|
| 415 |
if "Response References" not in topic_with_response_df.columns:
|
|
@@ -443,12 +446,8 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 443 |
# Iterate through each row in the original DataFrame
|
| 444 |
for index, row in topic_with_response_df.iterrows():
|
| 445 |
references = re.findall(r'\d+', str(row.iloc[3])) if pd.notna(row.iloc[3]) else []
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
# references = re.findall(r'\d+', str(row.iloc[4])) if pd.notna(row.iloc[4]) else []
|
| 449 |
-
# If batch size is 1, references will always be 1
|
| 450 |
-
if batch_size_number == 1:
|
| 451 |
-
references = "1"
|
| 452 |
|
| 453 |
# Filter out references that are outside the valid range
|
| 454 |
if references:
|
|
@@ -460,32 +459,52 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 460 |
# If any reference can't be converted to int, skip this row
|
| 461 |
print("Response value could not be converted to number:", references)
|
| 462 |
continue
|
|
|
|
|
|
|
| 463 |
|
| 464 |
topic = row.iloc[0] if pd.notna(row.iloc[0]) else ""
|
| 465 |
subtopic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
|
| 466 |
sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
|
| 467 |
summary = row.iloc[4] if pd.notna(row.iloc[4]) else ""
|
|
|
|
| 468 |
# If the reference response column is very long, and there's nothing in the summary column, assume that the summary was put in the reference column
|
| 469 |
if not summary and (len(str(row.iloc[3])) > 30):
|
| 470 |
-
summary = row.iloc[3]
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
|
| 490 |
reference_data.append({
|
| 491 |
'Response References': response_ref_no,
|
|
@@ -512,11 +531,11 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 512 |
out_reference_df.drop_duplicates(["Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
|
| 513 |
|
| 514 |
# Try converting response references column to int, keep as string if fails
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
|
| 521 |
out_reference_df.sort_values(["Start row of group", "Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
|
| 522 |
|
|
@@ -706,7 +725,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 706 |
output_folder:str=OUTPUT_FOLDER,
|
| 707 |
force_single_topic_prompt:str=force_single_topic_prompt,
|
| 708 |
group_name:str="All",
|
| 709 |
-
|
| 710 |
aws_access_key_textbox:str='',
|
| 711 |
aws_secret_key_textbox:str='',
|
| 712 |
hf_api_key_textbox:str='',
|
|
@@ -722,7 +741,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 722 |
assistant_model:object=list(),
|
| 723 |
max_rows:int=max_rows,
|
| 724 |
original_full_file_name:str="",
|
| 725 |
-
|
| 726 |
progress=Progress(track_tqdm=False)):
|
| 727 |
|
| 728 |
'''
|
|
@@ -760,7 +779,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 760 |
- force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
|
| 761 |
- in_excel_sheets (List[str], optional): List of excel sheets to load from input file.
|
| 762 |
- force_single_topic_radio (str, optional): Should the model be forced to assign only one single topic to each response (effectively a classifier).
|
| 763 |
-
-
|
| 764 |
- output_folder (str, optional): Output folder where results will be stored.
|
| 765 |
- force_single_topic_prompt (str, optional): The prompt for forcing the model to assign only one single topic to each response.
|
| 766 |
- aws_access_key_textbox (str, optional): AWS access key for account with Bedrock permissions.
|
|
@@ -777,7 +796,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 777 |
- assistant_model: Assistant model object for local inference.
|
| 778 |
- max_rows: The maximum number of rows to process.
|
| 779 |
- original_full_file_name: The original full file name.
|
| 780 |
-
-
|
| 781 |
- progress (Progress): A progress tracker.
|
| 782 |
|
| 783 |
'''
|
|
@@ -881,6 +900,9 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 881 |
elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = sentiment_prefix + negative_or_positive_sentiment_prompt + sentiment_suffix
|
| 882 |
elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "" # Just remove line completely. Previous: sentiment_prefix + do_not_assess_sentiment_prompt + sentiment_suffix
|
| 883 |
else: sentiment_prompt = sentiment_prefix + default_sentiment_prompt + sentiment_suffix
|
|
|
|
|
|
|
|
|
|
| 884 |
|
| 885 |
topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
|
| 886 |
total_batches_to_do = num_batches - latest_batch_completed
|
|
@@ -995,9 +1017,9 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 995 |
if existing_topic_summary_df['Description'].isnull().all():
|
| 996 |
existing_topic_summary_df.drop("Description", axis = 1, inplace = True)
|
| 997 |
|
| 998 |
-
if
|
| 999 |
if "General topic" in topics_df_for_markdown.columns:
|
| 1000 |
-
topics_df_for_markdown = topics_df_for_markdown.rename(columns={"General topic":"Main
|
| 1001 |
if "Subtopic" in topics_df_for_markdown.columns:
|
| 1002 |
topics_df_for_markdown = topics_df_for_markdown.rename(columns={"Subtopic":"Subheading"})
|
| 1003 |
|
|
@@ -1013,17 +1035,17 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1013 |
topic_assignment_prompt = topic_assignment_prompt.replace("Assign topics", "Assign a topic").replace("assign Subtopics", "assign a Subtopic").replace("Subtopics", "Subtopic").replace("Topics", "Topic").replace("topics", "a topic")
|
| 1014 |
|
| 1015 |
# Format the summary prompt with the response table and topics
|
| 1016 |
-
if
|
| 1017 |
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table,
|
| 1018 |
topics=unique_topics_markdown,
|
| 1019 |
topic_assignment=topic_assignment_prompt,
|
| 1020 |
force_single_topic=force_single_topic_prompt,
|
| 1021 |
sentiment_choices=sentiment_prompt,
|
| 1022 |
response_reference_format=response_reference_format,
|
| 1023 |
-
add_existing_topics_summary_format=
|
| 1024 |
else:
|
| 1025 |
formatted_summary_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table,
|
| 1026 |
-
topics=unique_topics_markdown)
|
| 1027 |
|
| 1028 |
full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
|
| 1029 |
|
|
@@ -1040,7 +1062,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1040 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, formatted_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, tokenizer, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=add_existing_topics_assistant_prefill, master = True)
|
| 1041 |
|
| 1042 |
# Return output tables
|
| 1043 |
-
topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, model_name_map, group_name,
|
| 1044 |
|
| 1045 |
full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
|
| 1046 |
|
|
@@ -1079,7 +1101,14 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1079 |
|
| 1080 |
# Outputs for markdown table output
|
| 1081 |
unique_table_df_display_table = new_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1082 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1083 |
|
| 1084 |
whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
|
| 1085 |
|
|
@@ -1106,9 +1135,9 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1106 |
#print("Using AWS Bedrock model:", model_choice)
|
| 1107 |
|
| 1108 |
# Format the summary prompt with the response table and topics
|
| 1109 |
-
if
|
| 1110 |
formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt,
|
| 1111 |
-
response_reference_format=response_reference_format, add_existing_topics_summary_format=
|
| 1112 |
else:
|
| 1113 |
unique_topics_markdown="No suggested headings for this summary"
|
| 1114 |
formatted_initial_table_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown)
|
|
@@ -1121,7 +1150,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1121 |
|
| 1122 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, formatted_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, tokenizer,bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=initial_table_assistant_prefill)
|
| 1123 |
|
| 1124 |
-
topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df, reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, model_name_map, group_name,
|
| 1125 |
|
| 1126 |
# If error in table parsing, leave function
|
| 1127 |
if is_error == True: raise Exception("Error in output table parsing")
|
|
@@ -1243,7 +1272,14 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1243 |
|
| 1244 |
# Outputs for markdown table output
|
| 1245 |
unique_table_df_display_table = final_out_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1247 |
|
| 1248 |
# Ensure that we are only returning the final results to outputs
|
| 1249 |
out_file_paths = [x for x in out_file_paths if '_final_' in x]
|
|
@@ -1312,14 +1348,14 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1312 |
force_zero_shot_radio: str = "No",
|
| 1313 |
in_excel_sheets: List[str] = list(),
|
| 1314 |
force_single_topic_radio: str = "No",
|
| 1315 |
-
|
| 1316 |
aws_access_key_textbox:str="",
|
| 1317 |
aws_secret_key_textbox:str="",
|
| 1318 |
hf_api_key_textbox:str="",
|
| 1319 |
azure_api_key_textbox:str="",
|
| 1320 |
output_folder: str = OUTPUT_FOLDER,
|
| 1321 |
existing_logged_content:list=list(),
|
| 1322 |
-
|
| 1323 |
force_single_topic_prompt: str = force_single_topic_prompt,
|
| 1324 |
max_tokens: int = max_tokens,
|
| 1325 |
model_name_map: dict = model_name_map,
|
|
@@ -1330,7 +1366,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1330 |
tokenizer:object=None,
|
| 1331 |
assistant_model:object=None,
|
| 1332 |
max_rows:int=max_rows,
|
| 1333 |
-
progress=Progress(track_tqdm=
|
| 1334 |
) -> Tuple: # Mimicking the return tuple structure of extract_topics
|
| 1335 |
"""
|
| 1336 |
A wrapper function that iterates through unique values in a specified grouping column
|
|
@@ -1366,7 +1402,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1366 |
:param force_zero_shot_radio: Option to force responses into zero-shot topics.
|
| 1367 |
:param in_excel_sheets: List of Excel sheet names if applicable.
|
| 1368 |
:param force_single_topic_radio: Option to force a single topic per response.
|
| 1369 |
-
:param
|
| 1370 |
:param aws_access_key_textbox: AWS access key for Bedrock.
|
| 1371 |
:param aws_secret_key_textbox: AWS secret key for Bedrock.
|
| 1372 |
:param hf_api_key_textbox: Hugging Face API key for local models.
|
|
@@ -1374,7 +1410,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1374 |
:param output_folder: The folder where output files will be saved.
|
| 1375 |
:param existing_logged_content: A list of existing logged content.
|
| 1376 |
:param force_single_topic_prompt: Prompt for forcing a single topic.
|
| 1377 |
-
:param
|
| 1378 |
:param max_tokens: Maximum tokens for LLM generation.
|
| 1379 |
:param model_name_map: Dictionary mapping model names to their properties.
|
| 1380 |
:param max_time_for_loop: Maximum time allowed for the processing loop.
|
|
@@ -1543,7 +1579,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1543 |
output_folder=output_folder,
|
| 1544 |
force_single_topic_prompt=force_single_topic_prompt,
|
| 1545 |
group_name=group_value,
|
| 1546 |
-
|
| 1547 |
aws_access_key_textbox=aws_access_key_textbox,
|
| 1548 |
aws_secret_key_textbox=aws_secret_key_textbox,
|
| 1549 |
hf_api_key_textbox=hf_api_key_textbox,
|
|
@@ -1559,7 +1595,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1559 |
max_rows=max_rows,
|
| 1560 |
existing_logged_content=all_logged_content,
|
| 1561 |
original_full_file_name=original_file_name,
|
| 1562 |
-
|
| 1563 |
progress=progress
|
| 1564 |
)
|
| 1565 |
|
|
@@ -1598,8 +1634,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1598 |
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 1599 |
column_clean = clean_column_name(chosen_cols, max_length=20)
|
| 1600 |
|
| 1601 |
-
if "Group" in acc_reference_df.columns:
|
| 1602 |
-
|
| 1603 |
|
| 1604 |
acc_reference_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_reference_table_" + model_choice_clean_short + ".csv"
|
| 1605 |
acc_topic_summary_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_unique_topics_" + model_choice_clean_short + ".csv"
|
|
@@ -1624,7 +1659,13 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1624 |
|
| 1625 |
# Outputs for markdown table output
|
| 1626 |
unique_table_df_display_table = acc_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1627 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1628 |
|
| 1629 |
acc_input_tokens, acc_output_tokens, acc_number_of_calls = calculate_tokens_from_metadata(acc_whole_conversation_metadata, model_choice, model_name_map)
|
| 1630 |
|
|
@@ -1814,7 +1855,7 @@ def all_in_one_pipeline(
|
|
| 1814 |
model_name_map_state: dict = model_name_map,
|
| 1815 |
usage_logs_location: str = "",
|
| 1816 |
existing_logged_content:list=list(),
|
| 1817 |
-
|
| 1818 |
model: object = None,
|
| 1819 |
tokenizer: object = None,
|
| 1820 |
assistant_model: object = None,
|
|
@@ -1869,7 +1910,7 @@ def all_in_one_pipeline(
|
|
| 1869 |
model_name_map_state (dict, optional): Mapping of model names. Defaults to model_name_map.
|
| 1870 |
usage_logs_location (str, optional): Location for usage logs. Defaults to "".
|
| 1871 |
existing_logged_content (list, optional): Existing logged content. Defaults to list().
|
| 1872 |
-
|
| 1873 |
model (object, optional): Loaded local model object. Defaults to None.
|
| 1874 |
tokenizer (object, optional): Loaded local tokenizer object. Defaults to None.
|
| 1875 |
assistant_model (object, optional): Loaded local assistant model object. Defaults to None.
|
|
@@ -1947,7 +1988,7 @@ def all_in_one_pipeline(
|
|
| 1947 |
force_zero_shot_radio=force_zero_shot_choice,
|
| 1948 |
in_excel_sheets=in_excel_sheets,
|
| 1949 |
force_single_topic_radio=force_single_topic_choice,
|
| 1950 |
-
|
| 1951 |
aws_access_key_textbox=aws_access_key_text,
|
| 1952 |
aws_secret_key_textbox=aws_secret_key_text,
|
| 1953 |
hf_api_key_textbox=hf_api_key_text,
|
|
@@ -1959,7 +2000,7 @@ def all_in_one_pipeline(
|
|
| 1959 |
tokenizer=tokenizer,
|
| 1960 |
assistant_model=assistant_model,
|
| 1961 |
max_rows=max_rows,
|
| 1962 |
-
|
| 1963 |
)
|
| 1964 |
|
| 1965 |
total_input_tokens += out_input_tokens
|
|
@@ -1973,6 +2014,60 @@ def all_in_one_pipeline(
|
|
| 1973 |
text_output_file_list_state = out_file_paths_1
|
| 1974 |
log_files_output_list_state = out_log_files
|
| 1975 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1976 |
# 2) Deduplication
|
| 1977 |
(
|
| 1978 |
ref_df_loaded,
|
|
@@ -2009,8 +2104,6 @@ def all_in_one_pipeline(
|
|
| 2009 |
|
| 2010 |
summary_reference_table_sample_state, summarised_references_markdown = sample_reference_table_summaries(ref_df_after_dedup, random_seed)
|
| 2011 |
|
| 2012 |
-
print("model:", model)
|
| 2013 |
-
|
| 2014 |
(
|
| 2015 |
_summary_reference_table_sample_state,
|
| 2016 |
master_unique_topics_df_revised_summaries_state,
|
|
@@ -2128,8 +2221,13 @@ def all_in_one_pipeline(
|
|
| 2128 |
|
| 2129 |
|
| 2130 |
# Map to the UI outputs list expected by the new single-call wiring
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2131 |
return (
|
| 2132 |
-
|
| 2133 |
out_topics_table,
|
| 2134 |
unique_df_after_dedup,
|
| 2135 |
ref_df_after_dedup,
|
|
|
|
| 324 |
batch_basic_response_df:pd.DataFrame,
|
| 325 |
model_name_map:dict,
|
| 326 |
group_name:str = "All",
|
| 327 |
+
produce_structured_summary_radio:str = "No",
|
| 328 |
first_run: bool = False,
|
| 329 |
return_logs: bool = False,
|
| 330 |
output_folder:str=OUTPUT_FOLDER) -> Tuple:
|
|
|
|
| 349 |
- batch_basic_response_df (pd.DataFrame): The dataframe that contains the response data.
|
| 350 |
- model_name_map (dict): The dictionary that maps the model choice to the model name.
|
| 351 |
- group_name (str, optional): The name of the current group.
|
| 352 |
+
- produce_structured_summary_radio (str, optional): Whether the option to produce structured summaries has been selected.
|
| 353 |
- first_run (bool): A boolean indicating if this is the first run through this function in this process. Defaults to False.
|
| 354 |
- output_folder (str): The name of the folder where output files are saved.
|
| 355 |
"""
|
|
|
|
| 405 |
else:
|
| 406 |
# Something went wrong with the table output, so add empty columns
|
| 407 |
print("Table output has wrong number of columns, adding with blank values")
|
| 408 |
+
# First, rename first two columns that should always exist.
|
| 409 |
+
new_column_names = {
|
| 410 |
+
topic_with_response_df.columns[0]: "General topic",
|
| 411 |
+
topic_with_response_df.columns[1]: "Subtopic"
|
| 412 |
+
}
|
| 413 |
+
topic_with_response_df.rename(columns=new_column_names, inplace=True)
|
| 414 |
+
|
| 415 |
+
# Add empty columns if they are not present
|
| 416 |
if "Sentiment" not in topic_with_response_df.columns:
|
| 417 |
topic_with_response_df["Sentiment"] = "Not assessed"
|
| 418 |
if "Response References" not in topic_with_response_df.columns:
|
|
|
|
| 446 |
# Iterate through each row in the original DataFrame
|
| 447 |
for index, row in topic_with_response_df.iterrows():
|
| 448 |
references = re.findall(r'\d+', str(row.iloc[3])) if pd.notna(row.iloc[3]) else []
|
| 449 |
+
|
| 450 |
+
if batch_size_number == 1: references = "1"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
|
| 452 |
# Filter out references that are outside the valid range
|
| 453 |
if references:
|
|
|
|
| 459 |
# If any reference can't be converted to int, skip this row
|
| 460 |
print("Response value could not be converted to number:", references)
|
| 461 |
continue
|
| 462 |
+
else:
|
| 463 |
+
references = ""
|
| 464 |
|
| 465 |
topic = row.iloc[0] if pd.notna(row.iloc[0]) else ""
|
| 466 |
subtopic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
|
| 467 |
sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
|
| 468 |
summary = row.iloc[4] if pd.notna(row.iloc[4]) else ""
|
| 469 |
+
|
| 470 |
# If the reference response column is very long, and there's nothing in the summary column, assume that the summary was put in the reference column
|
| 471 |
if not summary and (len(str(row.iloc[3])) > 30):
|
| 472 |
+
summary = row.iloc[3]
|
| 473 |
+
|
| 474 |
+
index_row = index
|
| 475 |
+
|
| 476 |
+
if produce_structured_summary_radio != "Yes": summary = row_number_string_start + summary
|
| 477 |
+
|
| 478 |
+
if references:
|
| 479 |
+
existing_reference_numbers = True
|
| 480 |
+
# Create a new entry for each reference number
|
| 481 |
+
for ref in references:
|
| 482 |
+
# Add start_row back onto reference_number
|
| 483 |
+
if batch_basic_response_df.empty:
|
| 484 |
+
try:
|
| 485 |
+
response_ref_no = str(int(ref) + int(start_row))
|
| 486 |
+
except ValueError:
|
| 487 |
+
print("Reference is not a number")
|
| 488 |
+
continue
|
| 489 |
+
else:
|
| 490 |
+
try:
|
| 491 |
+
response_ref_no = batch_basic_response_df.loc[batch_basic_response_df["Reference"]==str(ref), "Original Reference"].iloc[0]
|
| 492 |
+
except ValueError:
|
| 493 |
+
print("Reference is not a number")
|
| 494 |
+
continue
|
| 495 |
+
|
| 496 |
+
reference_data.append({
|
| 497 |
+
'Response References': response_ref_no,
|
| 498 |
+
'General topic': topic,
|
| 499 |
+
'Subtopic': subtopic,
|
| 500 |
+
'Sentiment': sentiment,
|
| 501 |
+
'Summary': summary,
|
| 502 |
+
"Start row of group": start_row_reported
|
| 503 |
+
})
|
| 504 |
+
else:
|
| 505 |
+
existing_reference_numbers = False
|
| 506 |
+
# In this case, set to 0 to show that this applies to no specific reference number
|
| 507 |
+
response_ref_no = 0
|
| 508 |
|
| 509 |
reference_data.append({
|
| 510 |
'Response References': response_ref_no,
|
|
|
|
| 531 |
out_reference_df.drop_duplicates(["Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
|
| 532 |
|
| 533 |
# Try converting response references column to int, keep as string if fails
|
| 534 |
+
if existing_reference_numbers is True:
|
| 535 |
+
try:
|
| 536 |
+
out_reference_df["Response References"] = out_reference_df["Response References"].astype(int)
|
| 537 |
+
except Exception as e:
|
| 538 |
+
print("Could not convert Response References column to integer due to", e)
|
| 539 |
|
| 540 |
out_reference_df.sort_values(["Start row of group", "Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
|
| 541 |
|
|
|
|
| 725 |
output_folder:str=OUTPUT_FOLDER,
|
| 726 |
force_single_topic_prompt:str=force_single_topic_prompt,
|
| 727 |
group_name:str="All",
|
| 728 |
+
produce_structured_summary_radio:str="No",
|
| 729 |
aws_access_key_textbox:str='',
|
| 730 |
aws_secret_key_textbox:str='',
|
| 731 |
hf_api_key_textbox:str='',
|
|
|
|
| 741 |
assistant_model:object=list(),
|
| 742 |
max_rows:int=max_rows,
|
| 743 |
original_full_file_name:str="",
|
| 744 |
+
additional_instructions_summary_format:str="",
|
| 745 |
progress=Progress(track_tqdm=False)):
|
| 746 |
|
| 747 |
'''
|
|
|
|
| 779 |
- force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
|
| 780 |
- in_excel_sheets (List[str], optional): List of excel sheets to load from input file.
|
| 781 |
- force_single_topic_radio (str, optional): Should the model be forced to assign only one single topic to each response (effectively a classifier).
|
| 782 |
+
- produce_structured_summary_radio (str, optional): Should the model create a structured summary instead of extracting topics.
|
| 783 |
- output_folder (str, optional): Output folder where results will be stored.
|
| 784 |
- force_single_topic_prompt (str, optional): The prompt for forcing the model to assign only one single topic to each response.
|
| 785 |
- aws_access_key_textbox (str, optional): AWS access key for account with Bedrock permissions.
|
|
|
|
| 796 |
- assistant_model: Assistant model object for local inference.
|
| 797 |
- max_rows: The maximum number of rows to process.
|
| 798 |
- original_full_file_name: The original full file name.
|
| 799 |
+
- additional_instructions_summary_format: Initial instructions to guide the format for the initial summary of the topics.
|
| 800 |
- progress (Progress): A progress tracker.
|
| 801 |
|
| 802 |
'''
|
|
|
|
| 900 |
elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = sentiment_prefix + negative_or_positive_sentiment_prompt + sentiment_suffix
|
| 901 |
elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "" # Just remove line completely. Previous: sentiment_prefix + do_not_assess_sentiment_prompt + sentiment_suffix
|
| 902 |
else: sentiment_prompt = sentiment_prefix + default_sentiment_prompt + sentiment_suffix
|
| 903 |
+
|
| 904 |
+
if context_textbox: context_textbox = "The context of this analysis is '" + context_textbox + "'."
|
| 905 |
+
else: context_textbox = ""
|
| 906 |
|
| 907 |
topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
|
| 908 |
total_batches_to_do = num_batches - latest_batch_completed
|
|
|
|
| 1017 |
if existing_topic_summary_df['Description'].isnull().all():
|
| 1018 |
existing_topic_summary_df.drop("Description", axis = 1, inplace = True)
|
| 1019 |
|
| 1020 |
+
if produce_structured_summary_radio == "Yes":
|
| 1021 |
if "General topic" in topics_df_for_markdown.columns:
|
| 1022 |
+
topics_df_for_markdown = topics_df_for_markdown.rename(columns={"General topic":"Main heading"})
|
| 1023 |
if "Subtopic" in topics_df_for_markdown.columns:
|
| 1024 |
topics_df_for_markdown = topics_df_for_markdown.rename(columns={"Subtopic":"Subheading"})
|
| 1025 |
|
|
|
|
| 1035 |
topic_assignment_prompt = topic_assignment_prompt.replace("Assign topics", "Assign a topic").replace("assign Subtopics", "assign a Subtopic").replace("Subtopics", "Subtopic").replace("Topics", "Topic").replace("topics", "a topic")
|
| 1036 |
|
| 1037 |
# Format the summary prompt with the response table and topics
|
| 1038 |
+
if produce_structured_summary_radio != "Yes":
|
| 1039 |
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table,
|
| 1040 |
topics=unique_topics_markdown,
|
| 1041 |
topic_assignment=topic_assignment_prompt,
|
| 1042 |
force_single_topic=force_single_topic_prompt,
|
| 1043 |
sentiment_choices=sentiment_prompt,
|
| 1044 |
response_reference_format=response_reference_format,
|
| 1045 |
+
add_existing_topics_summary_format=additional_instructions_summary_format)
|
| 1046 |
else:
|
| 1047 |
formatted_summary_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table,
|
| 1048 |
+
topics=unique_topics_markdown, summary_format=additional_instructions_summary_format)
|
| 1049 |
|
| 1050 |
full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
|
| 1051 |
|
|
|
|
| 1062 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, formatted_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, tokenizer, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=add_existing_topics_assistant_prefill, master = True)
|
| 1063 |
|
| 1064 |
# Return output tables
|
| 1065 |
+
topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, model_name_map, group_name, produce_structured_summary_radio, first_run=False, output_folder=output_folder)
|
| 1066 |
|
| 1067 |
full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
|
| 1068 |
|
|
|
|
| 1101 |
|
| 1102 |
# Outputs for markdown table output
|
| 1103 |
unique_table_df_display_table = new_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1104 |
+
|
| 1105 |
+
if produce_structured_summary_radio == "Yes":
|
| 1106 |
+
unique_table_df_display_table = unique_table_df_display_table[["General topic", "Subtopic", "Summary"]]
|
| 1107 |
+
unique_table_df_display_table.rename(columns={"General topic":"Main heading", "Subtopic":"Subheading"}, inplace=True)
|
| 1108 |
+
else:
|
| 1109 |
+
unique_table_df_display_table = unique_table_df_display_table[["General topic", "Subtopic", "Sentiment", "Number of responses", "Summary"]]
|
| 1110 |
+
|
| 1111 |
+
unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
|
| 1112 |
|
| 1113 |
whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
|
| 1114 |
|
|
|
|
| 1135 |
#print("Using AWS Bedrock model:", model_choice)
|
| 1136 |
|
| 1137 |
# Format the summary prompt with the response table and topics
|
| 1138 |
+
if produce_structured_summary_radio != "Yes":
|
| 1139 |
formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt,
|
| 1140 |
+
response_reference_format=response_reference_format, add_existing_topics_summary_format=additional_instructions_summary_format)
|
| 1141 |
else:
|
| 1142 |
unique_topics_markdown="No suggested headings for this summary"
|
| 1143 |
formatted_initial_table_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown)
|
|
|
|
| 1150 |
|
| 1151 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, formatted_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, tokenizer,bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=initial_table_assistant_prefill)
|
| 1152 |
|
| 1153 |
+
topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df, reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, model_name_map, group_name, produce_structured_summary_radio, first_run=True, output_folder=output_folder)
|
| 1154 |
|
| 1155 |
# If error in table parsing, leave function
|
| 1156 |
if is_error == True: raise Exception("Error in output table parsing")
|
|
|
|
| 1272 |
|
| 1273 |
# Outputs for markdown table output
|
| 1274 |
unique_table_df_display_table = final_out_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1275 |
+
|
| 1276 |
+
if produce_structured_summary_radio == "Yes":
|
| 1277 |
+
unique_table_df_display_table = unique_table_df_display_table[["General topic", "Subtopic", "Summary"]]
|
| 1278 |
+
unique_table_df_display_table.rename(columns={"General topic":"Main heading", "Subtopic":"Subheading"}, inplace=True)
|
| 1279 |
+
else:
|
| 1280 |
+
unique_table_df_display_table = unique_table_df_display_table[["General topic", "Subtopic", "Sentiment", "Number of responses", "Summary"]]
|
| 1281 |
+
|
| 1282 |
+
unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
|
| 1283 |
|
| 1284 |
# Ensure that we are only returning the final results to outputs
|
| 1285 |
out_file_paths = [x for x in out_file_paths if '_final_' in x]
|
|
|
|
| 1348 |
force_zero_shot_radio: str = "No",
|
| 1349 |
in_excel_sheets: List[str] = list(),
|
| 1350 |
force_single_topic_radio: str = "No",
|
| 1351 |
+
produce_structured_summary_radio: str = "No",
|
| 1352 |
aws_access_key_textbox:str="",
|
| 1353 |
aws_secret_key_textbox:str="",
|
| 1354 |
hf_api_key_textbox:str="",
|
| 1355 |
azure_api_key_textbox:str="",
|
| 1356 |
output_folder: str = OUTPUT_FOLDER,
|
| 1357 |
existing_logged_content:list=list(),
|
| 1358 |
+
additional_instructions_summary_format:str="",
|
| 1359 |
force_single_topic_prompt: str = force_single_topic_prompt,
|
| 1360 |
max_tokens: int = max_tokens,
|
| 1361 |
model_name_map: dict = model_name_map,
|
|
|
|
| 1366 |
tokenizer:object=None,
|
| 1367 |
assistant_model:object=None,
|
| 1368 |
max_rows:int=max_rows,
|
| 1369 |
+
progress=Progress(track_tqdm=True) # type: ignore
|
| 1370 |
) -> Tuple: # Mimicking the return tuple structure of extract_topics
|
| 1371 |
"""
|
| 1372 |
A wrapper function that iterates through unique values in a specified grouping column
|
|
|
|
| 1402 |
:param force_zero_shot_radio: Option to force responses into zero-shot topics.
|
| 1403 |
:param in_excel_sheets: List of Excel sheet names if applicable.
|
| 1404 |
:param force_single_topic_radio: Option to force a single topic per response.
|
| 1405 |
+
:param produce_structured_summary_radio: Option to produce a structured summary.
|
| 1406 |
:param aws_access_key_textbox: AWS access key for Bedrock.
|
| 1407 |
:param aws_secret_key_textbox: AWS secret key for Bedrock.
|
| 1408 |
:param hf_api_key_textbox: Hugging Face API key for local models.
|
|
|
|
| 1410 |
:param output_folder: The folder where output files will be saved.
|
| 1411 |
:param existing_logged_content: A list of existing logged content.
|
| 1412 |
:param force_single_topic_prompt: Prompt for forcing a single topic.
|
| 1413 |
+
:param additional_instructions_summary_format: Initial instructions to guide the format for the initial summary of the topics.
|
| 1414 |
:param max_tokens: Maximum tokens for LLM generation.
|
| 1415 |
:param model_name_map: Dictionary mapping model names to their properties.
|
| 1416 |
:param max_time_for_loop: Maximum time allowed for the processing loop.
|
|
|
|
| 1579 |
output_folder=output_folder,
|
| 1580 |
force_single_topic_prompt=force_single_topic_prompt,
|
| 1581 |
group_name=group_value,
|
| 1582 |
+
produce_structured_summary_radio=produce_structured_summary_radio,
|
| 1583 |
aws_access_key_textbox=aws_access_key_textbox,
|
| 1584 |
aws_secret_key_textbox=aws_secret_key_textbox,
|
| 1585 |
hf_api_key_textbox=hf_api_key_textbox,
|
|
|
|
| 1595 |
max_rows=max_rows,
|
| 1596 |
existing_logged_content=all_logged_content,
|
| 1597 |
original_full_file_name=original_file_name,
|
| 1598 |
+
additional_instructions_summary_format=additional_instructions_summary_format,
|
| 1599 |
progress=progress
|
| 1600 |
)
|
| 1601 |
|
|
|
|
| 1634 |
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 1635 |
column_clean = clean_column_name(chosen_cols, max_length=20)
|
| 1636 |
|
| 1637 |
+
if "Group" in acc_reference_df.columns:
|
|
|
|
| 1638 |
|
| 1639 |
acc_reference_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_reference_table_" + model_choice_clean_short + ".csv"
|
| 1640 |
acc_topic_summary_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_unique_topics_" + model_choice_clean_short + ".csv"
|
|
|
|
| 1659 |
|
| 1660 |
# Outputs for markdown table output
|
| 1661 |
unique_table_df_display_table = acc_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1662 |
+
if produce_structured_summary_radio == "Yes":
|
| 1663 |
+
unique_table_df_display_table = unique_table_df_display_table[["General topic", "Subtopic", "Summary", "Group"]]
|
| 1664 |
+
unique_table_df_display_table.rename(columns={"General topic":"Main heading", "Subtopic":"Subheading"}, inplace=True)
|
| 1665 |
+
acc_markdown_output = unique_table_df_display_table.to_markdown(index=False)
|
| 1666 |
+
else:
|
| 1667 |
+
acc_markdown_output = unique_table_df_display_table[["General topic", "Subtopic", "Sentiment", "Number of responses", "Summary", "Group"]].to_markdown(index=False)
|
| 1668 |
+
|
| 1669 |
|
| 1670 |
acc_input_tokens, acc_output_tokens, acc_number_of_calls = calculate_tokens_from_metadata(acc_whole_conversation_metadata, model_choice, model_name_map)
|
| 1671 |
|
|
|
|
| 1855 |
model_name_map_state: dict = model_name_map,
|
| 1856 |
usage_logs_location: str = "",
|
| 1857 |
existing_logged_content:list=list(),
|
| 1858 |
+
additional_instructions_summary_format:str="",
|
| 1859 |
model: object = None,
|
| 1860 |
tokenizer: object = None,
|
| 1861 |
assistant_model: object = None,
|
|
|
|
| 1910 |
model_name_map_state (dict, optional): Mapping of model names. Defaults to model_name_map.
|
| 1911 |
usage_logs_location (str, optional): Location for usage logs. Defaults to "".
|
| 1912 |
existing_logged_content (list, optional): Existing logged content. Defaults to list().
|
| 1913 |
+
additional_instructions_summary_format (str, optional): Initial instructions to guide the format for the initial summary of the topics. Defaults to "".
|
| 1914 |
model (object, optional): Loaded local model object. Defaults to None.
|
| 1915 |
tokenizer (object, optional): Loaded local tokenizer object. Defaults to None.
|
| 1916 |
assistant_model (object, optional): Loaded local assistant model object. Defaults to None.
|
|
|
|
| 1988 |
force_zero_shot_radio=force_zero_shot_choice,
|
| 1989 |
in_excel_sheets=in_excel_sheets,
|
| 1990 |
force_single_topic_radio=force_single_topic_choice,
|
| 1991 |
+
produce_structured_summary_radio=produce_structures_summary_choice,
|
| 1992 |
aws_access_key_textbox=aws_access_key_text,
|
| 1993 |
aws_secret_key_textbox=aws_secret_key_text,
|
| 1994 |
hf_api_key_textbox=hf_api_key_text,
|
|
|
|
| 2000 |
tokenizer=tokenizer,
|
| 2001 |
assistant_model=assistant_model,
|
| 2002 |
max_rows=max_rows,
|
| 2003 |
+
additional_instructions_summary_format=additional_instructions_summary_format
|
| 2004 |
)
|
| 2005 |
|
| 2006 |
total_input_tokens += out_input_tokens
|
|
|
|
| 2014 |
text_output_file_list_state = out_file_paths_1
|
| 2015 |
log_files_output_list_state = out_log_files
|
| 2016 |
|
| 2017 |
+
# If producing structured summaries, return the outputs after extraction
|
| 2018 |
+
if produce_structures_summary_choice == "Yes":
|
| 2019 |
+
|
| 2020 |
+
# Write logged content to file
|
| 2021 |
+
column_clean = clean_column_name(chosen_cols, max_length=20)
|
| 2022 |
+
model_choice_clean = model_name_map[model_choice]["short_name"]
|
| 2023 |
+
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 2024 |
+
|
| 2025 |
+
out_logged_content_df_path = output_folder + original_file_name + "_col_" + column_clean + "_logs_" + model_choice_clean_short + ".json"
|
| 2026 |
+
|
| 2027 |
+
with open(out_logged_content_df_path, "w", encoding='utf-8-sig', errors='replace') as f:
|
| 2028 |
+
f.write(json.dumps(out_logged_content))
|
| 2029 |
+
|
| 2030 |
+
log_files_output_list_state.append(out_logged_content_df_path)
|
| 2031 |
+
out_log_files.append(out_logged_content_df_path)
|
| 2032 |
+
|
| 2033 |
+
# Map to the UI outputs list expected by the new single-call wiring
|
| 2034 |
+
return (
|
| 2035 |
+
display_markdown,
|
| 2036 |
+
out_topics_table,
|
| 2037 |
+
out_topic_summary_df,
|
| 2038 |
+
out_reference_df,
|
| 2039 |
+
topic_extraction_output_files,
|
| 2040 |
+
text_output_file_list_state,
|
| 2041 |
+
out_latest_batch_completed,
|
| 2042 |
+
out_log_files,
|
| 2043 |
+
log_files_output_list_state,
|
| 2044 |
+
out_conversation_metadata,
|
| 2045 |
+
total_time_taken,
|
| 2046 |
+
out_file_paths_1,
|
| 2047 |
+
list(), # summarisation_input_files is not available yet
|
| 2048 |
+
out_gradio_df,
|
| 2049 |
+
list(), # modification_input_files placeholder
|
| 2050 |
+
out_join_files,
|
| 2051 |
+
out_missing_df,
|
| 2052 |
+
total_input_tokens,
|
| 2053 |
+
total_output_tokens,
|
| 2054 |
+
total_number_of_calls,
|
| 2055 |
+
out_message[0],
|
| 2056 |
+
pd.DataFrame(), # summary_reference_table_sample_state is not available yet
|
| 2057 |
+
"", # summarised_references_markdown is not available yet
|
| 2058 |
+
out_topic_summary_df,
|
| 2059 |
+
out_reference_df,
|
| 2060 |
+
list(), # summary_output_files is not available yet
|
| 2061 |
+
list(), # summarised_outputs_list is not available yet
|
| 2062 |
+
0, # latest_summary_completed_num is not available yet
|
| 2063 |
+
list(), # overall_summarisation_input_files is not available yet
|
| 2064 |
+
list(), # overall_summary_output_files is not available yet
|
| 2065 |
+
"", # overall_summarised_output_markdown is not available yet
|
| 2066 |
+
pd.DataFrame(), # summarised_output_df is not available yet
|
| 2067 |
+
out_logged_content
|
| 2068 |
+
)
|
| 2069 |
+
|
| 2070 |
+
|
| 2071 |
# 2) Deduplication
|
| 2072 |
(
|
| 2073 |
ref_df_loaded,
|
|
|
|
| 2104 |
|
| 2105 |
summary_reference_table_sample_state, summarised_references_markdown = sample_reference_table_summaries(ref_df_after_dedup, random_seed)
|
| 2106 |
|
|
|
|
|
|
|
| 2107 |
(
|
| 2108 |
_summary_reference_table_sample_state,
|
| 2109 |
master_unique_topics_df_revised_summaries_state,
|
|
|
|
| 2221 |
|
| 2222 |
|
| 2223 |
# Map to the UI outputs list expected by the new single-call wiring
|
| 2224 |
+
# Use the original markdown with renamed columns if produce_structured_summary_radio is "Yes"
|
| 2225 |
+
final_display_markdown = display_markdown_updated if display_markdown_updated else display_markdown
|
| 2226 |
+
if produce_structures_summary_choice == "Yes":
|
| 2227 |
+
final_display_markdown = unique_table_df_display_table_markdown
|
| 2228 |
+
|
| 2229 |
return (
|
| 2230 |
+
final_display_markdown,
|
| 2231 |
out_topics_table,
|
| 2232 |
unique_df_after_dedup,
|
| 2233 |
ref_df_after_dedup,
|
tools/prompts.py
CHANGED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
|
| 5 |
generic_system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset."""
|
| 6 |
|
| 7 |
-
system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset called '{column_name}'.
|
| 8 |
|
| 9 |
markdown_additional_prompt = """ You will be given a request for a markdown table. You must respond with ONLY the markdown table. Do not include any introduction, explanation, or concluding text."""
|
| 10 |
|
|
@@ -78,16 +78,17 @@ default_sentiment_prompt = "In the third column named 'Sentiment', write the sen
|
|
| 78 |
|
| 79 |
structured_summary_prompt = """Your task is to write a structured summary for open text responses.
|
| 80 |
|
| 81 |
-
Create a new markdown table based on the response table below with the headings 'Main heading', 'Subheading'
|
| 82 |
|
| 83 |
-
For each of the responses in the Response table, you will create a row for each summary associated with each of the Main headings and Subheadings from the Headings table. If there is no Headings table, created your own headings. In the first and second columns, write a Main heading and Subheading from the Headings table.
|
|
|
|
| 84 |
|
| 85 |
Do not add any other columns. Do not add any other text to your response.
|
| 86 |
|
| 87 |
Responses are shown in the following Response table:
|
| 88 |
{response_table}
|
| 89 |
|
| 90 |
-
Headings
|
| 91 |
{topics}
|
| 92 |
|
| 93 |
New table:"""
|
|
|
|
| 4 |
|
| 5 |
generic_system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset."""
|
| 6 |
|
| 7 |
+
system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset called '{column_name}'. {consultation_context}."""
|
| 8 |
|
| 9 |
markdown_additional_prompt = """ You will be given a request for a markdown table. You must respond with ONLY the markdown table. Do not include any introduction, explanation, or concluding text."""
|
| 10 |
|
|
|
|
| 78 |
|
| 79 |
structured_summary_prompt = """Your task is to write a structured summary for open text responses.
|
| 80 |
|
| 81 |
+
Create a new markdown table based on the response table below with the headings 'Main heading', 'Subheading' and 'Summary'.
|
| 82 |
|
| 83 |
+
For each of the responses in the Response table, you will create a row for each summary associated with each of the Main headings and Subheadings from the Headings table. If there is no Headings table, create your own headings. In the first and second columns, write a Main heading and Subheading from the Headings table. Then in Summary, write a detailed and comprehensive summary that covers all information relevant to the Main heading and Subheading on the same row.
|
| 84 |
+
{summary_format}
|
| 85 |
|
| 86 |
Do not add any other columns. Do not add any other text to your response.
|
| 87 |
|
| 88 |
Responses are shown in the following Response table:
|
| 89 |
{response_table}
|
| 90 |
|
| 91 |
+
Headings to structure the summary are in the following table:
|
| 92 |
{topics}
|
| 93 |
|
| 94 |
New table:"""
|