import streamlit as st
import pandas as pd
import random
import time
import string
import gspread
import os
import json
import datetime
import re
from oauth2client.service_account import ServiceAccountCredentials
# Set page config at the very beginning
st.set_page_config(page_title="LLM Output Evaluation", layout="wide")
# Define the primary highlight color (keeping it consistent with previous apps)
HIGHLIGHT_COLOR = "#2c7be5"
# --- Utility functions (defined up front so the page functions below can call them) ---
# Load worker-specific stimuli
@st.cache_data
def load_worker_stimuli(worker_id):
    file_path = os.path.join("stimuli", f"worker_{worker_id:02d}.jsonl")
    if not os.path.exists(file_path):
        return None
    stimuli_list = []
    with open(file_path, 'r') as f:
        for line in f:
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Check that all required fields are present
            required_fields = ["stimuli_id", "keyword_sentence", "keyword", "engaged_events",
                               "generalizable_properties", "source", "scene_soft_cluster"]
            if not all(k in item for k in required_fields):
                continue
            # Normalize evoked_emotions into display strings
            emotions = []
            if item.get("evoked_emotions"):
                for emo in item["evoked_emotions"]:
                    if isinstance(emo, dict) and "emotion" in emo and "explanation" in emo:
                        emotions.append(f"{emo['emotion']}: {emo['explanation']}")
                    else:
                        emotions.append(str(emo))
            else:
                emotions = ["None observed"]
            # Build the cleaned entry
            entry = {
                "stimuli_id": item["stimuli_id"],
                "text": item["keyword_sentence"],
                "keyword": item["keyword"],
                "scene": int(item["scene_soft_cluster"]),
                "source": item["source"],
                "scene_output": {
                    "1. Engaged Events: What is happening in the situation?": item["engaged_events"],
                    f"2. Generalizable Properties: What are the relevant properties of {item['keyword']} in the situation?": item["generalizable_properties"],
                    "3. Evoked Emotions: Which emotions do you observe in the situation?": emotions,
                },
            }
            stimuli_list.append(entry)
    return stimuli_list
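# For reference, each line of a worker file is expected to look roughly like the
# record below (the values here are illustrative, not taken from the real stimuli files):
# {"stimuli_id": "s001", "keyword_sentence": "The old bridge creaked in the storm.",
#  "keyword": "bridge", "engaged_events": ["..."], "generalizable_properties": ["..."],
#  "evoked_emotions": [{"emotion": "unease", "explanation": "..."}],
#  "source": "corpus", "scene_soft_cluster": 3}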
def highlight_keyword(sentence, keyword, color=HIGHLIGHT_COLOR):
    """Highlights a specific keyword in a sentence, ignoring case."""
    # Use word boundaries (\b) to match whole words; wrap each match in a colored <b> tag
    return re.sub(r'\b' + re.escape(keyword) + r'\b',
                  rf'<span style="color:{color}"><b>\g<0></b></span>',
                  sentence, flags=re.IGNORECASE)
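# e.g. highlight_keyword("A cat sat on the mat.", "cat")
#   -> 'A <span style="color:#2c7be5"><b>cat</b></span> sat on the mat.'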
def generate_passcode(worker_id):
    suffix = ''.join(random.choices(string.ascii_uppercase + string.digits, k=6))
    return f"EXP2-main-W{worker_id:02d}-{suffix}"
def get_google_creds():
    service_account_json = os.getenv("SERVICE_ACCOUNT_JSON")
    if not service_account_json:
        st.error("Google service account credentials (SERVICE_ACCOUNT_JSON) not found in environment variables. Please configure your Streamlit app secrets or local environment.")
        return None
    try:
        creds_dict = json.loads(service_account_json)
        scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
        return gspread.authorize(creds)
    except json.JSONDecodeError:
        st.error("Invalid JSON format in SERVICE_ACCOUNT_JSON environment variable. Please ensure it's a single, valid JSON string.")
        return None
    except Exception as e:
        st.error(f"Error loading Google credentials: {e}")
        return None
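# One way to supply the credential when running locally (assuming a downloaded
# service-account key file named service_account.json; adjust the path to yours):
#   export SERVICE_ACCOUNT_JSON="$(cat service_account.json)"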
def upload_to_google_drive(response_df):
    if response_df.empty:
        st.warning("No responses to upload.")
        return
    try:
        client = get_google_creds()
        if client is None:
            st.error("❌ Google credentials not loaded. Cannot upload results.")
            return
        sheet_name = "EXP2-main"  # Sheet name for Experiment 2
        try:
            sheet = client.open(sheet_name).sheet1
        except gspread.exceptions.SpreadsheetNotFound:
            st.info(f"Creating new Google Sheet: {sheet_name}")
            sheet = client.create(sheet_name).sheet1

        # Add headers only if the sheet is empty; warn if existing headers differ
        current_sheet_headers = sheet.row_values(1) if sheet.row_count > 0 else []
        expected_headers = list(response_df.columns)
        if not current_sheet_headers:
            sheet.append_row(expected_headers)
        elif current_sheet_headers != expected_headers:
            st.warning("Existing sheet headers do not match the new data. Rows will be appended, but columns may be misaligned.")

        # Prepare data: replace inf with None and NaN with empty strings, then convert to a list of rows
        response_df_clean = response_df.replace([float("inf"), float("-inf")], None).fillna("")
        data_to_upload = response_df_clean.values.tolist()

        # Append all rows at once for efficiency
        if data_to_upload:
            sheet.append_rows(data_to_upload)
            st.success("✅ Your responses have been recorded successfully.")
            # Clear responses after a successful upload to prevent re-uploading on rerun
            st.session_state.responses = []
        else:
            st.warning("No new responses to upload.")
    except Exception as e:
        st.error("❌ Error uploading to Google Drive:")
        st.error(f"Details: {e}")
# Function to record responses for the current section
def record_section_responses(idx, sec_idx, current_sample_data, current_section_title, acc_score, comp_score, interp_score):
    worker_id = st.session_state.get("worker_id", "N/A")
    passcode = st.session_state.get("passcode", "N/A")
    timestamp = datetime.datetime.now().isoformat()
    # Compute response_time_sec once, *before* building the records, since time.time() keeps moving
    start_time_for_section = st.session_state.get("response_start_time", time.time())
    response_time = time.time() - start_time_for_section
    # Fields common to all three metric rows from this section
    base_record = {
        "timestamp": timestamp,
        "worker_id": worker_id,
        "passcode": passcode,
        "sample_index": idx,
        "section_index_within_sample": sec_idx,
        "section_title": current_section_title,
        "original_text": current_sample_data["text"],
        "keyword": current_sample_data["keyword"],
        "response_time_sec": response_time,
    }
    # Record each metric as a separate row
    st.session_state.responses.append({**base_record, "metric": "Accuracy", "score": acc_score})
    st.session_state.responses.append({**base_record, "metric": "Completeness", "score": comp_score})
    st.session_state.responses.append({**base_record, "metric": "Interpretability", "score": interp_score})
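# Each call therefore appends three long-format rows sharing the same base fields,
# e.g. (scores are illustrative):
#   {..., "metric": "Accuracy", "score": 4}
#   {..., "metric": "Completeness", "score": 5}
#   {..., "metric": "Interpretability", "score": 3}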
def generate_rating_prompt(section_title: str) -> str:
    # Strip the leading "N. " numbering, then keep only the part before the colon
    if ". " in section_title:
        section_title = section_title.split(". ", 1)[1]
    if ":" in section_title:
        section_name = section_title.split(":", 1)[0].strip()
    else:
        section_name = section_title.strip()
    section_name = section_name.lower()
    if "engaged event" in section_name:
        return "How well does this capture the events involving the keyword in this situation? More specifically: "
    elif "generalizable propert" in section_name:  # 'propert' matches both 'property' and 'properties'
        return "How well does this reflect the relevant properties of the keyword in this situation? More specifically: "
    elif "evoked emotion" in section_name:
        return "How well does this capture the emotions evoked by the keyword in this situation? More specifically: "
    else:
        return f"How well does this describe the {section_name}? More specifically: "
# --- Sample data is loaded per worker via load_worker_stimuli() above ---
# --- Page Functions ---
def instructions_1():
    st.title("Experiment 2: LLM Scene Abstraction Evaluation")
    st.header("📖 Instructions (1/2)")
    st.write("""
    Welcome to Experiment 2! Here’s how it works:
    - You will read a sentence that contains a specific **keyword**.
    - You will then see **scene-level information about the keyword** in the given situation, generated by a large language model (LLM).
    - The information is organized into three sections:
        1. **Engaged Events** — What is happening to the keyword in this situation?
        2. **Generalizable Properties** — What context-relevant properties of the keyword are revealed through this situation?
        3. **Evoked Emotions** — What emotions are associated with the keyword in this scene, and why?

    Your task is to **evaluate each section** based on how well it reflects the information conveyed in the original sentence.
    For each section, please rate the following dimensions on a 1–5 scale:
    - **Accuracy** — How accurate is it? Is the content factually consistent with the sentence?
    - **Completeness** — How complete and rich is it? Does it fully capture the relevant aspects of the keyword?
    - **Interpretability** — How interpretable is it? Is it easy to understand?

    If you have questions or feedback, please feel free to let us know via email.
    """, unsafe_allow_html=True)
def render_section_ratings(idx, total_samples, current_sample_data, sec_idx, current_section_title, section_content):
    """Render one scene-output section for the current sample and collect its three ratings."""
    # Progress header
    st.markdown(f"<h3>Sentence {idx + 1} of {total_samples}</h3>", unsafe_allow_html=True)

    # Keyword display
    st.markdown(
        f"<b>Keyword:</b> {current_sample_data.get('keyword', 'N/A')}",
        unsafe_allow_html=True,
    )

    # Text box, with the first occurrence of the keyword in bold
    st.markdown("<b>Text:</b>", unsafe_allow_html=True)
    text = current_sample_data['text']
    keyword = current_sample_data['keyword']
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    text_with_bold = pattern.sub(r"<b>\g<0></b>", text, count=1)
    st.markdown(text_with_bold, unsafe_allow_html=True)

    # Section title, the LLM-generated content, and the section-specific rating prompt
    st.markdown(f"<h4>{current_section_title}</h4>", unsafe_allow_html=True)
    for point in section_content:
        st.markdown(f"- {point}")
    prompt_text = generate_rating_prompt(current_section_title)
    st.markdown(f"{prompt_text}", unsafe_allow_html=True)

    # Rating keys (session state preserves prior selections across reruns)
    acc_key = f"rating_acc_{idx}_{sec_idx}"
    comp_key = f"rating_comp_{idx}_{sec_idx}"
    interp_key = f"rating_interp_{idx}_{sec_idx}"

    # Retrieve currently selected values from session state to pre-fill the radio buttons
    current_acc_val = st.session_state.get(acc_key)
    current_comp_val = st.session_state.get(comp_key)
    current_interp_val = st.session_state.get(interp_key)

    # Accuracy
    st.markdown(
        "<b>[Accuracy]</b> How accurate is it? Is the content factually consistent with the sentence?",
        unsafe_allow_html=True,
    )
    acc = st.radio(
        label="Accuracy",
        options=[1, 2, 3, 4, 5],
        index=current_acc_val - 1 if current_acc_val else None,  # convert value (1-5) to index (0-4)
        key=acc_key,
        horizontal=True,
        label_visibility="collapsed",
    )

    # Completeness
    st.markdown(
        "<b>[Completeness]</b> How complete and rich is it? Does it fully capture the relevant aspects of the keyword?",
        unsafe_allow_html=True,
    )
    comp = st.radio(
        label="Completeness",
        options=[1, 2, 3, 4, 5],
        index=current_comp_val - 1 if current_comp_val else None,
        key=comp_key,
        horizontal=True,
        label_visibility="collapsed",
    )

    # Interpretability
    st.markdown(
        "<b>[Interpretability]</b> How interpretable is it? Is it easy to understand?",
        unsafe_allow_html=True,
    )
    interp = st.radio(
        label="Interpretability",
        options=[1, 2, 3, 4, 5],
        index=current_interp_val - 1 if current_interp_val else None,
        key=interp_key,
        horizontal=True,
        label_visibility="collapsed",
    )

    return acc, comp, interp