import streamlit as st
import pandas as pd
import random
import time
import string
import gspread
import os
import json
import datetime
import re
from oauth2client.service_account import ServiceAccountCredentials
# Set page config at the very beginning
st.set_page_config(page_title="LLM Output Evaluation", layout="wide")
# Define the primary highlight color (keeping it consistent with previous apps)
HIGHLIGHT_COLOR = "#2c7be5"
# --- Utility functions (defined up front so the page functions below can call them) ---
# Load worker-specific stimuli
@st.cache_data
def load_worker_stimuli(worker_id):
    file_path = os.path.join("stimuli", f"worker_{worker_id:02d}.jsonl")
    if not os.path.exists(file_path):
        return None
    stimuli_list = []
    with open(file_path, 'r') as f:
        for line in f:
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Check that all required fields are present
            required_fields = ["stimuli_id", "keyword_sentence", "keyword", "engaged_events",
                               "generalizable_properties", "source", "scene_soft_cluster"]
            if not all(k in item for k in required_fields):
                continue
            # Normalize evoked_emotions into display strings
            emotions = []
            if item.get("evoked_emotions"):
                for emo in item["evoked_emotions"]:
                    if isinstance(emo, dict) and "emotion" in emo and "explanation" in emo:
                        emotions.append(f"{emo['emotion']}: {emo['explanation']}")
                    else:
                        emotions.append(str(emo))
            else:
                emotions = ["None observed"]
            # Build the cleaned entry
            entry = {
                "stimuli_id": item["stimuli_id"],
                "text": item["keyword_sentence"],
                "keyword": item["keyword"],
                "scene": int(item["scene_soft_cluster"]),
                "source": item["source"],
                "scene_output": {
                    "1. Engaged Events: What is happening in the situation?": item["engaged_events"],
                    f"2. Generalizable Properties: What are the relevant properties of {item['keyword']} in the situation?": item["generalizable_properties"],
                    "3. Evoked Emotions: Which emotions do you observe in the situation?": emotions,
                },
            }
            stimuli_list.append(entry)
    return stimuli_list
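# For reference, each line of a worker file is expected to look roughly like the
# record below (the values here are illustrative, not taken from the real stimuli files):
# {"stimuli_id": "s001", "keyword_sentence": "The old bridge creaked in the storm.",
#  "keyword": "bridge", "engaged_events": ["..."], "generalizable_properties": ["..."],
#  "evoked_emotions": [{"emotion": "unease", "explanation": "..."}],
#  "source": "corpus", "scene_soft_cluster": 3}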
def highlight_keyword(sentence, keyword, color=HIGHLIGHT_COLOR):
    """Highlights a specific keyword in a sentence, ignoring case."""
    # Use word boundaries (\b) to match whole words; wrap each match in a colored <b> tag
    return re.sub(r'\b' + re.escape(keyword) + r'\b',
                  rf'<span style="color:{color}"><b>\g<0></b></span>',
                  sentence, flags=re.IGNORECASE)
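# e.g. highlight_keyword("A cat sat on the mat.", "cat")
#   -> 'A <span style="color:#2c7be5"><b>cat</b></span> sat on the mat.'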
def generate_passcode(worker_id):
    suffix = ''.join(random.choices(string.ascii_uppercase + string.digits, k=6))
    return f"EXP2-main-W{worker_id:02d}-{suffix}"
def get_google_creds():
    service_account_json = os.getenv("SERVICE_ACCOUNT_JSON")
    if not service_account_json:
        st.error("Google service account credentials (SERVICE_ACCOUNT_JSON) not found in environment variables. Please configure your Streamlit app secrets or local environment.")
        return None
    try:
        creds_dict = json.loads(service_account_json)
        scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
        return gspread.authorize(creds)
    except json.JSONDecodeError:
        st.error("Invalid JSON format in SERVICE_ACCOUNT_JSON environment variable. Please ensure it's a single, valid JSON string.")
        return None
    except Exception as e:
        st.error(f"Error loading Google credentials: {e}")
        return None
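# One way to supply the credential when running locally (assuming a downloaded
# service-account key file named service_account.json; adjust the path to yours):
#   export SERVICE_ACCOUNT_JSON="$(cat service_account.json)"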
def upload_to_google_drive(response_df):
    if response_df.empty:
        st.warning("No responses to upload.")
        return
    try:
        client = get_google_creds()
        if client is None:
            st.error("❌ Google credentials not loaded. Cannot upload results.")
            return
        sheet_name = "EXP2-main"  # Sheet name for Experiment 2
        try:
            sheet = client.open(sheet_name).sheet1
        except gspread.exceptions.SpreadsheetNotFound:
            st.info(f"Creating new Google Sheet: {sheet_name}")
            sheet = client.create(sheet_name).sheet1

        # Add headers only if the sheet is empty; warn if existing headers differ
        current_sheet_headers = sheet.row_values(1) if sheet.row_count > 0 else []
        expected_headers = list(response_df.columns)
        if not current_sheet_headers:
            sheet.append_row(expected_headers)
        elif current_sheet_headers != expected_headers:
            st.warning("Existing sheet headers do not match the new data. Rows will be appended, but columns may be misaligned.")

        # Prepare data: replace inf with None and NaN with empty strings, then convert to a list of rows
        response_df_clean = response_df.replace([float("inf"), float("-inf")], None).fillna("")
        data_to_upload = response_df_clean.values.tolist()

        # Append all rows at once for efficiency
        if data_to_upload:
            sheet.append_rows(data_to_upload)
            st.success("✅ Your responses have been recorded successfully.")
            # Clear responses after a successful upload to prevent re-uploading on rerun
            st.session_state.responses = []
        else:
            st.warning("No new responses to upload.")
    except Exception as e:
        st.error("❌ Error uploading to Google Drive:")
        st.error(f"Details: {e}")
# Function to record responses for the current section
def record_section_responses(idx, sec_idx, current_sample_data, current_section_title, acc_score, comp_score, interp_score):
    worker_id = st.session_state.get("worker_id", "N/A")
    passcode = st.session_state.get("passcode", "N/A")
    timestamp = datetime.datetime.now().isoformat()
    # Compute response_time_sec once, *before* building the records, since time.time() keeps moving
    start_time_for_section = st.session_state.get("response_start_time", time.time())
    response_time = time.time() - start_time_for_section
    # Fields common to all three metric rows from this section
    base_record = {
        "timestamp": timestamp,
        "worker_id": worker_id,
        "passcode": passcode,
        "sample_index": idx,
        "section_index_within_sample": sec_idx,
        "section_title": current_section_title,
        "original_text": current_sample_data["text"],
        "keyword": current_sample_data["keyword"],
        "response_time_sec": response_time,
    }
    # Record each metric as a separate row
    st.session_state.responses.append({**base_record, "metric": "Accuracy", "score": acc_score})
    st.session_state.responses.append({**base_record, "metric": "Completeness", "score": comp_score})
    st.session_state.responses.append({**base_record, "metric": "Interpretability", "score": interp_score})
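# Each call therefore appends three long-format rows sharing the same base fields,
# e.g. (scores are illustrative):
#   {..., "metric": "Accuracy", "score": 4}
#   {..., "metric": "Completeness", "score": 5}
#   {..., "metric": "Interpretability", "score": 3}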
def generate_rating_prompt(section_title: str) -> str:
    # Strip the leading "N. " numbering, then keep only the part before the colon
    if ". " in section_title:
        section_title = section_title.split(". ", 1)[1]
    if ":" in section_title:
        section_name = section_title.split(":", 1)[0].strip()
    else:
        section_name = section_title.strip()
    section_name = section_name.lower()
    if "engaged event" in section_name:
        return "How well does this capture the events involving the keyword in this situation? More specifically: "
    elif "generalizable propert" in section_name:  # 'propert' matches both 'property' and 'properties'
        return "How well does this reflect the relevant properties of the keyword in this situation? More specifically: "
    elif "evoked emotion" in section_name:
        return "How well does this capture the emotions evoked by the keyword in this situation? More specifically: "
    else:
        return f"How well does this describe the {section_name}? More specifically: "
# --- Sample data is loaded per worker via load_worker_stimuli() above ---
# --- Page Functions ---
def instructions_1():
    st.title("Experiment 2: LLM Scene Abstraction Evaluation")
    st.header("📖 Instructions (1/2)")
    st.write("""
    Welcome to Experiment 2! Here’s how it works:
    - You will read a sentence that contains a specific **keyword**.
    - You will then see **scene-level information about the keyword** in the given situation, generated by a large language model (LLM).
    - The information is organized into three sections:
        1. **Engaged Events** — What is happening to the keyword in this situation?
        2. **Generalizable Properties** — What context-relevant properties of the keyword are revealed through this situation?
        3. **Evoked Emotions** — What emotions are associated with the keyword in this scene, and why?

    Your task is to **evaluate each section** based on how well it reflects the information conveyed in the original sentence.
    For each section, please rate the following dimensions on a 1–5 scale:
    - **Accuracy** — How accurate is it? Is the content factually consistent with the sentence?
    - **Completeness** — How complete and rich is it? Does it fully capture the relevant aspects of the keyword?
    - **Interpretability** — How interpretable is it? Is it easy to understand?

    If you have questions or feedback, please feel free to let us know via email.
    """, unsafe_allow_html=True)
def render_section_ratings(idx, total_samples, current_sample_data, sec_idx, current_section_title, section_content):
    """Render one scene-output section for the current sample and collect its three ratings."""
    # Progress header
    st.markdown(f"<h3>Sentence {idx + 1} of {total_samples}</h3>", unsafe_allow_html=True)

    # Keyword display
    st.markdown(
        f"<b>Keyword:</b> {current_sample_data.get('keyword', 'N/A')}",
        unsafe_allow_html=True,
    )

    # Text box, with the first occurrence of the keyword in bold
    st.markdown("<b>Text:</b>", unsafe_allow_html=True)
    text = current_sample_data['text']
    keyword = current_sample_data['keyword']
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    text_with_bold = pattern.sub(r"<b>\g<0></b>", text, count=1)
    st.markdown(text_with_bold, unsafe_allow_html=True)

    # Section title, the LLM-generated content, and the section-specific rating prompt
    st.markdown(f"<h4>{current_section_title}</h4>", unsafe_allow_html=True)
    for point in section_content:
        st.markdown(f"- {point}")
    prompt_text = generate_rating_prompt(current_section_title)
    st.markdown(f"{prompt_text}", unsafe_allow_html=True)

    # Rating keys (session state preserves prior selections across reruns)
    acc_key = f"rating_acc_{idx}_{sec_idx}"
    comp_key = f"rating_comp_{idx}_{sec_idx}"
    interp_key = f"rating_interp_{idx}_{sec_idx}"

    # Retrieve currently selected values from session state to pre-fill the radio buttons
    current_acc_val = st.session_state.get(acc_key)
    current_comp_val = st.session_state.get(comp_key)
    current_interp_val = st.session_state.get(interp_key)

    # Accuracy
    st.markdown(
        "<b>[Accuracy]</b> How accurate is it? Is the content factually consistent with the sentence?",
        unsafe_allow_html=True,
    )
    acc = st.radio(
        label="Accuracy",
        options=[1, 2, 3, 4, 5],
        index=current_acc_val - 1 if current_acc_val else None,  # convert value (1-5) to index (0-4)
        key=acc_key,
        horizontal=True,
        label_visibility="collapsed",
    )

    # Completeness
    st.markdown(
        "<b>[Completeness]</b> How complete and rich is it? Does it fully capture the relevant aspects of the keyword?",
        unsafe_allow_html=True,
    )
    comp = st.radio(
        label="Completeness",
        options=[1, 2, 3, 4, 5],
        index=current_comp_val - 1 if current_comp_val else None,
        key=comp_key,
        horizontal=True,
        label_visibility="collapsed",
    )

    # Interpretability
    st.markdown(
        "<b>[Interpretability]</b> How interpretable is it? Is it easy to understand?",
        unsafe_allow_html=True,
    )
    interp = st.radio(
        label="Interpretability",
        options=[1, 2, 3, 4, 5],
        index=current_interp_val - 1 if current_interp_val else None,
        key=interp_key,
        horizontal=True,
        label_visibility="collapsed",
    )

    return acc, comp, interp