SuperBench-Eval / app.py
Enderchef's picture
Update app.py
566e353 verified
raw
history blame
26.8 kB
import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset, get_dataset_config_names
import torch
import re
import json
import pandas as pd
import traceback
import spaces
from datetime import datetime
# --- Environment and Caching ---
# It's good practice to ensure the cache directory exists.
CACHE_DIR = "evaluation_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
EVAL_FILE = os.path.join(CACHE_DIR, "eval.jsonl")
# Cache to avoid reloading models and dataset configs
model_cache = {}
benchmark_subject_cache = {}
# Use environment variable for the Hugging Face token
HF_TOKEN = os.environ.get("HF_TOKEN")
# --- Constants for Benchmarks ---
MMLU_DATASET = "cais/mmlu"
# Temporarily remove MMLU-Pro references
# MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"
BENCHMARK_MAP = {
"MMLU": MMLU_DATASET,
# "MMLU-Pro": MMLU_PRO_DATASET # Temporarily removed
}
# --- Data Loading and Preparation ---
def get_all_benchmark_options():
"""
Fetches and caches the available subjects (configs) for each benchmark dataset.
This function now populates a global cache to avoid repeated API calls.
"""
if benchmark_subject_cache:
return benchmark_subject_cache
print("Fetching benchmark configurations for the first time...")
# Only iterate over the allowed benchmarks (MMLU)
for key, dataset_id in BENCHMARK_MAP.items():
try:
# Fetching dataset configurations requires authentication if the dataset is private
subjects = get_dataset_config_names(dataset_id, token=HF_TOKEN)
benchmark_subject_cache[key] = ["ALL"] + sorted([s for s in subjects if s != 'all']) # Sort subjects
except Exception as e:
print(f"Warning: Could not load configs for {key} ({dataset_id}). It might be private or unavailable. Error: {e}")
benchmark_subject_cache[key] = ["ALL"] # Provide a default
print("Benchmark configurations cached.")
return benchmark_subject_cache
# Initialize the cache on startup
ALL_BENCHMARK_SUBJECTS = get_all_benchmark_options()
@spaces.GPU()
def load_model(model_id):
"""
Loads a Hugging Face model and tokenizer, creating a text-generation pipeline.
Uses a cache to avoid reloading models.
"""
if not model_id:
raise ValueError("Model ID cannot be empty.")
gr.Info(f"Attempting to load model: {model_id}...")
if model_id in model_cache:
gr.Info(f"Model '{model_id}' found in cache.")
return model_cache[model_id]
try:
# Use bfloat16 for better performance on modern GPUs
dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
token=HF_TOKEN,
torch_dtype=dtype,
trust_remote_code=True,
low_cpu_mem_usage=True, # Optimization for large models
).to("cuda" if torch.cuda.is_available() else "cpu")
# Create the pipeline for text generation
generator = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
device=0 if torch.cuda.is_available() else -1
)
model_cache[model_id] = generator
gr.Info(f"Model '{model_id}' loaded successfully.")
return generator
except Exception as e:
# Raise a more specific error to be caught by the main evaluation function
raise RuntimeError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token (if required). Error: {e}")
# --- Evaluation Logic ---
def format_prompt(item):
"""Formats the MMLU question and choices into a standardized prompt."""
prompt = f"Question: {item['question']}\n\nChoices:\nA. {item['choices'][0]}\nB. {item['choices'][1]}\nC. {item['choices'][2]}\nD. {item['choices'][3]}\n\nAnswer:"
return prompt, item['answer']
def get_choice_letter(index):
"""Converts a numerical choice index (0-3) to a letter (A-D)."""
return chr(ord('A') + index) if 0 <= index <= 3 else None
def extract_predicted_letter(output_text):
"""
Extracts the predicted letter from the model's output.
It looks for a letter (A, B, C, D) immediately following 'Answer:'.
"""
# Look for "Answer: X" and capture X
match = re.search(r"Answer:\s*([ABCD])", output_text.strip(), re.IGNORECASE)
if match:
return match.group(1).upper()
# Fallback: if the model just outputs a letter
match = re.search(r"^\s*([ABCD])\b", output_text.strip())
if match:
return match.group(1).upper()
return None
def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
"""
Evaluates a model on a specific subject from a dataset.
"""
gr.Info(f"Loading dataset: {dataset_id} ({subject})...")
try:
# Load the 'test' split as it's standard for MMLU evaluation
dataset = load_dataset(dataset_id, subject, token=HF_TOKEN, split="test")
except Exception as e:
raise RuntimeError(f"Failed to load dataset '{dataset_id}' for subject '{subject}'. Error: {e}")
# Shuffle and select a subset of samples for evaluation
num_samples = min(sample_count, len(dataset))
dataset = dataset.shuffle(seed=42).select(range(num_samples))
correct_predictions = 0
results_details = []
for item in progress.tqdm(dataset, desc=f"Evaluating {subject}"):
prompt, correct_answer_idx = format_prompt(item)
expected_letter = get_choice_letter(correct_answer_idx)
# The generated text is often just after the prompt. We need to slice it.
full_prompt_text = generator.tokenizer.decode(generator.tokenizer.encode(prompt), skip_special_tokens=True)
# Generate a short response, aiming for a single letter answer.
# do_sample=False (greedy decoding) is crucial for reproducibility.
raw_output = generator(prompt, max_new_tokens=5, do_sample=False, pad_token_id=generator.tokenizer.eos_token_id)[0]["generated_text"]
# Isolate the newly generated part
generated_text_only = raw_output[len(full_prompt_text):].strip()
predicted_letter = extract_predicted_letter(generated_text_only)
is_correct = (predicted_letter == expected_letter)
if is_correct:
correct_predictions += 1
results_details.append({
"Question": item['question'],
"Correct": "βœ…" if is_correct else "❌",
"Expected": expected_letter,
"Predicted": predicted_letter or "N/A",
"Model Output": generated_text_only
})
accuracy = (correct_predictions / num_samples) * 100 if num_samples > 0 else 0
return accuracy, results_details
@spaces.GPU()
def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress(track_tqdm=True)):
"""
Main function to orchestrate the entire evaluation process.
Handles single subject or 'ALL' subjects evaluation.
Returns a dictionary of Gradio updates.
"""
try:
gr.Info("Starting evaluation...")
generator = load_model(model_id)
dataset_id = BENCHMARK_MAP.get(benchmark_category)
if not dataset_id:
raise ValueError(f"Invalid benchmark category: {benchmark_category}")
all_results_details = []
summary_lines = []
total_correct = 0
total_samples = 0
subjects_to_run = []
if subject_name == "ALL":
# Exclude the "ALL" placeholder from the list of subjects to run
subjects_to_run = [s for s in ALL_BENCHMARK_SUBJECTS.get(benchmark_category, []) if s != "ALL"]
else:
subjects_to_run = [subject_name]
if not subjects_to_run:
gr.Warning(f"No subjects found for '{benchmark_category}'.")
# Return an empty but valid structure
return {
result_summary_output: gr.update(value="No subjects found to evaluate.", visible=True),
error_box: gr.update(visible=False),
details_box: gr.update(visible=False),
}
for i, subject in enumerate(subjects_to_run):
gr.Info(f"Evaluating {benchmark_category} - {subject} ({i+1}/{len(subjects_to_run)})...")
try:
accuracy, subject_details = evaluate_single_subject(generator, dataset_id, subject, sample_count, progress)
all_results_details.extend(subject_details)
num_correct = sum(1 for d in subject_details if d['Correct'] == "βœ…")
num_evaluated = len(subject_details)
total_correct += num_correct
total_samples += num_evaluated
summary_lines.append(f"- **{subject}**: {accuracy:.2f}% ({num_correct}/{num_evaluated})")
except Exception as e:
error_trace = traceback.format_exc()
gr.Error(f"Skipping {subject} due to an error: {e}")
summary_lines.append(f"- **{subject}**: Evaluation failed. See logs for details:\n```\n{error_trace}\n```")
continue
overall_accuracy = (total_correct / total_samples) * 100 if total_samples > 0 else 0
# --- Prepare Outputs ---
if subject_name == "ALL":
result_summary = f"### Overall Average Accuracy: {overall_accuracy:.2f}%\n"
result_summary += f"across {total_samples:,} total samples from {len(subjects_to_run)} subjects.\n\n---\n\n**Breakdown by Subject:**\n"
result_summary += "\n".join(summary_lines)
else:
result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
result_summary += f"({total_correct:,}/{total_samples:,} correct)"
# Save results for leaderboard
record = {
"model_id": model_id,
"benchmark": benchmark_category,
"accuracy": overall_accuracy,
"subject": subject_name, # Record if it was an 'ALL' run
"sample_count": total_samples,
"timestamp": datetime.now().isoformat()
}
with open(EVAL_FILE, "a") as f:
f.write(json.dumps(record) + "\n")
gr.Info("Evaluation completed successfully!")
df_details = pd.DataFrame(all_results_details)
# Return a dictionary of component updates
return {
result_summary_output: gr.update(value=result_summary, visible=True),
error_box: gr.update(visible=False),
details_box: gr.update(visible=True),
detailed_results_df: gr.update(value=df_details)
}
except Exception as e:
error_message = f"An unexpected error occurred during setup: {e}"
error_details = traceback.format_exc()
gr.Error(error_message)
return {
result_summary_output: gr.update(visible=False),
error_box: gr.update(visible=True),
error_output: gr.update(value=error_message),
error_details_output: gr.update(value=error_details),
details_box: gr.update(visible=False)
}
# --- UI Helper Functions ---
def update_subject_dropdown(benchmark_category):
"""Updates the subject dropdown choices based on the selected benchmark."""
choices = ALL_BENCHMARK_SUBJECTS.get(benchmark_category, [])
default_value = "ALL" if "ALL" in choices else (choices[0] if choices else None)
return gr.update(choices=choices, value=default_value)
def load_leaderboard(benchmark_filter, progress=gr.Progress()):
"""
Loads and processes evaluation data to display on the leaderboard.
It now correctly averages scores for models that were evaluated on 'ALL' subjects.
"""
progress(0, desc="Loading Leaderboard...")
try:
if not os.path.exists(EVAL_FILE):
return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
df = pd.read_json(EVAL_FILE, lines=True)
if df.empty:
return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
# Coerce accuracy to numeric and filter valid entries
df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
df.dropna(subset=['accuracy'], inplace=True)
# Filter by the selected benchmark (e.g., MMLU or MMLU-Pro)
df_filtered = df[(df['benchmark'] == benchmark_filter) & (df['subject'] == 'ALL')].copy()
if df_filtered.empty:
return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
# Find the latest evaluation for each model
df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])
latest_evals = df_filtered.loc[df_filtered.groupby('model_id')['timestamp'].idxmax()].copy()
leaderboard_df = latest_evals.sort_values(by="accuracy", ascending=False).copy()
# Add Rank
leaderboard_df.insert(0, 'Rank', range(1, len(leaderboard_df) + 1))
# Rename and format columns
leaderboard_df.rename(columns={
'model_id': 'Model ID',
'accuracy': 'Avg. Accuracy (%)',
'sample_count': 'Total Samples',
'timestamp': 'Date'
}, inplace=True)
leaderboard_df['Avg. Accuracy (%)'] = leaderboard_df['Avg. Accuracy (%)'].map('{:.2f}'.format)
leaderboard_df['Date'] = leaderboard_df['Date'].dt.strftime('%Y-%m-%d')
progress(1, desc="Done.")
return leaderboard_df[['Rank', 'Model ID', 'Avg. Accuracy (%)', 'Total Samples', 'Date']]
except Exception as e:
gr.Error(f"Error loading leaderboard: {e}")
traceback.print_exc()
return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
# --- Gradio Interface Definition ---
# Black/Orange Theme and bigger to fit screen
custom_css = """
/* --- Global & Layout (Bigger to fit screen) --- */
body { font-family: 'Inter', sans-serif; background-color: #1a1a1a; color: #f0f0f0; } /* Dark background, light text */
.gradio-container { max-width: 95% !important; margin: auto; padding: 20px; } /* Wider container */
.gr-group {
border-radius: 12px !important;
box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important; /* Darker shadow */
border: 1px solid #333 !important; /* Darker border */
background-color: #2a2a2a; /* Darker group background */
}
.gr-panel {
border-radius: 12px !important;
box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important;
border: 1px solid #333 !important;
background-color: #2a2a2a;
}
/* --- Typography (Orange Hues) --- */
h1 { text-align: center; font-size: 3rem !important; font-weight: 800; color: #ff8c00; margin-bottom: 0.5rem; letter-spacing: -1.5px; } /* Orange title */
h3, h4 { color: #ffa500; } /* Orange headings */
.subtitle { text-align: center; color: #cccccc; font-size: 1.2rem; margin-bottom: 2.5rem; max-width: 900px; margin-left: auto; margin-right: auto;}
label { color: #f0f0f0 !important; } /* Label text color */
/* --- Tabs --- */
.gradio-tabs { background-color: #2a2a2a; border-radius: 12px; }
.gradio-tab-item { color: #f0f0f0; }
.gradio-tabs button {
background-color: #3a3a3a !important;
color: #f0f0f0 !important;
border-radius: 8px 8px 0 0 !important;
transition: all 0.3s ease;
}
.gradio-tabs button.selected {
background-color: #ff8c00 !important; /* Orange selected tab */
color: #1a1a1a !important; /* Dark text on orange */
font-weight: 700;
}
.gradio-tabs button:hover { background-color: #555 !important; }
/* --- Inputs --- */
.gr-textbox, .gr-dropdown, .gr-slider {
background-color: #3a3a3a !important;
color: #f0f0f0 !important;
border: 1px solid #555 !important;
border-radius: 8px !important;
}
.gr-textbox textarea, .gr-textbox input, .gr-dropdown input {
color: #f0f0f0 !important;
}
.gr-textbox.gr-text-input:focus-within {
border-color: #ff8c00 !important; /* Orange focus border */
box-shadow: 0 0 0 2px rgba(255, 140, 0, 0.5) !important;
}
/* --- Buttons --- */
.gr-button { font-weight: 600 !important; transition: all 0.2s ease; border-radius: 8px !important; }
.gr-button-primary {
background-color: #ff8c00 !important; /* Orange primary button */
color: #1a1a1a !important;
box-shadow: 0 4px 10px rgba(255, 140, 0, 0.3);
border: none;
}
.gr-button-primary:hover {
transform: translateY(-2px);
box-shadow: 0 6px 15px rgba(255, 140, 0, 0.5);
background-color: #ffa500 !important; /* Slightly lighter orange on hover */
}
.gr-button-secondary {
background-color: #444 !important;
color: #f0f0f0 !important;
border: 1px solid #555 !important;
}
.gr-button-secondary:hover {
background-color: #555 !important;
}
/* --- Custom Radio Buttons (Segmented Control) --- */
#leaderboard-toggle-group { display: flex; justify-content: center; align-items: center; gap: 1rem; margin-bottom: 1.5rem; }
#leaderboard-toggle { background-color: #3a3a3a; padding: 5px; border-radius: 10px; display: inline-flex; border: 1px solid #555; }
#leaderboard-toggle div.gr-form { display: flex; gap: 5px; }
#leaderboard-toggle input[type='radio'] { display: none; }
#leaderboard-toggle label {
padding: 8px 16px;
border-radius: 8px;
cursor: pointer;
transition: all 0.3s ease;
font-weight: 500;
color: #f0f0f0;
background: transparent;
border: none;
box-shadow: none;
}
#leaderboard-toggle input[type='radio']:checked + label {
background-color: #ff8c00; /* Orange selected */
color: #1a1a1a;
font-weight: 600;
box-shadow: 0 2px 5px rgba(255, 140, 0, 0.3);
}
#leaderboard-toggle label:hover {
background-color: #555;
}
/* --- Dataframe / Table Styling --- */
.leaderboard-table .gr-dataframe table { border-collapse: collapse; width: 100%; }
.leaderboard-table .gr-dataframe thead th {
background-color: #3a3a3a !important;
color: #ffa500 !important; /* Orange headers */
font-weight: 600 !important;
text-align: left;
padding: 12px 15px;
border-bottom: 2px solid #555;
}
.leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) { background-color: #2f2f2f; } /* Alternating row color */
.leaderboard-table .gr-dataframe tbody tr:hover { background-color: #4a4a4a; } /* Hover effect */
.leaderboard-table .gr-dataframe tbody td {
padding: 12px 15px;
border-bottom: 1px solid #3a3a3a;
color: #f0f0f0;
}
.leaderboard-table .gr-dataframe tbody td:first-child { font-weight: 700; color: #ffcc99; } /* Lighter orange for rank */
/* --- Error & Result Panes --- */
#error-display-box {
background-color: #4a1e1e !important; /* Dark red for error */
border-color: #8c2f2f !important;
color: #ffc9c9 !important; /* Lighter red text */
}
#result-summary-box {
background-color: #1e3a2a !important; /* Dark green for success */
border-color: #2f8c4a !important;
color: #c9ffc9 !important; /* Lighter green text */
}
.gr-markdown p { color: #f0f0f0 !important; } /* Ensure markdown paragraph text is visible */
.gr-markdown strong { color: #ffa500 !important; } /* Strong text in orange */
.gradio-message { background-color: #ff8c00 !important; color: #1a1a1a !important; border: 1px solid #ff8c00 !important; } /* Gradio Info messages */
"""
with gr.Blocks(theme=gr.themes.Base(), css=custom_css) as demo:
gr.Markdown("<h1>πŸ† Open LLM Evaluator</h1>")
gr.Markdown("<p class='subtitle'>Benchmark leading models on MMLU. Your results contribute to a live leaderboard. Select a benchmark and run an evaluation, or view the current standings.</p>")
with gr.Tabs() as tabs:
# --- Leaderboard Tab ---
with gr.TabItem("πŸ“Š Leaderboard", id=0):
with gr.Column():
with gr.Row(elem_id="leaderboard-toggle-group"):
# Temporarily remove MMLU-Pro from radio options
leaderboard_type_toggle = gr.Radio(
["MMLU"],
label="Select Benchmark",
value="MMLU",
interactive=True,
elem_id="leaderboard-toggle",
container=False,
show_label=False,
)
refresh_button = gr.Button("πŸ”„ Refresh", size="sm")
leaderboard_table_output = gr.DataFrame(
headers=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"],
interactive=False,
datatype=["number", "str", "str", "number", "str"],
row_count=15, # Adjusted for more rows
elem_classes="leaderboard-table",
# Removed col_count to allow dynamic width
)
# --- Evaluation Tab ---
with gr.TabItem("πŸš€ Run Evaluation", id=1):
with gr.Row(variant='panel'):
with gr.Column(scale=2):
with gr.Group():
gr.Markdown("### 1. Configure Evaluation")
model_id_input = gr.Textbox(
label="Hugging Face Model ID",
placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
interactive=True,
scale=2 # Increased scale for textbox
)
# Temporarily remove MMLU-Pro from radio options
benchmark_selection_radio = gr.Radio(
["MMLU"],
label="Benchmark",
value="MMLU",
interactive=True,
)
with gr.Row():
benchmark_subject_dropdown = gr.Dropdown(
label="Subject",
# Ensure only MMLU subjects are fetched
choices=ALL_BENCHMARK_SUBJECTS.get("MMLU", []),
value="ALL",
interactive=True
)
sample_count_slider = gr.Slider(
label="Samples per Subject",
minimum=5, maximum=100, value=25, step=5, interactive=True
)
run_button = gr.Button("Start Evaluation", variant="primary", scale=1)
with gr.Column(scale=3):
gr.Markdown("### 2. View Results")
# Panel for displaying the summary of results
with gr.Group(visible=False) as result_summary_box:
result_summary_output = gr.Markdown(elem_id="result-summary-box")
# Panel for displaying errors
with gr.Group(visible=False) as error_box:
error_output = gr.Textbox(label="Error Message", interactive=False, elem_id="error-display-box")
error_details_output = gr.Textbox(label="Error Details (Traceback)", interactive=False, lines=8)
# Panel for detailed, row-by-row results
with gr.Group(visible=False) as details_box:
gr.Markdown("#### Detailed Evaluation Log")
detailed_results_df = gr.DataFrame(
headers=["Question", "Correct", "Expected", "Predicted", "Model Output"],
datatype=["str", "str", "str", "str", "str"],
interactive=False,
row_count=10, # Adjusted for more rows
# Removed col_count to allow dynamic width
wrap=True,
)
# --- Event Handlers & Logic ---
# Update subject dropdown when benchmark type changes
benchmark_selection_radio.change(
fn=update_subject_dropdown,
inputs=[benchmark_selection_radio],
outputs=[benchmark_subject_dropdown]
)
# Main evaluation trigger
run_button.click(
fn=run_evaluation,
inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider],
outputs=[result_summary_output, error_box, error_output, error_details_output, details_box, detailed_results_df]
).then(
# After evaluation, switch to the leaderboard tab and refresh it
lambda: gr.update(selected=0), outputs=[tabs]
).then(
load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output]
)
# Leaderboard loading logic
demo.load(
fn=load_leaderboard,
inputs=[leaderboard_type_toggle],
outputs=[leaderboard_table_output]
)
leaderboard_type_toggle.change(
fn=load_leaderboard,
inputs=[leaderboard_type_toggle],
outputs=[leaderboard_table_output],
show_progress='minimal'
)
refresh_button.click(
fn=load_leaderboard,
inputs=[leaderboard_type_toggle],
outputs=[leaderboard_table_output],
show_progress='full'
)
# Launch the Gradio app
if __name__ == "__main__":
demo.launch(debug=True)