# SuperBench-Eval / app.py
import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset, get_dataset_config_names
import torch
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
import traceback # Import traceback for detailed error logging
# Cache to avoid reloading the model
model_cache = {}
HF_TOKEN = os.environ.get("HF_TOKEN")
# --- Constants for Benchmarks ---
MMLU_DATASET = "cais/mmlu"
MMLU_PRO_DATASET = "cais/mmlu_pro"
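# NOTE (assumption): MMLU-Pro is commonly hosted on the Hub as "TIGER-Lab/MMLU-Pro" and uses a
# different schema (up to 10 options via "options"/"answer_index"); if "cais/mmlu_pro" is not
# accessible, switching datasets would also require adapting format_prompt() below.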
# Humanity's Last Exam is a composite benchmark, not a single dataset readily available like MMLU/MMLU-Pro.
# For this implementation, we will focus on MMLU and MMLU-Pro, which are direct datasets.
# Integrating HLE would require evaluating across multiple specific datasets.
def get_all_benchmark_options():
"""
Dynamically fetches all available subjects for MMLU and MMLU-Pro.
Returns a dictionary mapping benchmark dataset IDs to their subjects,
and a flattened list suitable for a Gradio dropdown.
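Illustrative return shape (subject names are examples only):
    all_options         -> {"cais/mmlu": ["ALL", "abstract_algebra", ...], ...}
    gr_dropdown_options -> ["MMLU - ALL", "MMLU - abstract_algebra", ...]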
"""
all_options = {}
gr_dropdown_options = []
# Get subjects for MMLU
try:
mmlu_subjects = get_dataset_config_names(MMLU_DATASET, token=HF_TOKEN)
all_options[MMLU_DATASET] = ["ALL"] + mmlu_subjects
gr_dropdown_options.extend([f"MMLU - {s}" for s in all_options[MMLU_DATASET]])
except Exception as e:
print(f"Warning: Could not load MMLU dataset configs. Error: {e}")
all_options[MMLU_DATASET] = []
# Get subjects for MMLU-Pro
try:
mmlu_pro_subjects = get_dataset_config_names(MMLU_PRO_DATASET, token=HF_TOKEN)
all_options[MMLU_PRO_DATASET] = ["ALL"] + mmlu_pro_subjects
gr_dropdown_options.extend([f"MMLU-Pro - {s}" for s in all_options[MMLU_PRO_DATASET]])
except Exception as e:
print(f"Warning: Could not load MMLU-Pro dataset configs. It might not be accessible or available. Error: {e}")
all_options[MMLU_PRO_DATASET] = []
return all_options, gr_dropdown_options
# Initialize these once globally when the app starts
ALL_BENCHMARK_SUBJECTS, GRADIO_DROPDOWN_OPTIONS = get_all_benchmark_options()
def load_model(model_id):
"""
Loads a Hugging Face model and its tokenizer, then creates a text-generation pipeline.
Uses a cache to avoid re-loading if the model is already in memory.
Provides Gradio Info/Error messages for user feedback.
Raises an exception if model loading fails.
"""
gr.Info(f"Attempting to load model: {model_id}...")
if model_id in model_cache:
gr.Info(f"Model '{model_id}' already loaded from cache.")
return model_cache[model_id]
try:
# Load tokenizer and model, using bfloat16 if CUDA is available for efficiency
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
model_id,
token=HF_TOKEN,
torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
).to("cuda" if torch.cuda.is_available() else "cpu")
# Create a text-generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
# Cache the loaded generator
model_cache[model_id] = generator
gr.Info(f"Model '{model_id}' loaded successfully.")
return generator
except Exception as e:
# Re-raise the exception to be caught by the outer run_evaluation try-except
raise ValueError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token. Error: {e}")
def format_prompt(item):
"""
Formats a single MMLU-style question item (a 'question', four 'choices', and an integer 'answer' index) into a clear prompt for the LLM.
The prompt is designed so the model outputs a single-letter answer (A, B, C, or D).
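Illustrative rendered prompt (question and choices are made up):
    What is 2 + 2?
    A. 3
    B. 4
    C. 5
    D. 6
    Answer: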
"""
prompt = f"""{item['question']}
A. {item['choices'][0]}
B. {item['choices'][1]}
C. {item['choices'][2]}
D. {item['choices'][3]}
Answer:"""
return prompt, item['answer'] # Returns the prompt string and the correct choice index (0-3)
def extract_choice_letter(output):
"""
Extracts the most likely choice letter (A, B, C, D) from the model's generated output.
It prioritizes an exact match after "Answer:", then looks for any single capital letter.
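Doctest-style examples (illustrative):
    >>> extract_choice_letter("Answer: B")
    'B'
    >>> extract_choice_letter(" C")
    'C'
    >>> extract_choice_letter("no letter here") is None
    True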
"""
# Look for "Answer: X" pattern first (e.g., "Answer: A" or "Answer: B")
match = re.search(r"Answer:\s*([ABCD])", output, re.IGNORECASE) # Added IGNORECASE for robustness
if match:
return match.group(1).upper() # Ensure it's uppercase
# Fallback: look for a single capital letter A-D anywhere in the output
match = re.search(r"\b([ABCD])\b", output.strip())
if match:
return match.group(1)
return None # Return None if no valid choice letter is found
def get_choice_letter(index):
"""Converts a numerical choice index (0-3) to a capital letter (A-D)."""
if 0 <= index <= 3:
return chr(ord('A') + index)
return None # Return None for invalid indices
def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
"""
Evaluates a given model generator on a specific subject from a specified dataset.
Args:
generator: The Hugging Face pipeline for text generation.
dataset_id (str): The ID of the dataset (e.g., "cais/mmlu", "cais/mmlu_pro").
subject (str): The specific subject/config name within the dataset.
sample_count (int): The maximum number of samples to evaluate.
progress (gr.Progress): Gradio progress tracker.
Returns:
tuple: (accuracy, list_of_detailed_results)
Raises:
Exception: If dataset loading fails.
"""
gr.Info(f"Loading dataset: {dataset_id} - {subject}...")
try:
# Load the "test" split of the dataset
dataset = load_dataset(dataset_id, subject, token=HF_TOKEN)["test"]
except Exception as e:
# Re-raise the exception to be caught by the outer run_evaluation try-except
raise RuntimeError(f"Failed to load dataset '{dataset_id}' for subject '{subject}'. Error: {e}")
# Limit the number of samples and shuffle for consistent evaluation across runs
num_samples_to_evaluate = min(sample_count, len(dataset))
dataset = dataset.shuffle(seed=42).select(range(num_samples_to_evaluate))
correct_count = 0
subject_results = []
# Iterate through the selected samples with a progress bar
for i, item in enumerate(progress.tqdm(dataset, desc=f"Processing {subject} samples")):
prompt, answer_idx = format_prompt(item)
expected_letter = get_choice_letter(answer_idx)
# Generate only 1 new token for the answer (A, B, C, D)
# do_sample=False ensures deterministic output for a given prompt (greedy decoding)
# return_full_text=False keeps only the newly generated continuation, so the answer
# extraction below cannot accidentally match choice letters that appear in the prompt itself
output_raw = generator(prompt, max_new_tokens=1, do_sample=False, return_full_text=False)[0]["generated_text"]
# Check for potential reasoning model output
is_reasoning_model_output = '<' in output_raw or re.search(r"\b(because|therefore|thus|reasoning)\b", output_raw, re.IGNORECASE) is not None
# Extract the predicted letter from the model's raw output
predicted_letter = extract_choice_letter(output_raw)
is_correct = (predicted_letter == expected_letter)
correct_count += is_correct
# Store detailed results for logging and display
subject_results.append({
"question": item['question'],
"choices": item['choices'],
"model_raw_output": output_raw.strip(),
"expected_answer_letter": expected_letter,
"predicted_answer_letter": predicted_letter,
"is_correct": is_correct,
"is_reasoning_model_output": is_reasoning_model_output # Store the flag
})
# Calculate accuracy for the current subject
accuracy = (correct_count / len(dataset)) * 100 if len(dataset) > 0 else 0
return accuracy, subject_results
def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=gr.Progress()):
"""
Main function to orchestrate the evaluation process.
Handles single subject or 'ALL' subjects evaluation for MMLU/MMLU-Pro.
Returns Gradio.update objects to control UI component visibility and content.
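The returned 6-tuple maps, in order, to the outputs wired in run_button.click:
    acc_output, error_message_output, debug_error_column,
    show_details_button, download_button, detail_output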
"""
gr.Info("Starting evaluation...")
if not model_id:
gr.Warning("Please enter a Hugging Face Model ID before running the evaluation.")
# Return updates to hide logs/debug and show empty results
return "", gr.update(value="", visible=False), gr.update(visible=False), \
gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
# Parse the selected benchmark and subject from the dropdown string
parts = selected_benchmark_subject.split(" - ")
if len(parts) != 2:
gr.Error("Invalid benchmark selection format. Please select from the dropdown.")
return "", gr.update(value="", visible=False), gr.update(visible=False), \
gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
benchmark_name = parts[0]
subject_name = parts[1]
dataset_id_map = {
"MMLU": MMLU_DATASET,
"MMLU-Pro": MMLU_PRO_DATASET
}
current_dataset_id = dataset_id_map.get(benchmark_name)
if not current_dataset_id:
gr.Error(f"Unknown benchmark selected: {benchmark_name}. This should not happen.")
return "", gr.update(value="", visible=False), gr.update(visible=False), \
gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
try:
generator = load_model(model_id) # This function will raise an exception on failure
all_evaluation_results = []
total_correct_overall = 0
total_samples_overall = 0
eval_summary_lines = []
if subject_name == "ALL":
# Copy the subject list (minus the "ALL" placeholder) so the shared global list is not mutated
subjects_to_evaluate = [s for s in ALL_BENCHMARK_SUBJECTS.get(current_dataset_id, []) if s != "ALL"]
if not subjects_to_evaluate:
gr.Warning(f"No subjects found to evaluate for '{benchmark_name}'.")
return "", gr.update(value="", visible=False), gr.update(visible=False), \
gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
for i, sub in enumerate(progress.tqdm(subjects_to_evaluate, desc=f"Evaluating ALL {benchmark_name} subjects")):
gr.Info(f"Evaluating {benchmark_name} - {sub} ({i+1}/{len(subjects_to_evaluate)})...")
try:
accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, sub, sample_count, progress)
all_evaluation_results.extend(subject_details)
num_evaluated_samples = len(subject_details)
num_correct_in_subject = sum(d['is_correct'] for d in subject_details)
total_correct_overall += num_correct_in_subject
total_samples_overall += num_evaluated_samples
eval_summary_lines.append(f"- {benchmark_name} - {sub}: {accuracy:.2f}% ({num_correct_in_subject}/{num_evaluated_samples} samples)")
except Exception as e:
gr.Error(f"Skipping {benchmark_name} - {sub} due to an error: {e}")
eval_summary_lines.append(f"- {benchmark_name} - {sub}: Error during evaluation.")
continue
overall_accuracy = (total_correct_overall / total_samples_overall) * 100 if total_samples_overall > 0 else 0
score_string = f"Overall Average Accuracy for {benchmark_name}: {overall_accuracy:.2f}% across {total_samples_overall} total samples.\n\n"
score_string += "Detailed breakdown:\n" + "\n".join(eval_summary_lines)
else:
accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, subject_name, sample_count, progress)
all_evaluation_results.extend(subject_details)
overall_accuracy = accuracy
num_evaluated_samples = len(subject_details)
score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."
# Format detailed results for display in the detailed-logs text box
formatted_details = "\n\n".join([
(
f"### Question:\n{item['question']}\n\n"
+ f"**Choices:**\n" + "\n".join([f"{get_choice_letter(i)}. {c}" for i, c in enumerate(item['choices'])]) + "\n\n"
+ (f"**Note:** Reasoning models are currently not fully supported for single-letter extraction. The original model output followed:\n" if item.get('is_reasoning_model_output') else "")
+ f"**Model Raw Output:** {item['model_raw_output']}\n"
+ f"**Expected Answer:** {item['expected_answer_letter']}\n"
+ f"**Predicted Answer:** {item['predicted_answer_letter']}\n"
+ f"**Correct:** {'Yes' if item['is_correct'] else 'No'}"
)
for item in all_evaluation_results
])
# Record the evaluation result to a JSONL file for the leaderboard
record = {
"model_id": model_id,
"benchmark": benchmark_name,
"subject": subject_name,
"accuracy": overall_accuracy,
"sample_count": total_samples_overall if subject_name == "ALL" else len(all_evaluation_results),
"timestamp": pd.Timestamp.now().isoformat()
}
with open("eval.jsonl", "a") as f:
f.write(json.dumps(record) + "\n")
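# Example of a single eval.jsonl record (values are illustrative):
# {"model_id": "mistralai/Mistral-7B-Instruct-v0.2", "benchmark": "MMLU", "subject": "ALL",
#  "accuracy": 62.5, "sample_count": 570, "timestamp": "2024-01-01T12:00:00"}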
gr.Info("Evaluation completed successfully!")
return score_string, \
gr.update(value="", visible=False), gr.update(visible=False), \
gr.update(visible=True), gr.update(visible=True), gr.update(value=formatted_details, visible=False)
except Exception as e:
error_message = str(e)
detailed_error_traceback = traceback.format_exc()
gr.Error("An error occurred during evaluation.")
# Return updates for error state
return "Error occurred during evaluation. We'll evaluate for you! If this persists, please open a community support tab for assistance.", \
gr.update(value=detailed_error_traceback, visible=True), gr.update(visible=True), \
gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
def save_text(text_content):
"""Saves the provided text content to a file and returns the file path for download."""
if not text_content:
gr.Warning("No evaluation results to download.")
return None
file_path = "evaluation_results.txt"
try:
with open(file_path, "w") as f:
f.write(text_content)
return file_path
except Exception as e:
gr.Error(f"Error saving file: {e}")
return None
def load_leaderboard():
"""
Loads evaluation data from 'eval.jsonl', computes average accuracy per model,
and prepares data for the leaderboard plot and table.
"""
try:
# Read the JSONL file into a pandas DataFrame
df = pd.read_json("eval.jsonl", lines=True)
# Calculate average accuracy per model across all recorded evaluations
df_avg = df.groupby("model_id")["accuracy"].mean().reset_index()
df_avg.columns = ["Model ID", "Average Accuracy (%)"]
# Sort models by average accuracy in descending order
df_sorted = df_avg.sort_values(by="Average Accuracy (%)", ascending=False)
# Select top 10 models for the bar chart
top_models = df_sorted.head(10)
# Create the matplotlib plot
fig, ax = plt.subplots(figsize=(10, 6)) # Adjust figure size for better readability
# For horizontal bars, it's often better to plot data sorted in ascending order
# so the highest bar appears at the top of the chart.
top_models_plot = top_models.sort_values(by="Average Accuracy (%)", ascending=True)
ax.barh(top_models_plot['Model ID'], top_models_plot['Average Accuracy (%)'], color='#007bff') # Use a nice blue color
ax.set_xlabel("Average Accuracy (%)", fontsize=12)
ax.set_ylabel("Model ID", fontsize=12)
ax.set_title("Top 10 Models by Average MMLU/MMLU-Pro Accuracy", fontsize=14)
ax.set_xlim(0, 100) # Ensure accuracy scale is 0-100%
ax.tick_params(axis='x', labelsize=10)
ax.tick_params(axis='y', labelsize=10)
ax.grid(axis='x', linestyle='--', alpha=0.7) # Add grid lines
plt.tight_layout() # Adjust layout to prevent labels overlapping
# Return the figure and the sorted dataframe as a list of dictionaries for Gradio Dataframe
return fig, df_sorted.to_dict('records')
except (FileNotFoundError, ValueError): # some pandas versions raise ValueError when the file does not exist
gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!")
return plt.figure(), pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
except Exception as e:
gr.Error(f"Error loading leaderboard: {e}")
# Return an empty plot and dataframe in case of any other error
return plt.figure(), pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
# --- Gradio Interface Definition ---
with gr.Blocks(css="""
/* General body and container styling */
body { font-family: 'Inter', sans-serif; background-color: #f0f2f5; margin: 0; padding: 20px; }
.gradio-container {
max-width: 1200px;
margin: 20px auto;
padding: 30px;
box-shadow: 0 8px 16px rgba(0,0,0,0.15);
border-radius: 12px;
background-color: #ffffff;
border: 1px solid #e0e0e0;
}
/* Headings */
h1 {
color: #2c3e50;
text-align: center;
margin-bottom: 30px;
font-size: 2.5em;
font-weight: 700;
letter-spacing: -0.02em;
}
h3 { color: #34495e; font-size: 1.2em; margin-bottom: 10px; }
/* Markdown text */
.markdown-text { text-align: center; color: #555; line-height: 1.6; }
.markdown-text div { font-size: 1.1em; }
/* Buttons */
.gr-button {
background-color: #007bff; /* Primary blue */
color: white;
border: none;
padding: 12px 25px;
border-radius: 8px;
cursor: pointer;
transition: background-color 0.3s ease, transform 0.2s ease;
font-size: 1.1em;
font-weight: 600;
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
}
.gr-button:hover {
background-color: #0056b3; /* Darker blue on hover */
transform: translateY(-2px); /* Slight lift effect */
}
.gr-button:active {
transform: translateY(0);
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
/* Specific button styling for debug/show details */
#debug-button, #show-details-button {
background-color: #6c757d; /* Grey for secondary actions */
}
#debug-button:hover, #show-details-button:hover {
background-color: #5a6268;
}
#download-button {
background-color: #28a745; /* Green for download */
}
#download-button:hover {
background-color: #218838;
}
/* Input/Output Boxes */
.gr-box {
border: 1px solid #dee2e6;
border-radius: 10px;
padding: 20px;
margin-bottom: 20px;
background-color: #fdfdfd;
box-shadow: inset 0 1px 3px rgba(0,0,0,0.05);
}
.gr-output-text {
white-space: pre-wrap;
word-wrap: break-word;
background-color: #f9f9fb;
border: 1px solid #e9ecef;
border-radius: 8px;
padding: 15px;
min-height: 100px; /* Ensure a minimum height */
}
/* Specific error output style */
#error-message-output {
background-color: #ffe0e0;
border-color: #ff9999;
color: #cc0000;
}
/* Labels for inputs */
.gr-textbox label, .gr-dropdown label, .gr-slider label {
font-weight: 600;
color: #495057;
margin-bottom: 8px;
display: block;
font-size: 1em;
}
/* Tab styling */
.gr-tab-item { padding: 25px; } /* More padding inside tabs */
.gr-tabs-nav button {
font-weight: 600;
font-size: 1.1em;
padding: 10px 20px;
border-top-left-radius: 8px;
border-top-right-radius: 8px;
}
""") as demo:
gr.Markdown("""
# πŸ€– LLM Benchmark Evaluator
""")
with gr.Tabs():
with gr.TabItem("πŸš€ Run Evaluation"):
gr.Markdown("""
<div style="text-align: center; margin-bottom: 20px; color: #666; font-size: 1.1em;">
Enter your Hugging Face Model ID, choose a benchmark (MMLU or MMLU-Pro),
select a subject (or 'ALL' for a comprehensive evaluation),
and specify the number of samples per subject.
</div>
""")
with gr.Column(elem_classes="gr-box"):
model_id_input = gr.Textbox(
label="Your Hugging Face Model ID",
placeholder="e.g., mistralai/Mistral-7B-Instruct-v0.2",
interactive=True
)
with gr.Row():
benchmark_subject_dropdown = gr.Dropdown(
label="Choose Benchmark and Subject",
choices=GRADIO_DROPDOWN_OPTIONS,
value="MMLU - ALL", # Default to MMLU ALL for initial load
interactive=True,
min_width=400 # Ensure sufficient width
)
sample_count_slider = gr.Slider(
label="Number of Samples per Subject (1-100)",
minimum=1,
maximum=100,
value=10, # Default to 10 samples
step=1,
interactive=True,
min_width=200
)
run_button = gr.Button("πŸš€ Run Evaluation", elem_classes="gr-button")
with gr.Column(elem_classes="gr-box"):
acc_output = gr.Textbox(
label="Benchmark Accuracy Results",
interactive=False,
elem_classes="gr-output-text",
lines=5,
placeholder="Evaluation results will appear here."
)
# Container for debug info, initially hidden
with gr.Column(visible=False, elem_id="debug-error-column") as debug_error_column:
error_message_output = gr.Textbox(
label="Debug Information (Error Details)",
lines=10, interactive=False, elem_classes="gr-output-text", elem_id="error-message-output",
placeholder="Error details will appear here if an error occurs."
)
debug_button = gr.Button("πŸ› Hide Debug Info", visible=True, elem_id="debug-button", elem_classes="gr-button")
with gr.Row():
show_details_button = gr.Button("πŸ” Show Detailed Logs", visible=False, elem_id="show-details-button", elem_classes="gr-button")
download_button = gr.Button("πŸ“₯ Download Full Evaluation Logs", visible=False, elem_id="download-button", elem_classes="gr-button")
# Detailed output, initially hidden
detail_output = gr.Textbox(
label="Detailed Evaluation Logs",
lines=20,
interactive=False,
elem_classes="gr-output-text",
placeholder="Detailed logs for each question will appear here upon successful evaluation.",
visible=False # Initially hidden
)
# Define button click actions
run_button.click(
run_evaluation,
inputs=[model_id_input, benchmark_subject_dropdown, sample_count_slider],
outputs=[
acc_output,
error_message_output, debug_error_column, # For error state
show_details_button, download_button, detail_output # For success state
]
)
# Note: passing a component as an event input yields its *value*, not its visibility,
# so a gr.State flag is used to track whether the detailed logs are currently shown.
details_visible_state = gr.State(False)
# Toggle visibility of detail_output and swap the button label in a single handler
show_details_button.click(
    lambda visible: (
        gr.update(visible=not visible),
        "πŸ™ˆ Hide Detailed Logs" if not visible else "πŸ” Show Detailed Logs",
        not visible,
    ),
    inputs=[details_visible_state],
    outputs=[detail_output, show_details_button, details_visible_state]
)
# Hide the debug/error column when its button is clicked. The button lives inside the
# column, so once hidden it cannot be re-shown from here; a plain "hide" replaces the toggle.
debug_button.click(
    lambda: gr.update(visible=False),
    inputs=[],
    outputs=[debug_error_column]
)
download_button.click(
save_text,
inputs=[detail_output],
outputs=gr.File(label="Download Evaluation Results", file_count="single", type="filepath")
)
with gr.TabItem("πŸ“Š Leaderboard"):
gr.Markdown("""
<div style="text-align: center; margin-bottom: 20px; color: #666; font-size: 1.1em;">
See how different models perform on average across all evaluated benchmarks.
This leaderboard updates with every new evaluation.
</div>
""")
with gr.Row():
leaderboard_plot_output = gr.Plot(label="Top 10 Models by Average Accuracy", scale=2) # Scale for better visibility
leaderboard_table_output = gr.Dataframe(
headers=["Model ID", "Average Accuracy (%)"],
interactive=False,
datatype=["str", "number"],
row_count=10, # Display top 10 rows initially, but can scroll
col_count=2,
label="Full Leaderboard Data"
)
# Load leaderboard when the tab is selected or when the app loads
demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot_output, leaderboard_table_output])
# Launch the Gradio app
demo.launch()