Spaces:

Enderchef
/

SuperBench-Eval

Running on Zero

App Files Files Community

SuperBench-Eval / app.py

Enderchef

Update app.py

3b51590 verified about 2 months ago

raw

history blame

22.7 kB

	import os
	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
	from datasets import load_dataset, get_dataset_config_names
	import torch
	import re
	import json
	import pandas as pd
	import traceback
	import spaces
	from datetime import datetime

	# --- Environment and Caching ---

	# Directory holding the JSONL file of evaluation records; created eagerly
	# so that later appends never hit a missing folder.
	CACHE_DIR = "evaluation_cache"
	EVAL_FILE = os.path.join(CACHE_DIR, "eval.jsonl")
	os.makedirs(CACHE_DIR, exist_ok=True)

	# In-process caches: loaded pipelines keyed by model ID, and the subject
	# lists keyed by benchmark name.
	model_cache = {}
	benchmark_subject_cache = {}

	# Hugging Face access token, taken from the environment (None when unset).
	HF_TOKEN = os.environ.get("HF_TOKEN")

	# --- Constants for Benchmarks ---
	MMLU_DATASET = "cais/mmlu"
	MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"

	# Benchmark display name -> Hub dataset ID.
	BENCHMARK_MAP = {
	    "MMLU": MMLU_DATASET,
	    "MMLU-Pro": MMLU_PRO_DATASET,
	}

	# --- Data Loading and Preparation ---

	def get_all_benchmark_options():
	    """
	    Return the selectable subjects for every benchmark, keyed by benchmark name.

	    The first call fetches the config names of each dataset in BENCHMARK_MAP
	    and stores them in the module-level ``benchmark_subject_cache``; once that
	    cache is non-empty it is returned as-is. Each list starts with "ALL",
	    followed by the sorted config names (a config literally named 'all' is
	    dropped). When fetching fails for a benchmark, a warning is printed and
	    its list falls back to just ["ALL"].
	    """
	    if not benchmark_subject_cache:
	        print("Fetching benchmark configurations for the first time...")
	        for key, dataset_id in BENCHMARK_MAP.items():
	            try:
	                # The token is forwarded so private datasets can be listed too.
	                config_names = get_dataset_config_names(dataset_id, token=HF_TOKEN)
	                named_subjects = sorted(name for name in config_names if name != 'all')
	                benchmark_subject_cache[key] = ["ALL"] + named_subjects
	            except Exception as e:
	                print(f"Warning: Could not load configs for {key} ({dataset_id}). It might be private or unavailable. Error: {e}")
	                benchmark_subject_cache[key] = ["ALL"]
	        print("Benchmark configurations cached.")
	    return benchmark_subject_cache

	# Fill the subject cache once at import time so the UI can read it directly.
	ALL_BENCHMARK_SUBJECTS = get_all_benchmark_options()


	@spaces.GPU()
	def load_model(model_id):
	    """
	    Load a Hugging Face causal LM and wrap it in a text-generation pipeline.

	    Pipelines are stored in the module-level ``model_cache`` keyed by the
	    whitespace-stripped model ID, so a repeated request returns the cached
	    pipeline instead of loading the model again.

	    Args:
	        model_id: Hub repository ID of the model to load.

	    Returns:
	        The ``transformers`` text-generation pipeline for ``model_id``.

	    Raises:
	        ValueError: If ``model_id`` is empty or contains only whitespace.
	        RuntimeError: If the tokenizer, model or pipeline cannot be created;
	            the underlying exception is chained as ``__cause__``.
	    """
	    # Normalise first so "  org/model " and "org/model" share one cache entry
	    # and a whitespace-only value is rejected like an empty one.
	    model_id = (model_id or "").strip()
	    if not model_id:
	        raise ValueError("Model ID cannot be empty.")

	    gr.Info(f"Attempting to load model: {model_id}...")
	    if model_id in model_cache:
	        gr.Info(f"Model '{model_id}' found in cache.")
	        return model_cache[model_id]

	    try:
	        use_cuda = torch.cuda.is_available()
	        # bfloat16 only when the GPU supports it; otherwise stay in float32.
	        dtype = torch.bfloat16 if use_cuda and torch.cuda.is_bf16_supported() else torch.float32

	        # SECURITY NOTE(review): trust_remote_code=True lets the model
	        # repository execute its own Python code, and model_id is typed in by
	        # the user. Confirm this is acceptable for the deployment.
	        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
	        model = AutoModelForCausalLM.from_pretrained(
	            model_id,
	            token=HF_TOKEN,
	            torch_dtype=dtype,
	            trust_remote_code=True,
	            low_cpu_mem_usage=True,
	        ).to("cuda" if use_cuda else "cpu")

	        generator = pipeline(
	            "text-generation",
	            model=model,
	            tokenizer=tokenizer,
	            device=0 if use_cuda else -1,
	        )

	        model_cache[model_id] = generator
	        gr.Info(f"Model '{model_id}' loaded successfully.")
	        return generator
	    except Exception as e:
	        # Re-raise with a user-oriented message while keeping the original
	        # exception attached for the traceback.
	        raise RuntimeError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token (if required). Error: {e}") from e


	# --- Evaluation Logic ---

	def format_prompt(item):
	    """
	    Build the multiple-choice prompt for one benchmark record.

	    Two record layouts are accepted: a ``choices`` list with the answer in
	    ``answer``, or an ``options`` list with the answer index in
	    ``answer_index``. Every option is lettered from 'A' onwards, so records
	    with more or fewer than four options no longer fail.
	    NOTE(review): the ``options``/``answer_index`` layout is assumed to be the
	    one used by TIGER-Lab/MMLU-Pro - confirm against the dataset card.

	    Args:
	        item: Mapping with 'question', one of 'choices'/'options', and one of
	            'answer_index'/'answer'.

	    Returns:
	        Tuple ``(prompt, answer)`` where ``answer`` is ``item['answer_index']``
	        when that key exists, otherwise ``item['answer']``.

	    Raises:
	        KeyError: If the record has neither 'choices' nor 'options', or lacks
	            'question' or an answer field.
	    """
	    choices = item['choices'] if 'choices' in item else item['options']
	    choice_lines = "\n".join(
	        f"{chr(ord('A') + position)}. {choice}"
	        for position, choice in enumerate(choices)
	    )
	    prompt = f"Question: {item['question']}\n\nChoices:\n{choice_lines}\n\nAnswer:"
	    answer = item['answer_index'] if 'answer_index' in item else item['answer']
	    return prompt, answer

	def get_choice_letter(index):
	    """Map a zero-based choice index in 0..3 to 'A'..'D'; any other index gives None."""
	    if not (0 <= index <= 3):
	        return None
	    return "ABCD"[index]

	# "Answer: X" anywhere in the text, case-insensitive. The trailing \b stops
	# the first letter of a word ("Answer: Both ...") from being read as a choice.
	_ANSWER_LETTER_RE = re.compile(r"Answer:\s*([ABCD])\b", re.IGNORECASE)
	# A bare capital letter at the very start of the output, e.g. "C" or "C. Paris".
	_LEADING_LETTER_RE = re.compile(r"^\s*([ABCD])\b")


	def extract_predicted_letter(output_text):
	    """
	    Extract the predicted choice letter from a model's output.

	    The text is checked for "Answer: X" first and, failing that, for a
	    stand-alone capital letter at the start of the output. In both cases the
	    letter must be a whole word, so "Answer: Both" does not count as "B".

	    Args:
	        output_text: Text generated by the model.

	    Returns:
	        'A', 'B', 'C' or 'D' (upper-case), or None when no letter is found.
	    """
	    text = output_text.strip()
	    for pattern in (_ANSWER_LETTER_RE, _LEADING_LETTER_RE):
	        match = pattern.search(text)
	        if match:
	            return match.group(1).upper()
	    return None

	def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
	    """
	    Evaluate a text-generation pipeline on one subject of a benchmark dataset.

	    The subject's 'test' split is loaded, shuffled with a fixed seed and cut
	    to at most ``sample_count`` items. Each item is turned into a prompt, the
	    model generates a few tokens greedily, and the letter extracted from that
	    output is compared with the expected letter.

	    Args:
	        generator: Text-generation pipeline exposing a ``tokenizer`` attribute.
	        dataset_id: Hub ID of the dataset to load.
	        subject: Dataset config name to evaluate.
	        sample_count: Maximum number of items to evaluate.
	        progress: Object whose ``tqdm`` method wraps the item iteration.

	    Returns:
	        Tuple ``(accuracy, results_details)``: accuracy as a percentage and a
	        list of per-item dicts with the keys "Question", "Correct",
	        "Expected", "Predicted" and "Model Output".

	    Raises:
	        RuntimeError: If the dataset cannot be loaded; the underlying
	            exception is chained as ``__cause__``.
	    """
	    gr.Info(f"Loading dataset: {dataset_id} ({subject})...")
	    try:
	        dataset = load_dataset(dataset_id, subject, token=HF_TOKEN, split="test")
	    except Exception as e:
	        raise RuntimeError(f"Failed to load dataset '{dataset_id}' for subject '{subject}'. Error: {e}") from e

	    # int() guards against a float coming from the UI slider, which range()
	    # would reject.
	    num_samples = min(int(sample_count), len(dataset))
	    dataset = dataset.shuffle(seed=42).select(range(num_samples))

	    pad_token_id = generator.tokenizer.eos_token_id
	    correct_predictions = 0
	    results_details = []

	    for item in progress.tqdm(dataset, desc=f"Evaluating {subject}"):
	        prompt, correct_answer_idx = format_prompt(item)
	        expected_letter = get_choice_letter(correct_answer_idx)

	        # return_full_text=False makes the pipeline return only the newly
	        # generated text. Slicing the prompt off by the length of a
	        # decode(encode(prompt)) round trip is fragile because that round
	        # trip need not reproduce the prompt character for character.
	        # do_sample=False (greedy decoding) keeps runs reproducible.
	        outputs = generator(
	            prompt,
	            max_new_tokens=5,
	            do_sample=False,
	            pad_token_id=pad_token_id,
	            return_full_text=False,
	        )
	        generated_text_only = outputs[0]["generated_text"].strip()

	        predicted_letter = extract_predicted_letter(generated_text_only)
	        # An unparseable prediction must never count as correct, even when the
	        # expected letter is also None (answer index outside A-D).
	        is_correct = predicted_letter is not None and predicted_letter == expected_letter

	        if is_correct:
	            correct_predictions += 1

	        results_details.append({
	            "Question": item['question'],
	            "Correct": "✅" if is_correct else "❌",
	            "Expected": expected_letter,
	            "Predicted": predicted_letter or "N/A",
	            "Model Output": generated_text_only
	        })

	    accuracy = (correct_predictions / num_samples) * 100 if num_samples > 0 else 0
	    return accuracy, results_details


	@spaces.GPU()
	def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress(track_tqdm=True)):
	    """
	    Run a full evaluation and return the Gradio component updates for the UI.

	    Loads the model, evaluates either the single chosen subject or, for
	    "ALL", every subject listed for the benchmark, builds a Markdown summary
	    and appends one record to EVAL_FILE when at least one sample was evaluated.
	    A subject that fails is skipped and reported in the summary.

	    Args:
	        model_id: Hub ID of the model to evaluate.
	        benchmark_category: Key of BENCHMARK_MAP.
	        subject_name: Subject to evaluate, or "ALL".
	        sample_count: Maximum number of samples per subject.
	        progress: Gradio progress tracker.

	    Returns:
	        Dict mapping UI components to ``gr.update(...)`` values. On success it
	        fills the summary and detail table and hides the error panel; on an
	        unexpected error it fills and shows the error panel instead.
	    """
	    try:
	        gr.Info("Starting evaluation...")
	        generator = load_model(model_id)

	        dataset_id = BENCHMARK_MAP.get(benchmark_category)
	        if not dataset_id:
	            raise ValueError(f"Invalid benchmark category: {benchmark_category}")

	        if subject_name == "ALL":
	            # "ALL" is only a UI placeholder, not a dataset config.
	            subjects_to_run = [s for s in ALL_BENCHMARK_SUBJECTS.get(benchmark_category, []) if s != "ALL"]
	        else:
	            subjects_to_run = [subject_name]

	        if not subjects_to_run:
	            gr.Warning(f"No subjects found for '{benchmark_category}'.")
	            return {
	                result_summary_output: gr.update(value="No subjects found to evaluate.", visible=True),
	                error_box: gr.update(visible=False),
	                details_box: gr.update(visible=False),
	            }

	        all_results_details = []
	        summary_lines = []
	        total_correct = 0
	        total_samples = 0

	        for position, subject in enumerate(subjects_to_run, start=1):
	            gr.Info(f"Evaluating {benchmark_category} - {subject} ({position}/{len(subjects_to_run)})...")
	            try:
	                accuracy, subject_details = evaluate_single_subject(generator, dataset_id, subject, sample_count, progress)
	            except Exception as e:
	                error_trace = traceback.format_exc()
	                # gr.Error is an exception class: constructing it without
	                # raising shows nothing. gr.Warning shows a toast and lets the
	                # loop continue with the next subject.
	                gr.Warning(f"Skipping {subject} due to an error: {e}")
	                summary_lines.append(f"- {subject}: Evaluation failed. See logs for details:\n```\n{error_trace}\n```")
	                continue

	            all_results_details.extend(subject_details)
	            num_correct = sum(1 for d in subject_details if d['Correct'] == "✅")
	            num_evaluated = len(subject_details)

	            total_correct += num_correct
	            total_samples += num_evaluated
	            summary_lines.append(f"- {subject}: {accuracy:.2f}% ({num_correct}/{num_evaluated})")

	        overall_accuracy = (total_correct / total_samples) * 100 if total_samples > 0 else 0

	        # --- Prepare Outputs ---
	        if subject_name == "ALL":
	            result_summary = f"### Overall Average Accuracy: {overall_accuracy:.2f}%\n"
	            result_summary += f"across {total_samples:,} total samples from {len(subjects_to_run)} subjects.\n\n---\n\nBreakdown by Subject:\n"
	            result_summary += "\n".join(summary_lines)
	        else:
	            result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
	            result_summary += f"({total_correct:,}/{total_samples:,} correct)"
	            if total_samples == 0:
	                # Show why a single-subject run produced nothing instead of a
	                # bare "0/0 correct".
	                result_summary += "\n\n" + "\n".join(summary_lines)

	        if total_samples > 0:
	            # Only runs that evaluated something belong on the leaderboard; a
	            # run where every subject failed would otherwise record 0%.
	            record = {
	                "model_id": model_id,
	                "benchmark": benchmark_category,
	                "accuracy": overall_accuracy,
	                "subject": subject_name,
	                "sample_count": total_samples,
	                "timestamp": datetime.now().isoformat()
	            }
	            with open(EVAL_FILE, "a", encoding="utf-8") as f:
	                f.write(json.dumps(record) + "\n")
	            gr.Info("Evaluation completed successfully!")
	        else:
	            gr.Warning("No samples were evaluated; nothing was saved to the leaderboard.")

	        df_details = pd.DataFrame(all_results_details)

	        return {
	            result_summary_output: gr.update(value=result_summary, visible=True),
	            error_box: gr.update(visible=False),
	            details_box: gr.update(visible=True),
	            detailed_results_df: gr.update(value=df_details)
	        }

	    except Exception as e:
	        error_message = f"An unexpected error occurred during setup: {e}"
	        error_details = traceback.format_exc()
	        # Toast via gr.Warning (an un-raised gr.Error displays nothing); the
	        # full message and traceback go to the error panel below.
	        gr.Warning(error_message)

	        return {
	            result_summary_output: gr.update(visible=False),
	            error_box: gr.update(visible=True),
	            error_output: gr.update(value=error_message),
	            error_details_output: gr.update(value=error_details),
	            details_box: gr.update(visible=False)
	        }


	# --- UI Helper Functions ---

	def update_subject_dropdown(benchmark_category):
	    """Return a dropdown update listing the subjects of the selected benchmark."""
	    subject_choices = ALL_BENCHMARK_SUBJECTS.get(benchmark_category, [])
	    # Prefer "ALL"; otherwise fall back to the first subject, or to no
	    # selection when the benchmark has no subjects at all.
	    if "ALL" in subject_choices:
	        selected = "ALL"
	    elif subject_choices:
	        selected = subject_choices[0]
	    else:
	        selected = None
	    return gr.update(choices=subject_choices, value=selected)

	_LEADERBOARD_COLUMNS = ["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"]


	def _empty_leaderboard():
	    """Return an empty frame that carries only the leaderboard's columns."""
	    return pd.DataFrame(columns=_LEADERBOARD_COLUMNS)


	def load_leaderboard(benchmark_filter, progress=gr.Progress()):
	    """
	    Build the leaderboard table for one benchmark from the saved records.

	    Only records whose ``subject`` is 'ALL' and whose ``benchmark`` equals
	    ``benchmark_filter`` are used. For each model the most recent such record
	    (by ``timestamp``) is kept, and models are ranked by descending accuracy.

	    Args:
	        benchmark_filter: Benchmark name to display.
	        progress: Gradio progress tracker.

	    Returns:
	        DataFrame with the columns Rank, Model ID, Avg. Accuracy (%), Total
	        Samples and Date. It has no rows when the results file is missing or
	        empty, when nothing matches the filter, or when loading fails.
	    """
	    progress(0, desc="Loading Leaderboard...")
	    try:
	        if not os.path.exists(EVAL_FILE):
	            return _empty_leaderboard()

	        df = pd.read_json(EVAL_FILE, lines=True)
	        if df.empty:
	            return _empty_leaderboard()

	        # Drop records whose accuracy is not numeric.
	        df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
	        df = df.dropna(subset=['accuracy'])

	        is_selected = (df['benchmark'] == benchmark_filter) & (df['subject'] == 'ALL')
	        df_filtered = df[is_selected].copy()
	        if df_filtered.empty:
	            return _empty_leaderboard()

	        # Keep only the latest evaluation of each model.
	        df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])
	        latest_rows = df_filtered.groupby('model_id')['timestamp'].idxmax()
	        leaderboard_df = df_filtered.loc[latest_rows].sort_values(by="accuracy", ascending=False)

	        leaderboard_df.insert(0, 'Rank', range(1, len(leaderboard_df) + 1))
	        leaderboard_df = leaderboard_df.rename(columns={
	            'model_id': 'Model ID',
	            'accuracy': 'Avg. Accuracy (%)',
	            'sample_count': 'Total Samples',
	            'timestamp': 'Date',
	        })

	        leaderboard_df['Avg. Accuracy (%)'] = leaderboard_df['Avg. Accuracy (%)'].map('{:.2f}'.format)
	        leaderboard_df['Date'] = leaderboard_df['Date'].dt.strftime('%Y-%m-%d')

	        progress(1, desc="Done.")
	        return leaderboard_df[_LEADERBOARD_COLUMNS]

	    except Exception as e:
	        # gr.Error is an exception class and shows nothing unless raised;
	        # gr.Warning shows a toast while still returning an empty table.
	        gr.Warning(f"Error loading leaderboard: {e}")
	        traceback.print_exc()
	        return _empty_leaderboard()


	# --- Gradio Interface Definition ---

	# Builds the whole UI: a header, a "Leaderboard" tab, a "Run Evaluation" tab,
	# and the event wiring between them. The css argument is an inline stylesheet.
	# NOTE(review): indentation is flattened in this copy of the file; the nesting
	# of the `with` blocks below has to be restored before this can run.
	with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css="""
	/* --- Global & Layout --- */
	body { font-family: 'Inter', sans-serif; background-color: #f8f9fa; }
	.gradio-container { max-width: 1280px !important; margin: auto; }
	.gr-group { border-radius: 12px !important; box-shadow: 0 4px 12px rgba(0,0,0,0.05) !important; border: 1px solid #e9ecef !important; background-color: white; }

	/* --- Typography --- */
	h1 { text-align: center; font-size: 2.5rem !important; font-weight: 800; color: #212529; margin-bottom: 0.5rem; letter-spacing: -1.5px; }
	.subtitle { text-align: center; color: #6c757d; font-size: 1.1rem; margin-bottom: 2.5rem; max-width: 800px; margin-left: auto; margin-right: auto;}

	/* --- Buttons & Inputs --- */
	.gr-button { font-weight: 600 !important; transition: all 0.2s ease; }
	.gr-button-primary { box-shadow: 0 4px 10px rgba(59, 130, 246, 0.2); }
	.gr-button-primary:hover { transform: translateY(-2px); box-shadow: 0 6px 15px rgba(59, 130, 246, 0.3); }

	/* --- Custom Radio Buttons (Segmented Control) --- */
	#leaderboard-toggle-group { display: flex; justify-content: center; align-items: center; gap: 1rem; margin-bottom: 1.5rem; }
	#leaderboard-toggle { background-color: #e9ecef; padding: 5px; border-radius: 10px; display: inline-flex; }
	#leaderboard-toggle div.gr-form { display: flex; gap: 5px; }
	#leaderboard-toggle input[type='radio'] { display: none; }
	#leaderboard-toggle label { padding: 8px 16px; border-radius: 8px; cursor: pointer; transition: all 0.3s ease; font-weight: 500; color: #495057; background: transparent; border: none; box-shadow: none; }
	#leaderboard-toggle input[type='radio']:checked + label { background-color: white; color: #0d6efd; font-weight: 600; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }

	/* --- Dataframe / Table Styling --- */
	.leaderboard-table .gr-dataframe table { border-collapse: collapse; width: 100%; }
	.leaderboard-table .gr-dataframe thead th { background-color: #f8f9fa !important; color: #495057 !important; font-weight: 600 !important; text-align: left; padding: 12px 15px; border-bottom: 2px solid #dee2e6; }
	.leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) { background-color: #fdfdff; }
	.leaderboard-table .gr-dataframe tbody tr:hover { background-color: #f0f6ff; }
	.leaderboard-table .gr-dataframe tbody td { padding: 12px 15px; border-bottom: 1px solid #e9ecef; }
	.leaderboard-table .gr-dataframe tbody td:first-child { font-weight: 700; color: #495057; }


	/* --- Error & Result Panes --- */
	#error-display-box { background-color: #fff3f3 !important; border-color: #ffc9c9 !important; }
	#result-summary-box { background-color: #f3f9ff !important; border-color: #cde4ff !important; }
	""") as demo:
	# Page title and subtitle.
	gr.Markdown("<h1>🏆 Open LLM Evaluator</h1>")
	gr.Markdown("<p class='subtitle'>Benchmark leading models on MMLU and MMLU-Pro. Your results contribute to a live leaderboard. Select a benchmark and run an evaluation, or view the current standings.</p>")

	with gr.Tabs() as tabs:
	# --- Leaderboard Tab ---
	with gr.TabItem("📊 Leaderboard", id=0):
	with gr.Column():
	with gr.Row(elem_id="leaderboard-toggle-group"):
	# Chooses which benchmark's records load_leaderboard displays.
	leaderboard_type_toggle = gr.Radio(
	["MMLU", "MMLU-Pro"],
	label="Select Benchmark",
	value="MMLU",
	interactive=True,
	elem_id="leaderboard-toggle",
	container=False,
	show_label=False,
	)
	refresh_button = gr.Button("🔄 Refresh", size="sm")

	# Read-only table; its headers match the columns load_leaderboard returns.
	leaderboard_table_output = gr.DataFrame(
	headers=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"],
	interactive=False,
	datatype=["number", "str", "str", "number", "str"],
	row_count=15,
	elem_classes="leaderboard-table"
	)

	# --- Evaluation Tab ---
	with gr.TabItem("🚀 Run Evaluation", id=1):
	with gr.Row(variant='panel'):
	with gr.Column(scale=2):
	with gr.Group():
	gr.Markdown("### 1. Configure Evaluation")
	model_id_input = gr.Textbox(
	label="Hugging Face Model ID",
	placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
	interactive=True
	)
	benchmark_selection_radio = gr.Radio(
	["MMLU", "MMLU-Pro"],
	label="Benchmark",
	value="MMLU",
	interactive=True,
	)
	with gr.Row():
	# Starts with the MMLU subjects because the radio above defaults to "MMLU";
	# update_subject_dropdown swaps the choices when the benchmark changes.
	benchmark_subject_dropdown = gr.Dropdown(
	label="Subject",
	choices=ALL_BENCHMARK_SUBJECTS.get("MMLU", []),
	value="ALL",
	interactive=True
	)
	sample_count_slider = gr.Slider(
	label="Samples per Subject",
	minimum=5, maximum=100, value=25, step=5, interactive=True
	)

	run_button = gr.Button("Start Evaluation", variant="primary", scale=1)

	with gr.Column(scale=3):
	gr.Markdown("### 2. View Results")

	# Panel for displaying the summary of results
	# NOTE(review): this group is created hidden and is not among run_button's
	# outputs; run_evaluation only toggles result_summary_output. Nothing shown
	# here ever makes the group itself visible - confirm the summary appears.
	with gr.Group(visible=False) as result_summary_box:
	result_summary_output = gr.Markdown(elem_id="result-summary-box")

	# Panel for displaying errors
	with gr.Group(visible=False) as error_box:
	error_output = gr.Textbox(label="Error Message", interactive=False, elem_id="error-display-box")
	error_details_output = gr.Textbox(label="Error Details (Traceback)", interactive=False, lines=8)

	# Panel for detailed, row-by-row results
	with gr.Group(visible=False) as details_box:
	gr.Markdown("#### Detailed Evaluation Log")
	# Headers match the keys of the per-item dicts built by evaluate_single_subject.
	detailed_results_df = gr.DataFrame(
	headers=["Question", "Correct", "Expected", "Predicted", "Model Output"],
	datatype=["str", "str", "str", "str", "str"],
	interactive=False,
	row_count=10,
	col_count=5,
	wrap=True,
	)

	# --- Event Handlers & Logic ---

	# Update subject dropdown when benchmark type changes
	benchmark_selection_radio.change(
	fn=update_subject_dropdown,
	inputs=[benchmark_selection_radio],
	outputs=[benchmark_subject_dropdown]
	)

	# Main evaluation trigger
	# run_evaluation returns a dict keyed by components, so every component it
	# may update has to be listed in outputs.
	run_button.click(
	fn=run_evaluation,
	inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider],
	outputs=[result_summary_output, error_box, error_output, error_details_output, details_box, detailed_results_df]
	).then(
	# After evaluation, switch to the leaderboard tab and refresh it
	# NOTE(review): this also runs when run_evaluation returned its error-panel
	# updates (it catches its own exceptions) - confirm leaving the evaluation
	# tab right away is intended.
	lambda: gr.update(selected=0), outputs=[tabs]
	).then(
	load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output]
	)

	# Leaderboard loading logic
	# The table is filled on page load, on benchmark toggle, and on manual refresh.
	demo.load(
	fn=load_leaderboard,
	inputs=[leaderboard_type_toggle],
	outputs=[leaderboard_table_output]
	)
	leaderboard_type_toggle.change(
	fn=load_leaderboard,
	inputs=[leaderboard_type_toggle],
	outputs=[leaderboard_table_output],
	show_progress='minimal'
	)
	refresh_button.click(
	fn=load_leaderboard,
	inputs=[leaderboard_type_toggle],
	outputs=[leaderboard_table_output],
	show_progress='full'
	)

	# Start the Gradio server only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch(debug=True)