import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset, get_dataset_config_names
import torch
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
import traceback  # Import traceback for detailed error logging
import spaces  # Import the spaces library

# Cache to avoid reloading the model
model_cache = {}
HF_TOKEN = os.environ.get("HF_TOKEN")

# --- Constants for Benchmarks ---
MMLU_DATASET = "cais/mmlu"
MMLU_PRO_DATASET = "cais/mmlu_pro"
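# NOTE (assumption, not verified here): MMLU-Pro is published on the Hub as
# "TIGER-Lab/MMLU-Pro" rather than under the cais namespace, and its schema
# differs from MMLU (up to ten options in an "options" field plus an
# "answer_index"). If the config lookup below fails for MMLU_PRO_DATASET,
# switching the ID alone is likely not enough: format_prompt() and
# get_choice_letter() would also need to handle the wider choice set.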

def get_all_benchmark_options():
    """
    Dynamically fetches all available subjects for MMLU and MMLU-Pro.
    Returns a dictionary mapping benchmark dataset IDs to their subjects,
    and a flattened list suitable for a Gradio dropdown.
    """
    all_options = {}
    gr_dropdown_options = []

    # Get subjects for MMLU
    try:
        mmlu_subjects = get_dataset_config_names(MMLU_DATASET, token=HF_TOKEN)
        all_options[MMLU_DATASET] = ["ALL"] + mmlu_subjects
        gr_dropdown_options.extend([f"MMLU - {s}" for s in all_options[MMLU_DATASET]])
    except Exception as e:
        print(f"Warning: Could not load MMLU dataset configs. Error: {e}")
        all_options[MMLU_DATASET] = []

    # Get subjects for MMLU-Pro
    try:
        mmlu_pro_subjects = get_dataset_config_names(MMLU_PRO_DATASET, token=HF_TOKEN)
        all_options[MMLU_PRO_DATASET] = ["ALL"] + mmlu_pro_subjects
        gr_dropdown_options.extend([f"MMLU-Pro - {s}" for s in all_options[MMLU_PRO_DATASET]])
    except Exception as e:
        print(f"Warning: Could not load MMLU-Pro dataset configs. It might not be accessible or available. Error: {e}")
        all_options[MMLU_PRO_DATASET] = []

    return all_options, gr_dropdown_options
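# Illustrative sketch (subject names abridged, not an exhaustive list) of what
# get_all_benchmark_options() returns when both config lookups succeed:
#
#   all_options == {
#       "cais/mmlu": ["ALL", "abstract_algebra", "anatomy", ...],
#       "cais/mmlu_pro": ["ALL", ...],
#   }
#   gr_dropdown_options == ["MMLU - ALL", "MMLU - abstract_algebra", ...,
#                           "MMLU-Pro - ALL", ...]
#
# run_evaluation() later splits a dropdown entry on " - " to recover the
# benchmark name and the subject.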

# Initialize these once globally when the app starts
ALL_BENCHMARK_SUBJECTS, GRADIO_DROPDOWN_OPTIONS = get_all_benchmark_options()


# Runs on GPU when invoked from the @spaces.GPU-decorated run_evaluation below.
def load_model(model_id):
    """
    Loads a Hugging Face model and its tokenizer, then creates a text-generation pipeline.
    Uses a cache to avoid re-loading if the model is already in memory.
    Provides Gradio Info/Warning messages for user feedback.
    Raises an exception if model loading fails.
    """
    gr.Info(f"Attempting to load model: {model_id}...")
    if model_id in model_cache:
        gr.Info(f"Model '{model_id}' already loaded from cache.")
        return model_cache[model_id]
    try:
        # Load tokenizer and model, using bfloat16 if CUDA is available for efficiency
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=HF_TOKEN,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            trust_remote_code=True
        ).to("cuda" if torch.cuda.is_available() else "cpu")
        # Create a text-generation pipeline
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=0 if torch.cuda.is_available() else -1
        )
        # Cache the loaded generator
        model_cache[model_id] = generator
        gr.Info(f"Model '{model_id}' loaded successfully.")
        return generator
    except Exception as e:
        # Re-raise so the outer try/except in run_evaluation can surface the error in the UI
        raise ValueError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token. Error: {e}")
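# Minimal usage sketch (the model ID below is a placeholder):
#
#   generator = load_model("org/some-7b-model")   # first call downloads and caches
#   generator = load_model("org/some-7b-model")   # second call is served from model_cache
#   out = generator("Hello, world", max_new_tokens=3, do_sample=False)
#   text = out[0]["generated_text"]
#
# This mirrors how evaluate_single_subject() uses the pipeline below.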

def format_prompt(item):
    """
    Formats a single MMLU/MMLU-Pro question item into a clear prompt for the LLM.
    The prompt is designed for the model to output a single letter answer (A, B, C, D).
    """
    prompt = f"""{item['question']}
A. {item['choices'][0]}
B. {item['choices'][1]}
C. {item['choices'][2]}
D. {item['choices'][3]}
Answer:"""
    return prompt, item['answer']  # Returns the prompt string and the correct choice index (0-3)
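# Illustrative sketch of the prompt format_prompt() builds, using a made-up item
# (the question and choices below are placeholders, not taken from MMLU):
#
#   Which gas makes up most of Earth's atmosphere?
#   A. Oxygen
#   B. Nitrogen
#   C. Carbon dioxide
#   D. Argon
#   Answer:
#
# The trailing "Answer:" cue is what lets a single greedily decoded token
# (e.g. " B") serve as the model's choice.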

def extract_choice_letter(output):
    """
    Extracts the most likely choice letter (A, B, C, D) from the model's generated output.
    It prioritizes an exact match after "Answer:", then looks for any single capital letter.
    """
    # Look for "Answer: X" pattern first (e.g., "Answer: A" or "Answer: B")
    match = re.search(r"Answer:\s*([ABCD])", output, re.IGNORECASE)
    if match:
        return match.group(1).upper()  # Ensure it's uppercase
    # Fallback: look for a single capital letter A-D anywhere in the output
    match = re.search(r"\b([ABCD])\b", output.strip())
    if match:
        return match.group(1)
    return None  # Return None if no valid choice letter is found
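# A few illustrative cases for extract_choice_letter() (inputs are made up):
#
#   extract_choice_letter(" B")              -> "B"   (fallback single-letter match)
#   extract_choice_letter("Answer: c")       -> "C"   (case-insensitive "Answer:" match)
#   extract_choice_letter("I am not sure.")  -> None  (no standalone A-D found)
#
# The fallback pattern matches the first standalone A-D it sees, so it should
# only be run on the model's completion, never on text that still contains the
# "A."/"B."/... answer choices from the prompt.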

def get_choice_letter(index):
    """Converts a numerical choice index (0-3) to a capital letter (A-D)."""
    if 0 <= index <= 3:
        return chr(ord('A') + index)
    return None  # Return None for invalid indices
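# Index-to-letter mapping used throughout:
#   get_choice_letter(0) -> "A", ..., get_choice_letter(3) -> "D";
#   anything outside 0-3 yields None.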

def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
    """
    Evaluates a given model generator on a specific subject from a specified dataset.

    Args:
        generator: The Hugging Face pipeline for text generation.
        dataset_id (str): The ID of the dataset (e.g., "cais/mmlu", "cais/mmlu_pro").
        subject (str): The specific subject/config name within the dataset.
        sample_count (int): The maximum number of samples to evaluate.
        progress (gr.Progress): Gradio progress tracker.

    Returns:
        tuple: (accuracy, list_of_detailed_results)

    Raises:
        Exception: If dataset loading fails.
    """
    gr.Info(f"Loading dataset: {dataset_id} - {subject}...")
    try:
        # Load the "test" split of the dataset
        dataset = load_dataset(dataset_id, subject, token=HF_TOKEN)["test"]
    except Exception as e:
        # Re-raise so the outer try/except in run_evaluation can surface the error in the UI
        raise RuntimeError(f"Failed to load dataset '{dataset_id}' for subject '{subject}'. Error: {e}")

    # Limit the number of samples and shuffle for consistent evaluation across runs
    num_samples_to_evaluate = min(sample_count, len(dataset))
    dataset = dataset.shuffle(seed=42).select(range(num_samples_to_evaluate))

    correct_count = 0
    subject_results = []

    # Iterate through the selected samples with a progress bar
    for item in progress.tqdm(dataset, desc=f"Processing {subject} samples"):
        prompt, answer_idx = format_prompt(item)
        expected_letter = get_choice_letter(answer_idx)

        # Generate only 1 new token for the answer (A, B, C, D).
        # do_sample=False ensures deterministic output for a given prompt (greedy decoding).
        output_raw = generator(prompt, max_new_tokens=1, do_sample=False)[0]["generated_text"]
        # The pipeline returns the prompt followed by the new text, so keep only the
        # completion to avoid matching letters or keywords from the question itself.
        completion = output_raw[len(prompt):]

        # Flag output that looks like free-form reasoning rather than a bare letter
        is_reasoning_model_output = '<' in completion or re.search(r"\b(because|therefore|thus|reasoning)\b", completion, re.IGNORECASE) is not None

        # Extract the predicted letter from the model's completion
        predicted_letter = extract_choice_letter(completion)
        is_correct = (predicted_letter == expected_letter)
        correct_count += int(is_correct)

        # Store detailed results for logging and display
        subject_results.append({
            "question": item['question'],
            "choices": item['choices'],
            "model_raw_output": completion.strip(),
            "expected_answer_letter": expected_letter,
            "predicted_answer_letter": predicted_letter,
            "is_correct": is_correct,
            "is_reasoning_model_output": is_reasoning_model_output  # Store the flag
        })

    # Calculate accuracy for the current subject
    accuracy = (correct_count / len(dataset)) * 100 if len(dataset) > 0 else 0
    return accuracy, subject_results

# The @spaces.GPU decorator requests a ZeroGPU slot so this function (and the model
# loading/generation it triggers) runs on GPU when one is available.
@spaces.GPU
def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=gr.Progress()):
    """
    Main function to orchestrate the evaluation process.
    Handles single subject or 'ALL' subjects evaluation for MMLU/MMLU-Pro.
    Returns gr.update objects to control UI component visibility and content.
    """
    gr.Info("Starting evaluation...")
    if not model_id:
        gr.Warning("Please enter a Hugging Face Model ID before running the evaluation.")
        # Return updates to hide logs/debug and show empty results
        return "", gr.update(value="", visible=False), gr.update(visible=False), \
               gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)

    # Parse the selected benchmark and subject from the dropdown string
    parts = selected_benchmark_subject.split(" - ")
    if len(parts) != 2:
        gr.Warning("Invalid benchmark selection format. Please select an entry from the dropdown.")
        return "", gr.update(value="", visible=False), gr.update(visible=False), \
               gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)

    benchmark_name = parts[0]
    subject_name = parts[1]
    dataset_id_map = {
        "MMLU": MMLU_DATASET,
        "MMLU-Pro": MMLU_PRO_DATASET
    }
    current_dataset_id = dataset_id_map.get(benchmark_name)
    if not current_dataset_id:
        gr.Warning(f"Unknown benchmark selected: {benchmark_name}. This should not happen.")
        return "", gr.update(value="", visible=False), gr.update(visible=False), \
               gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)

    try:
        generator = load_model(model_id)  # Raises an exception on failure

        all_evaluation_results = []
        total_correct_overall = 0
        total_samples_overall = 0
        eval_summary_lines = []

        if subject_name == "ALL":
            # Copy the subject list so the global ALL_BENCHMARK_SUBJECTS is not mutated
            subjects_to_evaluate = [s for s in ALL_BENCHMARK_SUBJECTS.get(current_dataset_id, []) if s != "ALL"]
            if not subjects_to_evaluate:
                gr.Warning(f"No subjects found to evaluate for '{benchmark_name}'.")
                return "", gr.update(value="", visible=False), gr.update(visible=False), \
                       gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)

            for i, sub in enumerate(progress.tqdm(subjects_to_evaluate, desc=f"Evaluating ALL {benchmark_name} subjects")):
                gr.Info(f"Evaluating {benchmark_name} - {sub} ({i+1}/{len(subjects_to_evaluate)})...")
                try:
                    accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, sub, sample_count, progress)
                    all_evaluation_results.extend(subject_details)
                    num_evaluated_samples = len(subject_details)
                    num_correct_in_subject = sum(d['is_correct'] for d in subject_details)
                    total_correct_overall += num_correct_in_subject
                    total_samples_overall += num_evaluated_samples
                    eval_summary_lines.append(f"- {benchmark_name} - {sub}: {accuracy:.2f}% ({num_correct_in_subject}/{num_evaluated_samples} samples)")
                except Exception as e:
                    gr.Warning(f"Skipping {benchmark_name} - {sub} due to an error: {e}")
                    eval_summary_lines.append(f"- {benchmark_name} - {sub}: Error during evaluation.")
                    continue

            overall_accuracy = (total_correct_overall / total_samples_overall) * 100 if total_samples_overall > 0 else 0
            score_string = f"Overall Average Accuracy for {benchmark_name}: {overall_accuracy:.2f}% across {total_samples_overall} total samples.\n\n"
            score_string += "Detailed breakdown:\n" + "\n".join(eval_summary_lines)
        else:
            accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, subject_name, sample_count, progress)
            all_evaluation_results.extend(subject_details)
            overall_accuracy = accuracy
            num_evaluated_samples = len(subject_details)
            score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% on {num_evaluated_samples} samples."

        # Format detailed results for display in the text box
        formatted_details = "\n\n".join([
            (
                f"### Question:\n{item['question']}\n\n"
                + f"**Choices:**\n" + "\n".join([f"{get_choice_letter(i)}. {c}" for i, c in enumerate(item['choices'])]) + "\n\n"
                + (f"**Note:** Reasoning-style output is not fully supported for single-letter extraction; the raw model output is shown below.\n" if item.get('is_reasoning_model_output') else "")
                + f"**Model Raw Output:** {item['model_raw_output']}\n"
                + f"**Expected Answer:** {item['expected_answer_letter']}\n"
                + f"**Predicted Answer:** {item['predicted_answer_letter']}\n"
                + f"**Correct:** {'Yes' if item['is_correct'] else 'No'}"
            )
            for item in all_evaluation_results
        ])

        # Record the evaluation result to a JSONL file for the leaderboard
        record = {
            "model_id": model_id,
            "benchmark": benchmark_name,
            "subject": subject_name,
            "accuracy": overall_accuracy,
            "sample_count": total_samples_overall if subject_name == "ALL" else len(all_evaluation_results),
            "timestamp": pd.Timestamp.now().isoformat()
        }
        with open("eval.jsonl", "a") as f:
            f.write(json.dumps(record) + "\n")

        gr.Info("Evaluation completed successfully!")
        return score_string, \
               gr.update(value="", visible=False), gr.update(visible=False), \
               gr.update(visible=True), gr.update(visible=True), gr.update(value=formatted_details, visible=False)

    except Exception:
        detailed_error_traceback = traceback.format_exc()
        gr.Warning("An error occurred during evaluation.")
        # Return updates for the error state: show the debug panel, hide the detail controls
        return "An error occurred during evaluation. Check the debug information below; if the problem persists, please open a discussion in the Community tab.", \
               gr.update(value=detailed_error_traceback, visible=True), gr.update(visible=True), \
               gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
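# The six return values of run_evaluation() map positionally onto the outputs list
# wired up in run_button.click() further down:
#   1. acc_output            - accuracy summary text
#   2. error_message_output  - traceback text (shown only on failure)
#   3. debug_error_column    - visibility of the debug panel
#   4. show_details_button   - visibility of the "Show Detailed Logs" button
#   5. download_button       - visibility of the download button
#   6. detail_output         - per-question log text (revealed via the toggle button)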

def save_text(text_content):
    """Saves the provided text content to a file and returns the file path for download."""
    if not text_content:
        gr.Warning("No evaluation results to download.")
        return None
    file_path = "evaluation_results.txt"
    try:
        with open(file_path, "w") as f:
            f.write(text_content)
        return file_path
    except Exception as e:
        gr.Warning(f"Error saving file: {e}")
        return None
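# Illustrative sketch of a single eval.jsonl line as written by run_evaluation()
# (the model ID and numbers below are placeholders, not real results):
#
#   {"model_id": "org/some-model", "benchmark": "MMLU", "subject": "ALL",
#    "accuracy": 0.0, "sample_count": 0, "timestamp": "1970-01-01T00:00:00"}
#
# load_leaderboard() below averages the "accuracy" field per model, preferring
# records where subject == "ALL" when they exist.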

def load_leaderboard():
    """
    Loads evaluation data from 'eval.jsonl', computes average accuracy per model for MMLU and MMLU-Pro,
    and prepares data for two separate leaderboard tables.
    """
    empty_board = pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"])
    try:
        df = pd.read_json("eval.jsonl", lines=True)
        # Ensure 'accuracy' is numeric, coerce errors to NaN and drop them
        df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
        df = df.dropna(subset=['accuracy'])
        if df.empty:
            gr.Warning("No valid evaluation data found to populate the leaderboard.")
            return empty_board, empty_board.copy()

        # Filter for MMLU data.
        # Prefer "ALL"-subject runs (overall accuracy); if a model only has
        # per-subject runs, fall back to averaging those. A weighted average
        # would be more precise, but this keeps the leaderboard simple.
        df_mmlu = df[df['benchmark'] == 'MMLU']
        if 'subject' in df_mmlu.columns:
            df_mmlu_grouped = df_mmlu[df_mmlu['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
            if df_mmlu_grouped.empty:
                df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
        else:  # Handle older eval.jsonl files without a 'subject' column
            df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
        df_mmlu_grouped.columns = ["Model ID", "Average Accuracy (%)"]
        df_mmlu_sorted = df_mmlu_grouped.sort_values(by="Average Accuracy (%)", ascending=False)

        # Filter for MMLU-Pro data, using the same preference for "ALL" runs
        df_mmlu_pro = df[df['benchmark'] == 'MMLU-Pro']
        if 'subject' in df_mmlu_pro.columns:
            df_mmlu_pro_grouped = df_mmlu_pro[df_mmlu_pro['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
            if df_mmlu_pro_grouped.empty:
                df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
        else:  # Handle older eval.jsonl files
            df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
        df_mmlu_pro_grouped.columns = ["Model ID", "Average Accuracy (%)"]
        df_mmlu_pro_sorted = df_mmlu_pro_grouped.sort_values(by="Average Accuracy (%)", ascending=False)

        # Return one DataFrame per leaderboard table
        return df_mmlu_sorted, df_mmlu_pro_sorted

    except FileNotFoundError:
        gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!")
        return empty_board, empty_board.copy()
    except Exception as e:
        gr.Warning(f"Error loading leaderboard: {e}")
        traceback.print_exc()  # Print full traceback for debugging
        return empty_board, empty_board.copy()

# --- Gradio Interface Definition ---
with gr.Blocks(css="""
/* Import Google Font - Inter */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

/* General body and container styling */
body {
    font-family: 'Inter', sans-serif;
    background-color: #eef2f6; /* Lighter background */
    margin: 0;
    padding: 20px;
}
.gradio-container {
    max-width: 1200px;
    margin: 20px auto;
    padding: 40px; /* Increased padding */
    box-shadow: 0 10px 25px rgba(0,0,0,0.1); /* Softer, larger shadow */
    border-radius: 15px; /* More rounded corners */
    background-color: #ffffff;
    border: 1px solid #e0e6ed; /* Subtle border */
}

/* Headings */
h1 {
    color: #1a202c; /* Darker, more professional heading color */
    text-align: center;
    margin-bottom: 30px;
    font-size: 2.8em; /* Slightly larger H1 */
    font-weight: 700;
    letter-spacing: -0.03em;
    text-shadow: 1px 1px 2px rgba(0,0,0,0.05); /* Subtle text shadow */
}
h3 {
    color: #2d3748;
    font-size: 1.3em; /* Slightly larger H3 */
    margin-bottom: 15px;
    font-weight: 600;
}

/* Markdown text */
.markdown-text {
    text-align: center;
    color: #4a5568;
    line-height: 1.7;
    font-size: 1.05em;
    margin-bottom: 30px;
}
.markdown-text div {
    font-size: 1.1em;
    max-width: 800px; /* Constrain width for readability */
    margin: 0 auto;
}

/* Buttons */
.gr-button {
    background-color: #2f80ed; /* A vibrant, professional blue */
    color: white;
    border: none;
    padding: 14px 30px; /* More padding */
    border-radius: 10px; /* More rounded */
    cursor: pointer;
    transition: background-color 0.3s ease, transform 0.2s ease, box-shadow 0.2s ease;
    font-size: 1.15em; /* Slightly larger font */
    font-weight: 600;
    box-shadow: 0 5px 15px rgba(0, 123, 255, 0.2); /* Enhanced shadow for primary button */
    margin: 5px; /* Add some margin for spacing between buttons */
}
.gr-button:hover {
    background-color: #1a6dcd; /* Darker blue on hover */
    transform: translateY(-3px); /* More pronounced lift effect */
    box-shadow: 0 8px 20px rgba(0, 123, 255, 0.3);
}
.gr-button:active {
    transform: translateY(0);
    box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}

/* Specific button styling for debug/show details */
#debug-button, #show-details-button {
    background-color: #718096; /* Professional grey */
    box-shadow: 0 3px 10px rgba(113, 128, 150, 0.2);
}
#debug-button:hover, #show-details-button:hover {
    background-color: #5d6d81;
    box-shadow: 0 5px 12px rgba(113, 128, 150, 0.3);
}
#download-button {
    background-color: #38a169; /* Muted green for download */
    box-shadow: 0 3px 10px rgba(56, 161, 105, 0.2);
}
#download-button:hover {
    background-color: #277e50;
    box-shadow: 0 5px 12px rgba(56, 161, 105, 0.3);
}

/* Input/Output Boxes (Containers) */
.gr-box {
    border: 1px solid #cbd5e0; /* Lighter, subtle border */
    border-radius: 12px;
    padding: 25px; /* Increased padding */
    margin-bottom: 25px;
    background-color: #f8fafc; /* Very light background */
    box-shadow: inset 0 2px 5px rgba(0,0,0,0.03); /* Subtle inner shadow */
}

/* Specific text output boxes (the content inside the containers) */
.gr-output-text {
    white-space: pre-wrap;
    word-wrap: break-word;
    background-color: #ffffff; /* White background for readability */
    border: 1px solid #e2e8f0;
    border-radius: 8px;
    padding: 18px; /* More padding */
    min-height: 120px; /* Ensure a minimum height */
    box-shadow: 0 2px 8px rgba(0,0,0,0.05); /* Small shadow for depth */
    color: #2d3748; /* Darker text for readability */
    font-size: 0.95em;
    line-height: 1.6;
}

/* Specific error output style */
#error-message-output {
    background-color: #ffe0e6; /* Light red */
    border-color: #ff99aa; /* Slightly darker red border */
    color: #c53030; /* Stronger red text */
    font-weight: 500;
    padding: 20px;
}

/* Labels for inputs */
.gr-textbox label, .gr-dropdown label, .gr-slider label {
    font-weight: 600;
    color: #2d3748; /* Darker label text */
    margin-bottom: 10px;
    display: block;
    font-size: 1.05em; /* Slightly larger label font */
}

/* Tabs styling */
.gr-tabs-nav button {
    font-weight: 600;
    font-size: 1.1em;
    padding: 12px 25px; /* More padding for tabs */
    border-top-left-radius: 10px;
    border-top-right-radius: 10px;
    background-color: #ebf4f8; /* Light blueish tab background */
    color: #4a5568;
    border: 1px solid #cce0eb; /* Subtle border for tabs */
    border-bottom: none;
    transition: background-color 0.3s ease, color 0.3s ease;
}
.gr-tabs-nav button.selected {
    background-color: #ffffff; /* White for selected tab */
    color: #2f80ed; /* Blue for selected text */
    border-color: #2f80ed;
    border-bottom: 1px solid #ffffff; /* Hide bottom border to merge with content */
}

/* Leaderboard specific table styling (general for all leaderboard tables) */
.leaderboard-table {
    border-radius: 12px;
    box-shadow: 0 4px 15px rgba(0,0,0,0.08);
    overflow: hidden;
    margin-bottom: 25px; /* Space between tables */
}
.leaderboard-table table {
    border-collapse: separate;
    border-spacing: 0;
    width: 100%;
    background-color: #ffffff;
}
.leaderboard-table thead th {
    background-color: #edf2f7; /* Light grey header */
    color: #2d3748;
    font-weight: 700;
    padding: 15px 20px;
    text-align: left;
    border-bottom: 2px solid #e2e8f0;
}
.leaderboard-table tbody tr {
    transition: background-color 0.2s ease;
}
.leaderboard-table tbody tr:nth-child(odd) {
    background-color: #f7fafc; /* Zebra striping */
}
.leaderboard-table tbody tr:hover {
    background-color: #e6fffa; /* Light teal on hover for rows */
}
.leaderboard-table tbody td {
    padding: 12px 20px;
    border-bottom: 1px solid #ebf4f8;
    color: #4a5568;
}
.leaderboard-table tbody tr:first-child td {
    border-top-left-radius: 12px;
    border-top-right-radius: 12px;
}
.leaderboard-table tbody tr:last-child td {
    border-bottom: none;
    border-bottom-left-radius: 12px;
    border-bottom-right-radius: 12px;
}

/* Horizontal line for separation */
hr {
    border: none;
    border-top: 1px solid #e2e8f0;
    margin: 30px 0;
}
""") as demo:
    gr.Markdown("""
    # 🤗 LLM Benchmark Evaluator
    """)

    with gr.Tabs():
        with gr.TabItem("🚀 Run Evaluation"):
            gr.Markdown("""
            <div class="markdown-text">
            Enter your Hugging Face Model ID, choose a benchmark (MMLU or MMLU-Pro),
            select a subject (or 'ALL' for a comprehensive evaluation),
            and specify the number of samples per subject.
            Ensure your Hugging Face token is set as an environment variable for private models.
            </div>
            """)
            with gr.Column(elem_classes="gr-box"):
                model_id_input = gr.Textbox(
                    label="Your Hugging Face Model ID",
                    placeholder="e.g., mistralai/Mistral-7B-Instruct-v0.2",
                    interactive=True
                )
                with gr.Row():
                    benchmark_subject_dropdown = gr.Dropdown(
                        label="Choose Benchmark and Subject",
                        choices=GRADIO_DROPDOWN_OPTIONS,
                        value="MMLU - ALL",  # Default to MMLU ALL for initial load
                        interactive=True,
                        min_width=400  # Ensure sufficient width
                    )
                    sample_count_slider = gr.Slider(
                        label="Number of Samples per Subject (1-100)",
                        minimum=1,
                        maximum=100,
                        value=10,  # Default to 10 samples
                        step=1,
                        interactive=True,
                        min_width=200
                    )
                run_button = gr.Button("🚀 Run Evaluation", elem_classes="gr-button")

            gr.Markdown("<hr>")  # Visual separator

            with gr.Column(elem_classes="gr-box"):
                acc_output = gr.Textbox(
                    label="Benchmark Accuracy Results",
                    interactive=False,
                    elem_classes="gr-output-text",
                    lines=5,
                    placeholder="Evaluation results will appear here."
                )
                # Container for debug info, initially hidden
                with gr.Column(visible=False, elem_id="debug-error-column") as debug_error_column:
                    error_message_output = gr.Textbox(
                        label="Debug Information (Error Details)",
                        lines=10, interactive=False, elem_classes="gr-output-text", elem_id="error-message-output",
                        placeholder="Error details will appear here if an error occurs."
                    )
                    debug_button = gr.Button("🐞 Hide Debug Info", visible=True, elem_id="debug-button", elem_classes="gr-button")
                with gr.Row():
                    show_details_button = gr.Button("📜 Show Detailed Logs", visible=False, elem_id="show-details-button", elem_classes="gr-button")
                    download_button = gr.Button("📥 Download Full Evaluation Logs", visible=False, elem_id="download-button", elem_classes="gr-button")
                # Detailed output, initially hidden
                detail_output = gr.Textbox(
                    label="Detailed Evaluation Logs",
                    lines=20,
                    interactive=False,
                    elem_classes="gr-output-text",
                    placeholder="Detailed logs for each question will appear here upon successful evaluation.",
                    visible=False  # Initially hidden
                )

            # Define button click actions
            run_button.click(
                run_evaluation,
                inputs=[model_id_input, benchmark_subject_dropdown, sample_count_slider],
                outputs=[
                    acc_output,
                    error_message_output, debug_error_column,  # For error state
                    show_details_button, download_button, detail_output  # For success state
                ]
            )

            # Toggle the detailed-logs panel. A gr.State tracks visibility because
            # layout and output components cannot report their own visibility when
            # passed as event inputs; one handler updates the panel, the button
            # label, and the state together.
            details_visible = gr.State(False)
            show_details_button.click(
                lambda visible: (
                    gr.update(visible=not visible),
                    "📜 Hide Detailed Logs" if not visible else "📜 Show Detailed Logs",
                    not visible
                ),
                inputs=[details_visible],
                outputs=[detail_output, show_details_button, details_visible]
            )

            # Toggle the debug information column in the same way
            debug_visible = gr.State(True)
            debug_button.click(
                lambda visible: (
                    gr.update(visible=not visible),
                    "🐞 Show Debug Info" if visible else "🐞 Hide Debug Info",
                    not visible
                ),
                inputs=[debug_visible],
                outputs=[debug_error_column, debug_button, debug_visible]
            )

            download_button.click(
                save_text,
                inputs=[detail_output],
                outputs=gr.File(label="Download Evaluation Results", file_count="single", type="filepath")
            )

        with gr.TabItem("🏆 Leaderboard"):
            gr.Markdown("""
            <div class="markdown-text">
            Explore the performance of various LLMs on the MMLU and MMLU-Pro benchmarks.
            This leaderboard is updated automatically with each new evaluation.
            </div>
            """)
            # MMLU Leaderboard Table
            gr.Markdown("### MMLU Top Models")
            mmlu_leaderboard_table = gr.Dataframe(
                headers=["Model ID", "Average Accuracy (%)"],
                interactive=False,
                datatype=["str", "number"],
                row_count=10,
                col_count=2,
                label="MMLU Leaderboard Data",
                elem_classes="leaderboard-table"  # Apply custom class for styling
            )
            gr.Markdown("### MMLU-Pro Top Models")
            mmlu_pro_leaderboard_table = gr.Dataframe(
                headers=["Model ID", "Average Accuracy (%)"],
                interactive=False,
                datatype=["str", "number"],
                row_count=10,
                col_count=2,
                label="MMLU-Pro Leaderboard Data",
                elem_classes="leaderboard-table"  # Apply custom class for styling
            )

    # Populate both leaderboard tables when the app loads
    demo.load(load_leaderboard, inputs=[], outputs=[mmlu_leaderboard_table, mmlu_pro_leaderboard_table])

# Launch the Gradio app
demo.launch()