import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset, get_dataset_config_names
import torch
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
import traceback  # Import traceback for detailed error logging
import spaces  # Import the spaces library

# Cache to avoid reloading the model
model_cache = {}
HF_TOKEN = os.environ.get("HF_TOKEN")

# --- Constants for Benchmarks ---
MMLU_DATASET = "cais/mmlu"
MMLU_PRO_DATASET = "cais/mmlu_pro"
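# NOTE (assumption, not verified here): MMLU-Pro is published on the Hub as
# "TIGER-Lab/MMLU-Pro" rather than under the cais namespace, and its schema
# differs from MMLU (up to ten options in an "options" field plus an
# "answer_index"). If the config lookup below fails for MMLU_PRO_DATASET,
# switching the ID alone is likely not enough: format_prompt() and
# get_choice_letter() would also need to handle the wider choice set.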

def get_all_benchmark_options():
    """
    Dynamically fetches all available subjects for MMLU and MMLU-Pro.
    Returns a dictionary mapping benchmark dataset IDs to their subjects,
    and a flattened list suitable for a Gradio dropdown.
    """
    all_options = {}
    gr_dropdown_options = []

    # Get subjects for MMLU
    try:
        mmlu_subjects = get_dataset_config_names(MMLU_DATASET, token=HF_TOKEN)
        all_options[MMLU_DATASET] = ["ALL"] + mmlu_subjects
        gr_dropdown_options.extend([f"MMLU - {s}" for s in all_options[MMLU_DATASET]])
    except Exception as e:
        print(f"Warning: Could not load MMLU dataset configs. Error: {e}")
        all_options[MMLU_DATASET] = []

    # Get subjects for MMLU-Pro
    try:
        mmlu_pro_subjects = get_dataset_config_names(MMLU_PRO_DATASET, token=HF_TOKEN)
        all_options[MMLU_PRO_DATASET] = ["ALL"] + mmlu_pro_subjects
        gr_dropdown_options.extend([f"MMLU-Pro - {s}" for s in all_options[MMLU_PRO_DATASET]])
    except Exception as e:
        print(f"Warning: Could not load MMLU-Pro dataset configs. It might not be accessible or available. Error: {e}")
        all_options[MMLU_PRO_DATASET] = []

    return all_options, gr_dropdown_options
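# Illustrative sketch (subject names abridged, not an exhaustive list) of what
# get_all_benchmark_options() returns when both config lookups succeed:
#
#   all_options == {
#       "cais/mmlu": ["ALL", "abstract_algebra", "anatomy", ...],
#       "cais/mmlu_pro": ["ALL", ...],
#   }
#   gr_dropdown_options == ["MMLU - ALL", "MMLU - abstract_algebra", ...,
#                           "MMLU-Pro - ALL", ...]
#
# run_evaluation() later splits a dropdown entry on " - " to recover the
# benchmark name and the subject.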

# Initialize these once globally when the app starts
ALL_BENCHMARK_SUBJECTS, GRADIO_DROPDOWN_OPTIONS = get_all_benchmark_options()


# Runs on GPU when invoked from the @spaces.GPU-decorated run_evaluation below.
def load_model(model_id):
    """
    Loads a Hugging Face model and its tokenizer, then creates a text-generation pipeline.
    Uses a cache to avoid re-loading if the model is already in memory.
    Provides Gradio Info/Warning messages for user feedback.
    Raises an exception if model loading fails.
    """
    gr.Info(f"Attempting to load model: {model_id}...")
    if model_id in model_cache:
        gr.Info(f"Model '{model_id}' already loaded from cache.")
        return model_cache[model_id]
    try:
        # Load tokenizer and model, using bfloat16 if CUDA is available for efficiency
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=HF_TOKEN,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            trust_remote_code=True
        ).to("cuda" if torch.cuda.is_available() else "cpu")
        # Create a text-generation pipeline
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=0 if torch.cuda.is_available() else -1
        )
        # Cache the loaded generator
        model_cache[model_id] = generator
        gr.Info(f"Model '{model_id}' loaded successfully.")
        return generator
    except Exception as e:
        # Re-raise so the outer try/except in run_evaluation can surface the error in the UI
        raise ValueError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token. Error: {e}")
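# Minimal usage sketch (the model ID below is a placeholder):
#
#   generator = load_model("org/some-7b-model")   # first call downloads and caches
#   generator = load_model("org/some-7b-model")   # second call is served from model_cache
#   out = generator("Hello, world", max_new_tokens=3, do_sample=False)
#   text = out[0]["generated_text"]
#
# This mirrors how evaluate_single_subject() uses the pipeline below.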

def format_prompt(item):
    """
    Formats a single MMLU/MMLU-Pro question item into a clear prompt for the LLM.
    The prompt is designed for the model to output a single letter answer (A, B, C, D).
    """
    prompt = f"""{item['question']}
A. {item['choices'][0]}
B. {item['choices'][1]}
C. {item['choices'][2]}
D. {item['choices'][3]}
Answer:"""
    return prompt, item['answer']  # Returns the prompt string and the correct choice index (0-3)
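# Illustrative sketch of the prompt format_prompt() builds, using a made-up item
# (the question and choices below are placeholders, not taken from MMLU):
#
#   Which gas makes up most of Earth's atmosphere?
#   A. Oxygen
#   B. Nitrogen
#   C. Carbon dioxide
#   D. Argon
#   Answer:
#
# The trailing "Answer:" cue is what lets a single greedily decoded token
# (e.g. " B") serve as the model's choice.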

def extract_choice_letter(output):
    """
    Extracts the most likely choice letter (A, B, C, D) from the model's generated output.
    It prioritizes an exact match after "Answer:", then looks for any single capital letter.
    """
    # Look for "Answer: X" pattern first (e.g., "Answer: A" or "Answer: B")
    match = re.search(r"Answer:\s*([ABCD])", output, re.IGNORECASE)
    if match:
        return match.group(1).upper()  # Ensure it's uppercase
    # Fallback: look for a single capital letter A-D anywhere in the output
    match = re.search(r"\b([ABCD])\b", output.strip())
    if match:
        return match.group(1)
    return None  # Return None if no valid choice letter is found
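# A few illustrative cases for extract_choice_letter() (inputs are made up):
#
#   extract_choice_letter(" B")              -> "B"   (fallback single-letter match)
#   extract_choice_letter("Answer: c")       -> "C"   (case-insensitive "Answer:" match)
#   extract_choice_letter("I am not sure.")  -> None  (no standalone A-D found)
#
# The fallback pattern matches the first standalone A-D it sees, so it should
# only be run on the model's completion, never on text that still contains the
# "A."/"B."/... answer choices from the prompt.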

def get_choice_letter(index):
    """Converts a numerical choice index (0-3) to a capital letter (A-D)."""
    if 0 <= index <= 3:
        return chr(ord('A') + index)
    return None  # Return None for invalid indices
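# Index-to-letter mapping used throughout:
#   get_choice_letter(0) -> "A", ..., get_choice_letter(3) -> "D";
#   anything outside 0-3 yields None.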

def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
    """
    Evaluates a given model generator on a specific subject from a specified dataset.

    Args:
        generator: The Hugging Face pipeline for text generation.
        dataset_id (str): The ID of the dataset (e.g., "cais/mmlu", "cais/mmlu_pro").
        subject (str): The specific subject/config name within the dataset.
        sample_count (int): The maximum number of samples to evaluate.
        progress (gr.Progress): Gradio progress tracker.

    Returns:
        tuple: (accuracy, list_of_detailed_results)

    Raises:
        Exception: If dataset loading fails.
    """
    gr.Info(f"Loading dataset: {dataset_id} - {subject}...")
    try:
        # Load the "test" split of the dataset
        dataset = load_dataset(dataset_id, subject, token=HF_TOKEN)["test"]
    except Exception as e:
        # Re-raise so the outer try/except in run_evaluation can surface the error in the UI
        raise RuntimeError(f"Failed to load dataset '{dataset_id}' for subject '{subject}'. Error: {e}")

    # Limit the number of samples and shuffle for consistent evaluation across runs
    num_samples_to_evaluate = min(sample_count, len(dataset))
    dataset = dataset.shuffle(seed=42).select(range(num_samples_to_evaluate))

    correct_count = 0
    subject_results = []

    # Iterate through the selected samples with a progress bar
    for item in progress.tqdm(dataset, desc=f"Processing {subject} samples"):
        prompt, answer_idx = format_prompt(item)
        expected_letter = get_choice_letter(answer_idx)

        # Generate only 1 new token for the answer (A, B, C, D).
        # do_sample=False ensures deterministic output for a given prompt (greedy decoding).
        output_raw = generator(prompt, max_new_tokens=1, do_sample=False)[0]["generated_text"]
        # The pipeline returns the prompt followed by the new text, so keep only the
        # completion to avoid matching letters or keywords from the question itself.
        completion = output_raw[len(prompt):]

        # Flag output that looks like free-form reasoning rather than a bare letter
        is_reasoning_model_output = '<' in completion or re.search(r"\b(because|therefore|thus|reasoning)\b", completion, re.IGNORECASE) is not None

        # Extract the predicted letter from the model's completion
        predicted_letter = extract_choice_letter(completion)
        is_correct = (predicted_letter == expected_letter)
        correct_count += int(is_correct)

        # Store detailed results for logging and display
        subject_results.append({
            "question": item['question'],
            "choices": item['choices'],
            "model_raw_output": completion.strip(),
            "expected_answer_letter": expected_letter,
            "predicted_answer_letter": predicted_letter,
            "is_correct": is_correct,
            "is_reasoning_model_output": is_reasoning_model_output  # Store the flag
        })

    # Calculate accuracy for the current subject
    accuracy = (correct_count / len(dataset)) * 100 if len(dataset) > 0 else 0
    return accuracy, subject_results

# The @spaces.GPU decorator requests a ZeroGPU slot so this function (and the model
# loading/generation it triggers) runs on GPU when one is available.
@spaces.GPU
def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=gr.Progress()):
    """
    Main function to orchestrate the evaluation process.
    Handles single subject or 'ALL' subjects evaluation for MMLU/MMLU-Pro.
    Returns gr.update objects to control UI component visibility and content.
    """
    gr.Info("Starting evaluation...")
    if not model_id:
        gr.Warning("Please enter a Hugging Face Model ID before running the evaluation.")
        # Return updates to hide logs/debug and show empty results
        return "", gr.update(value="", visible=False), gr.update(visible=False), \
               gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)

    # Parse the selected benchmark and subject from the dropdown string
    parts = selected_benchmark_subject.split(" - ")
    if len(parts) != 2:
        gr.Warning("Invalid benchmark selection format. Please select an entry from the dropdown.")
        return "", gr.update(value="", visible=False), gr.update(visible=False), \
               gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)

    benchmark_name = parts[0]
    subject_name = parts[1]
    dataset_id_map = {
        "MMLU": MMLU_DATASET,
        "MMLU-Pro": MMLU_PRO_DATASET
    }
    current_dataset_id = dataset_id_map.get(benchmark_name)
    if not current_dataset_id:
        gr.Warning(f"Unknown benchmark selected: {benchmark_name}. This should not happen.")
        return "", gr.update(value="", visible=False), gr.update(visible=False), \
               gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)

    try:
        generator = load_model(model_id)  # Raises an exception on failure

        all_evaluation_results = []
        total_correct_overall = 0
        total_samples_overall = 0
        eval_summary_lines = []

        if subject_name == "ALL":
            # Copy the subject list so the global ALL_BENCHMARK_SUBJECTS is not mutated
            subjects_to_evaluate = [s for s in ALL_BENCHMARK_SUBJECTS.get(current_dataset_id, []) if s != "ALL"]
            if not subjects_to_evaluate:
                gr.Warning(f"No subjects found to evaluate for '{benchmark_name}'.")
                return "", gr.update(value="", visible=False), gr.update(visible=False), \
                       gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)

            for i, sub in enumerate(progress.tqdm(subjects_to_evaluate, desc=f"Evaluating ALL {benchmark_name} subjects")):
                gr.Info(f"Evaluating {benchmark_name} - {sub} ({i+1}/{len(subjects_to_evaluate)})...")
                try:
                    accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, sub, sample_count, progress)
                    all_evaluation_results.extend(subject_details)
                    num_evaluated_samples = len(subject_details)
                    num_correct_in_subject = sum(d['is_correct'] for d in subject_details)
                    total_correct_overall += num_correct_in_subject
                    total_samples_overall += num_evaluated_samples
                    eval_summary_lines.append(f"- {benchmark_name} - {sub}: {accuracy:.2f}% ({num_correct_in_subject}/{num_evaluated_samples} samples)")
                except Exception as e:
                    gr.Warning(f"Skipping {benchmark_name} - {sub} due to an error: {e}")
                    eval_summary_lines.append(f"- {benchmark_name} - {sub}: Error during evaluation.")
                    continue

            overall_accuracy = (total_correct_overall / total_samples_overall) * 100 if total_samples_overall > 0 else 0
            score_string = f"Overall Average Accuracy for {benchmark_name}: {overall_accuracy:.2f}% across {total_samples_overall} total samples.\n\n"
            score_string += "Detailed breakdown:\n" + "\n".join(eval_summary_lines)
        else:
            accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, subject_name, sample_count, progress)
            all_evaluation_results.extend(subject_details)
            overall_accuracy = accuracy
            num_evaluated_samples = len(subject_details)
            score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% on {num_evaluated_samples} samples."

        # Format detailed results for display in the text box
        formatted_details = "\n\n".join([
            (
                f"### Question:\n{item['question']}\n\n"
                + f"**Choices:**\n" + "\n".join([f"{get_choice_letter(i)}. {c}" for i, c in enumerate(item['choices'])]) + "\n\n"
                + (f"**Note:** Reasoning-style output is not fully supported for single-letter extraction; the raw model output is shown below.\n" if item.get('is_reasoning_model_output') else "")
                + f"**Model Raw Output:** {item['model_raw_output']}\n"
                + f"**Expected Answer:** {item['expected_answer_letter']}\n"
                + f"**Predicted Answer:** {item['predicted_answer_letter']}\n"
                + f"**Correct:** {'Yes' if item['is_correct'] else 'No'}"
            )
            for item in all_evaluation_results
        ])

        # Record the evaluation result to a JSONL file for the leaderboard
        record = {
            "model_id": model_id,
            "benchmark": benchmark_name,
            "subject": subject_name,
            "accuracy": overall_accuracy,
            "sample_count": total_samples_overall if subject_name == "ALL" else len(all_evaluation_results),
            "timestamp": pd.Timestamp.now().isoformat()
        }
        with open("eval.jsonl", "a") as f:
            f.write(json.dumps(record) + "\n")

        gr.Info("Evaluation completed successfully!")
        return score_string, \
               gr.update(value="", visible=False), gr.update(visible=False), \
               gr.update(visible=True), gr.update(visible=True), gr.update(value=formatted_details, visible=False)

    except Exception:
        detailed_error_traceback = traceback.format_exc()
        gr.Warning("An error occurred during evaluation.")
        # Return updates for the error state: show the debug panel, hide the detail controls
        return "An error occurred during evaluation. Check the debug information below; if the problem persists, please open a discussion in the Community tab.", \
               gr.update(value=detailed_error_traceback, visible=True), gr.update(visible=True), \
               gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
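# The six return values of run_evaluation() map positionally onto the outputs list
# wired up in run_button.click() further down:
#   1. acc_output            - accuracy summary text
#   2. error_message_output  - traceback text (shown only on failure)
#   3. debug_error_column    - visibility of the debug panel
#   4. show_details_button   - visibility of the "Show Detailed Logs" button
#   5. download_button       - visibility of the download button
#   6. detail_output         - per-question log text (revealed via the toggle button)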

def save_text(text_content):
    """Saves the provided text content to a file and returns the file path for download."""
    if not text_content:
        gr.Warning("No evaluation results to download.")
        return None
    file_path = "evaluation_results.txt"
    try:
        with open(file_path, "w") as f:
            f.write(text_content)
        return file_path
    except Exception as e:
        gr.Warning(f"Error saving file: {e}")
        return None
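# Illustrative sketch of a single eval.jsonl line as written by run_evaluation()
# (the model ID and numbers below are placeholders, not real results):
#
#   {"model_id": "org/some-model", "benchmark": "MMLU", "subject": "ALL",
#    "accuracy": 0.0, "sample_count": 0, "timestamp": "1970-01-01T00:00:00"}
#
# load_leaderboard() below averages the "accuracy" field per model, preferring
# records where subject == "ALL" when they exist.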

def load_leaderboard():
    """
    Loads evaluation data from 'eval.jsonl', computes average accuracy per model for MMLU and MMLU-Pro,
    and prepares data for two separate leaderboard tables.
    """
    empty_board = pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"])
    try:
        df = pd.read_json("eval.jsonl", lines=True)
        # Ensure 'accuracy' is numeric, coerce errors to NaN and drop them
        df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
        df = df.dropna(subset=['accuracy'])
        if df.empty:
            gr.Warning("No valid evaluation data found to populate the leaderboard.")
            return empty_board, empty_board.copy()

        # Filter for MMLU data.
        # Prefer "ALL"-subject runs (overall accuracy); if a model only has
        # per-subject runs, fall back to averaging those. A weighted average
        # would be more precise, but this keeps the leaderboard simple.
        df_mmlu = df[df['benchmark'] == 'MMLU']
        if 'subject' in df_mmlu.columns:
            df_mmlu_grouped = df_mmlu[df_mmlu['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
            if df_mmlu_grouped.empty:
                df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
        else:  # Handle older eval.jsonl files without a 'subject' column
            df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
        df_mmlu_grouped.columns = ["Model ID", "Average Accuracy (%)"]
        df_mmlu_sorted = df_mmlu_grouped.sort_values(by="Average Accuracy (%)", ascending=False)

        # Filter for MMLU-Pro data, using the same preference for "ALL" runs
        df_mmlu_pro = df[df['benchmark'] == 'MMLU-Pro']
        if 'subject' in df_mmlu_pro.columns:
            df_mmlu_pro_grouped = df_mmlu_pro[df_mmlu_pro['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
            if df_mmlu_pro_grouped.empty:
                df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
        else:  # Handle older eval.jsonl files
            df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
        df_mmlu_pro_grouped.columns = ["Model ID", "Average Accuracy (%)"]
        df_mmlu_pro_sorted = df_mmlu_pro_grouped.sort_values(by="Average Accuracy (%)", ascending=False)

        # Return one DataFrame per leaderboard table
        return df_mmlu_sorted, df_mmlu_pro_sorted

    except FileNotFoundError:
        gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!")
        return empty_board, empty_board.copy()
    except Exception as e:
        gr.Warning(f"Error loading leaderboard: {e}")
        traceback.print_exc()  # Print full traceback for debugging
        return empty_board, empty_board.copy()

# --- Gradio Interface Definition ---
with gr.Blocks(css="""
/* Import Google Font - Inter */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

/* General body and container styling */
body {
    font-family: 'Inter', sans-serif;
    background-color: #eef2f6; /* Lighter background */
    margin: 0;
    padding: 20px;
}
.gradio-container {
    max-width: 1200px;
    margin: 20px auto;
    padding: 40px; /* Increased padding */
    box-shadow: 0 10px 25px rgba(0,0,0,0.1); /* Softer, larger shadow */
    border-radius: 15px; /* More rounded corners */
    background-color: #ffffff;
    border: 1px solid #e0e6ed; /* Subtle border */
}

/* Headings */
h1 {
    color: #1a202c; /* Darker, more professional heading color */
    text-align: center;
    margin-bottom: 30px;
    font-size: 2.8em; /* Slightly larger H1 */
    font-weight: 700;
    letter-spacing: -0.03em;
    text-shadow: 1px 1px 2px rgba(0,0,0,0.05); /* Subtle text shadow */
}
h3 {
    color: #2d3748;
    font-size: 1.3em; /* Slightly larger H3 */
    margin-bottom: 15px;
    font-weight: 600;
}

/* Markdown text */
.markdown-text {
    text-align: center;
    color: #4a5568;
    line-height: 1.7;
    font-size: 1.05em;
    margin-bottom: 30px;
}
.markdown-text div {
    font-size: 1.1em;
    max-width: 800px; /* Constrain width for readability */
    margin: 0 auto;
}

/* Buttons */
.gr-button {
    background-color: #2f80ed; /* A vibrant, professional blue */
    color: white;
    border: none;
    padding: 14px 30px; /* More padding */
    border-radius: 10px; /* More rounded */
    cursor: pointer;
    transition: background-color 0.3s ease, transform 0.2s ease, box-shadow 0.2s ease;
    font-size: 1.15em; /* Slightly larger font */
    font-weight: 600;
    box-shadow: 0 5px 15px rgba(0, 123, 255, 0.2); /* Enhanced shadow for primary button */
    margin: 5px; /* Add some margin for spacing between buttons */
}
.gr-button:hover {
    background-color: #1a6dcd; /* Darker blue on hover */
    transform: translateY(-3px); /* More pronounced lift effect */
    box-shadow: 0 8px 20px rgba(0, 123, 255, 0.3);
}
.gr-button:active {
    transform: translateY(0);
    box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}

/* Specific button styling for debug/show details */
#debug-button, #show-details-button {
    background-color: #718096; /* Professional grey */
    box-shadow: 0 3px 10px rgba(113, 128, 150, 0.2);
}
#debug-button:hover, #show-details-button:hover {
    background-color: #5d6d81;
    box-shadow: 0 5px 12px rgba(113, 128, 150, 0.3);
}
#download-button {
    background-color: #38a169; /* Muted green for download */
    box-shadow: 0 3px 10px rgba(56, 161, 105, 0.2);
}
#download-button:hover {
    background-color: #277e50;
    box-shadow: 0 5px 12px rgba(56, 161, 105, 0.3);
}

/* Input/Output Boxes (Containers) */
.gr-box {
    border: 1px solid #cbd5e0; /* Lighter, subtle border */
    border-radius: 12px;
    padding: 25px; /* Increased padding */
    margin-bottom: 25px;
    background-color: #f8fafc; /* Very light background */
    box-shadow: inset 0 2px 5px rgba(0,0,0,0.03); /* Subtle inner shadow */
}

/* Specific text output boxes (the content inside the containers) */
.gr-output-text {
    white-space: pre-wrap;
    word-wrap: break-word;
    background-color: #ffffff; /* White background for readability */
    border: 1px solid #e2e8f0;
    border-radius: 8px;
    padding: 18px; /* More padding */
    min-height: 120px; /* Ensure a minimum height */
    box-shadow: 0 2px 8px rgba(0,0,0,0.05); /* Small shadow for depth */
    color: #2d3748; /* Darker text for readability */
    font-size: 0.95em;
    line-height: 1.6;
}

/* Specific error output style */
#error-message-output {
    background-color: #ffe0e6; /* Light red */
    border-color: #ff99aa; /* Slightly darker red border */
    color: #c53030; /* Stronger red text */
    font-weight: 500;
    padding: 20px;
}

/* Labels for inputs */
.gr-textbox label, .gr-dropdown label, .gr-slider label {
    font-weight: 600;
    color: #2d3748; /* Darker label text */
    margin-bottom: 10px;
    display: block;
    font-size: 1.05em; /* Slightly larger label font */
}

/* Tabs styling */
.gr-tabs-nav button {
    font-weight: 600;
    font-size: 1.1em;
    padding: 12px 25px; /* More padding for tabs */
    border-top-left-radius: 10px;
    border-top-right-radius: 10px;
    background-color: #ebf4f8; /* Light blueish tab background */
    color: #4a5568;
    border: 1px solid #cce0eb; /* Subtle border for tabs */
    border-bottom: none;
    transition: background-color 0.3s ease, color 0.3s ease;
}
.gr-tabs-nav button.selected {
    background-color: #ffffff; /* White for selected tab */
    color: #2f80ed; /* Blue for selected text */
    border-color: #2f80ed;
    border-bottom: 1px solid #ffffff; /* Hide bottom border to merge with content */
}

/* Leaderboard specific table styling (general for all leaderboard tables) */
.leaderboard-table {
    border-radius: 12px;
    box-shadow: 0 4px 15px rgba(0,0,0,0.08);
    overflow: hidden;
    margin-bottom: 25px; /* Space between tables */
}
.leaderboard-table table {
    border-collapse: separate;
    border-spacing: 0;
    width: 100%;
    background-color: #ffffff;
}
.leaderboard-table thead th {
    background-color: #edf2f7; /* Light grey header */
    color: #2d3748;
    font-weight: 700;
    padding: 15px 20px;
    text-align: left;
    border-bottom: 2px solid #e2e8f0;
}
.leaderboard-table tbody tr {
    transition: background-color 0.2s ease;
}
.leaderboard-table tbody tr:nth-child(odd) {
    background-color: #f7fafc; /* Zebra striping */
}
.leaderboard-table tbody tr:hover {
    background-color: #e6fffa; /* Light teal on hover for rows */
}
.leaderboard-table tbody td {
    padding: 12px 20px;
    border-bottom: 1px solid #ebf4f8;
    color: #4a5568;
}
.leaderboard-table tbody tr:first-child td {
    border-top-left-radius: 12px;
    border-top-right-radius: 12px;
}
.leaderboard-table tbody tr:last-child td {
    border-bottom: none;
    border-bottom-left-radius: 12px;
    border-bottom-right-radius: 12px;
}

/* Horizontal line for separation */
hr {
    border: none;
    border-top: 1px solid #e2e8f0;
    margin: 30px 0;
}
""") as demo:
    gr.Markdown("""
    # 🤗 LLM Benchmark Evaluator
    """)

    with gr.Tabs():
        with gr.TabItem("🚀 Run Evaluation"):
            gr.Markdown("""
            <div class="markdown-text">
            Enter your Hugging Face Model ID, choose a benchmark (MMLU or MMLU-Pro),
            select a subject (or 'ALL' for a comprehensive evaluation),
            and specify the number of samples per subject.
            Ensure your Hugging Face token is set as an environment variable for private models.
            </div>
            """)
            with gr.Column(elem_classes="gr-box"):
                model_id_input = gr.Textbox(
                    label="Your Hugging Face Model ID",
                    placeholder="e.g., mistralai/Mistral-7B-Instruct-v0.2",
                    interactive=True
                )
                with gr.Row():
                    benchmark_subject_dropdown = gr.Dropdown(
                        label="Choose Benchmark and Subject",
                        choices=GRADIO_DROPDOWN_OPTIONS,
                        value="MMLU - ALL",  # Default to MMLU ALL for initial load
                        interactive=True,
                        min_width=400  # Ensure sufficient width
                    )
                    sample_count_slider = gr.Slider(
                        label="Number of Samples per Subject (1-100)",
                        minimum=1,
                        maximum=100,
                        value=10,  # Default to 10 samples
                        step=1,
                        interactive=True,
                        min_width=200
                    )
                run_button = gr.Button("🚀 Run Evaluation", elem_classes="gr-button")

            gr.Markdown("<hr>")  # Visual separator

            with gr.Column(elem_classes="gr-box"):
                acc_output = gr.Textbox(
                    label="Benchmark Accuracy Results",
                    interactive=False,
                    elem_classes="gr-output-text",
                    lines=5,
                    placeholder="Evaluation results will appear here."
                )
                # Container for debug info, initially hidden
                with gr.Column(visible=False, elem_id="debug-error-column") as debug_error_column:
                    error_message_output = gr.Textbox(
                        label="Debug Information (Error Details)",
                        lines=10, interactive=False, elem_classes="gr-output-text", elem_id="error-message-output",
                        placeholder="Error details will appear here if an error occurs."
                    )
                    debug_button = gr.Button("🐞 Hide Debug Info", visible=True, elem_id="debug-button", elem_classes="gr-button")
                with gr.Row():
                    show_details_button = gr.Button("📜 Show Detailed Logs", visible=False, elem_id="show-details-button", elem_classes="gr-button")
                    download_button = gr.Button("📥 Download Full Evaluation Logs", visible=False, elem_id="download-button", elem_classes="gr-button")
                # Detailed output, initially hidden
                detail_output = gr.Textbox(
                    label="Detailed Evaluation Logs",
                    lines=20,
                    interactive=False,
                    elem_classes="gr-output-text",
                    placeholder="Detailed logs for each question will appear here upon successful evaluation.",
                    visible=False  # Initially hidden
                )

            # Define button click actions
            run_button.click(
                run_evaluation,
                inputs=[model_id_input, benchmark_subject_dropdown, sample_count_slider],
                outputs=[
                    acc_output,
                    error_message_output, debug_error_column,  # For error state
                    show_details_button, download_button, detail_output  # For success state
                ]
            )

            # Toggle the detailed-logs panel. A gr.State tracks visibility because
            # layout and output components cannot report their own visibility when
            # passed as event inputs; one handler updates the panel, the button
            # label, and the state together.
            details_visible = gr.State(False)
            show_details_button.click(
                lambda visible: (
                    gr.update(visible=not visible),
                    "📜 Hide Detailed Logs" if not visible else "📜 Show Detailed Logs",
                    not visible
                ),
                inputs=[details_visible],
                outputs=[detail_output, show_details_button, details_visible]
            )

            # Toggle the debug information column in the same way
            debug_visible = gr.State(True)
            debug_button.click(
                lambda visible: (
                    gr.update(visible=not visible),
                    "🐞 Show Debug Info" if visible else "🐞 Hide Debug Info",
                    not visible
                ),
                inputs=[debug_visible],
                outputs=[debug_error_column, debug_button, debug_visible]
            )

            download_button.click(
                save_text,
                inputs=[detail_output],
                outputs=gr.File(label="Download Evaluation Results", file_count="single", type="filepath")
            )

        with gr.TabItem("🏆 Leaderboard"):
            gr.Markdown("""
            <div class="markdown-text">
            Explore the performance of various LLMs on the MMLU and MMLU-Pro benchmarks.
            This leaderboard is updated automatically with each new evaluation.
            </div>
            """)
            # MMLU Leaderboard Table
            gr.Markdown("### MMLU Top Models")
            mmlu_leaderboard_table = gr.Dataframe(
                headers=["Model ID", "Average Accuracy (%)"],
                interactive=False,
                datatype=["str", "number"],
                row_count=10,
                col_count=2,
                label="MMLU Leaderboard Data",
                elem_classes="leaderboard-table"  # Apply custom class for styling
            )
            gr.Markdown("### MMLU-Pro Top Models")
            mmlu_pro_leaderboard_table = gr.Dataframe(
                headers=["Model ID", "Average Accuracy (%)"],
                interactive=False,
                datatype=["str", "number"],
                row_count=10,
                col_count=2,
                label="MMLU-Pro Leaderboard Data",
                elem_classes="leaderboard-table"  # Apply custom class for styling
            )

    # Populate both leaderboard tables when the app loads
    demo.load(load_leaderboard, inputs=[], outputs=[mmlu_leaderboard_table, mmlu_pro_leaderboard_table])

# Launch the Gradio app
demo.launch()