"""Gradio app that benchmarks Hugging Face causal LMs on MMLU / MMLU-Pro.

The module exposes an evaluation tab (load a model, run it over one subject
or over every subject of a benchmark) and a leaderboard tab that is fed from
a JSON-lines file written after each successful evaluation.
"""

import json
import os
import re
import string
import traceback

import gradio as gr
import pandas as pd
import spaces
import torch
from datasets import get_dataset_config_names, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# --- Environment and Caching ---
# Make sure the cache directory exists before anything tries to write to it.
CACHE_DIR = "evaluation_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
EVAL_FILE = os.path.join(CACHE_DIR, "eval.jsonl")

# Caches to avoid reloading models and dataset configs.
model_cache = {}
benchmark_subject_cache = {}

# Use an environment variable for the Hugging Face token.
HF_TOKEN = os.environ.get("HF_TOKEN")

# --- Constants for Benchmarks ---
MMLU_DATASET = "cais/mmlu"
MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"
BENCHMARK_MAP = {
    "MMLU": MMLU_DATASET,
    "MMLU-Pro": MMLU_PRO_DATASET,
}

# Config names that are not individual test subjects.
# NOTE(review): cais/mmlu is expected to publish an aggregate "all" config and
# a train-only "auxiliary_train" config; running them next to the per-subject
# configs would double count questions or fail on the missing test split.
# Confirm against the dataset card.
NON_SUBJECT_CONFIGS = frozenset({"all", "auxiliary_train"})

# Column layouts shared by the functions below and the UI tables.
DETAIL_COLUMNS = ["Question", "Correct", "Expected", "Predicted", "Raw Output"]
LEADERBOARD_COLUMNS = ["Model ID", "Avg. Accuracy (%)", "Total Samples"]


# --- Data Loading and Preparation ---
def get_all_benchmark_options():
    """Fetch and cache the available subjects (configs) of each benchmark.

    Returns:
        The module-level ``benchmark_subject_cache`` dict mapping benchmark
        name to ``["ALL", subject, ...]``. A benchmark whose configs could not
        be fetched maps to an empty list.
    """
    if benchmark_subject_cache:
        return benchmark_subject_cache

    print("Fetching benchmark configurations for the first time...")
    for key, dataset_id in BENCHMARK_MAP.items():
        try:
            # Fetching configurations requires authentication if the dataset is private.
            subjects = get_dataset_config_names(dataset_id, token=HF_TOKEN)
            subjects = [s for s in subjects if s not in NON_SUBJECT_CONFIGS]
            benchmark_subject_cache[key] = ["ALL"] + subjects
        except Exception as e:  # best effort: the app should still start
            print(
                f"Warning: Could not load configs for {key} ({dataset_id}). "
                f"It might be private or unavailable. Error: {e}"
            )
            benchmark_subject_cache[key] = []
    print("Benchmark configurations cached.")
    return benchmark_subject_cache


# Initialize the cache on startup.
ALL_BENCHMARK_SUBJECTS = get_all_benchmark_options()


@spaces.GPU()
def load_model(model_id):
    """Load a model and tokenizer and wrap them in a text-generation pipeline.

    Loaded pipelines are stored in ``model_cache`` keyed by ``model_id``.

    Args:
        model_id: Hugging Face model identifier.

    Returns:
        A ``transformers`` text-generation pipeline.

    Raises:
        ValueError: If ``model_id`` is empty.
        RuntimeError: If the model or tokenizer cannot be loaded.
    """
    if not model_id or not model_id.strip():
        raise ValueError("Model ID cannot be empty.")
    model_id = model_id.strip()

    gr.Info(f"Attempting to load model: {model_id}...")
    if model_id in model_cache:
        gr.Info(f"Model '{model_id}' found in cache.")
        return model_cache[model_id]

    try:
        use_cuda = torch.cuda.is_available()
        # Use bfloat16 when the GPU supports it, otherwise fall back to float32.
        dtype = torch.bfloat16 if use_cuda and torch.cuda.is_bf16_supported() else torch.float32

        # SECURITY NOTE(review): trust_remote_code=True executes code shipped
        # with the model repository, and model_id is user supplied. Confirm
        # this is acceptable for the deployment.
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=HF_TOKEN,
            torch_dtype=dtype,
            trust_remote_code=True,
        ).to("cuda" if use_cuda else "cpu")

        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=0 if use_cuda else -1,
        )
    except Exception as e:
        # Re-raise with a message the main evaluation function can display.
        raise RuntimeError(
            f"Failed to load model '{model_id}'. Please verify the model ID and your "
            f"Hugging Face token (if required). Error: {e}"
        ) from e

    model_cache[model_id] = generator
    gr.Info(f"Model '{model_id}' loaded successfully.")
    return generator


# --- Evaluation Logic ---
def _get_choices(item):
    """Return the answer options of a benchmark row as a list."""
    # NOTE(review): MMLU rows use "choices"; MMLU-Pro rows are assumed to use
    # "options" - confirm against the dataset cards.
    choices = item.get("choices")
    if choices is None:
        choices = item.get("options")
    if not choices:
        raise ValueError("Benchmark item has neither 'choices' nor 'options'.")
    return list(choices)


def _get_answer_index(item):
    """Return the correct answer of a benchmark row as a 0-based index."""
    # NOTE(review): MMLU-Pro is assumed to store the numeric answer in
    # "answer_index" and a letter in "answer"; MMLU stores an int in "answer".
    answer = item.get("answer_index")
    if answer is None:
        answer = item["answer"]
    if isinstance(answer, str):
        answer = answer.strip()
        if answer.isdigit():
            return int(answer)
        if len(answer) == 1 and answer.upper() in string.ascii_uppercase:
            return string.ascii_uppercase.index(answer.upper())
    return answer


def format_prompt(item):
    """Format a question and its choices into a standardized prompt.

    Args:
        item: Mapping with a ``question`` entry, the options under ``choices``
            or ``options``, and the answer under ``answer_index`` or ``answer``.

    Returns:
        ``(prompt, answer_index)`` where ``prompt`` ends with ``"Answer:"``.
    """
    option_lines = [
        f"{letter}. {choice}"
        for letter, choice in zip(string.ascii_uppercase, _get_choices(item))
    ]
    prompt = (
        f"Question: {item['question']}\n\nChoices:\n"
        + "\n".join(option_lines)
        + "\n\nAnswer:"
    )
    return prompt, _get_answer_index(item)


def get_choice_letter(index, num_choices=4):
    """Convert a 0-based choice index to its letter (0 -> 'A').

    Args:
        index: Choice index.
        num_choices: Number of valid choices; defaults to 4 (A-D).

    Returns:
        The letter, or ``None`` if ``index`` is not a valid choice index.
    """
    try:
        index = int(index)
    except (TypeError, ValueError):
        return None
    limit = min(num_choices, len(string.ascii_uppercase))
    return string.ascii_uppercase[index] if 0 <= index < limit else None


def extract_predicted_letter(output_text, valid_letters="ABCD"):
    """Extract the predicted letter that follows 'Answer:' in ``output_text``.

    The letter must not be followed by another letter, so that a word such as
    "Both" is not read as the answer "B".

    Args:
        output_text: Text containing an ``Answer:`` marker.
        valid_letters: Letters accepted as an answer; defaults to ``"ABCD"``.

    Returns:
        The upper-cased letter, or ``None`` if no answer was found.
    """
    if not output_text or not valid_letters:
        return None
    pattern = rf"Answer:\s*\(?([{re.escape(valid_letters)}])(?![A-Za-z])"
    match = re.search(pattern, output_text, re.IGNORECASE)
    return match.group(1).upper() if match else None


def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
    """Evaluate a model on one subject of a benchmark dataset.

    Args:
        generator: Text-generation pipeline returned by ``load_model``.
        dataset_id: Hugging Face dataset identifier.
        subject: Dataset config name to evaluate.
        sample_count: Maximum number of questions to evaluate.
        progress: ``gr.Progress`` instance used to display a progress bar.

    Returns:
        ``(accuracy_percent, details)`` where ``details`` is a list with one
        dict per evaluated question.

    Raises:
        RuntimeError: If the dataset cannot be loaded.
    """
    gr.Info(f"Loading dataset: {dataset_id} ({subject})...")
    try:
        # The 'test' split is the one used for evaluation here.
        dataset = load_dataset(dataset_id, subject, token=HF_TOKEN, split="test")
    except Exception as e:
        raise RuntimeError(
            f"Failed to load dataset '{dataset_id}' for subject '{subject}'. Error: {e}"
        ) from e

    # The slider value may arrive as a float; range() needs an int.
    num_samples = min(int(sample_count), len(dataset))
    # Fixed seed so the same subset is drawn on every run.
    dataset = dataset.shuffle(seed=42).select(range(num_samples))

    correct_predictions = 0
    results_details = []
    for item in progress.tqdm(dataset, desc=f"Evaluating {subject}"):
        prompt, correct_answer_idx = format_prompt(item)
        choices = _get_choices(item)
        valid_letters = string.ascii_uppercase[: len(choices)]
        expected_letter = get_choice_letter(correct_answer_idx, len(choices))

        # Greedy decoding keeps runs reproducible. return_full_text=False
        # yields only the completion, so text inside the question can never be
        # mistaken for the model's answer.
        raw_output = generator(
            prompt,
            max_new_tokens=5,
            do_sample=False,
            return_full_text=False,
        )[0]["generated_text"]
        predicted_letter = extract_predicted_letter(f"Answer:{raw_output}", valid_letters)

        is_correct = predicted_letter is not None and predicted_letter == expected_letter
        if is_correct:
            correct_predictions += 1
        results_details.append({
            "question": item["question"],
            "choices": choices,
            "raw_output": raw_output.strip(),
            "expected_letter": expected_letter,
            "predicted_letter": predicted_letter,
            "is_correct": is_correct,
        })

    accuracy = (correct_predictions / num_samples) * 100 if num_samples > 0 else 0
    return accuracy, results_details


def _details_to_frame(details):
    """Build the detailed-results table from per-question result dicts."""
    rows = [
        [
            d["question"],
            d["is_correct"],
            d["expected_letter"],
            d["predicted_letter"] or "",
            d["raw_output"],
        ]
        for d in details
    ]
    return pd.DataFrame(rows, columns=DETAIL_COLUMNS)


def _result_updates(summary="", error_message="", error_details="", details=None):
    """Build the four UI updates (summary, error, traceback, details table)."""
    frame = _details_to_frame(details or [])
    return (
        gr.update(value=summary, visible=bool(summary)),
        gr.update(value=error_message, visible=bool(error_message)),
        gr.update(value=error_details, visible=bool(error_details)),
        gr.update(value=frame, visible=not frame.empty),
    )


def _append_eval_record(record):
    """Append one evaluation record to the leaderboard JSON-lines file."""
    with open(EVAL_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(record) + "\n")


@spaces.GPU()
def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress(track_tqdm=True)):
    """Run an evaluation for one subject or for 'ALL' subjects of a benchmark.

    Args:
        model_id: Hugging Face model identifier.
        benchmark_category: Key of ``BENCHMARK_MAP``.
        subject_name: Dataset config name, or ``"ALL"``.
        sample_count: Maximum number of questions per subject.
        progress: Gradio progress tracker.

    Returns:
        A 4-tuple of ``gr.update`` objects for the result summary, the error
        message, the error traceback and the detailed results table. Errors
        are reported through these updates rather than raised.
    """
    try:
        gr.Info("Starting evaluation...")
        dataset_id = BENCHMARK_MAP.get(benchmark_category)
        if not dataset_id:
            raise ValueError(f"Invalid benchmark category: {benchmark_category}")
        generator = load_model(model_id)

        if subject_name == "ALL":
            # Build a new list: the cached one also feeds the subject dropdown
            # and must keep its "ALL" entry.
            subjects_to_run = [
                s for s in ALL_BENCHMARK_SUBJECTS.get(benchmark_category, []) if s != "ALL"
            ]
        else:
            subjects_to_run = [subject_name] if subject_name else []

        if not subjects_to_run:
            gr.Warning(f"No subjects found for '{benchmark_category}'.")
            return _result_updates()

        all_results_details = []
        summary_lines = []
        total_correct = 0
        total_samples = 0
        for i, subject in enumerate(subjects_to_run):
            gr.Info(f"Evaluating {benchmark_category} - {subject} ({i+1}/{len(subjects_to_run)})...")
            try:
                accuracy, subject_details = evaluate_single_subject(
                    generator, dataset_id, subject, sample_count, progress
                )
            except Exception as e:  # one failing subject must not abort the run
                # gr.Error only has an effect when raised; a warning toast is
                # the non-fatal notification.
                gr.Warning(f"Skipping {subject} due to an error: {e}")
                summary_lines.append(f"- **{subject}**: Evaluation failed.")
                continue

            all_results_details.extend(subject_details)
            num_correct = sum(d["is_correct"] for d in subject_details)
            num_evaluated = len(subject_details)
            total_correct += num_correct
            total_samples += num_evaluated
            summary_lines.append(f"- **{subject}**: {accuracy:.2f}% ({num_correct}/{num_evaluated})")

        if total_samples == 0:
            # Nothing was evaluated: report it instead of writing a bogus 0%
            # record to the leaderboard file.
            raise RuntimeError("No samples were evaluated; every subject failed or was empty.")

        overall_accuracy = (total_correct / total_samples) * 100

        # --- Prepare Outputs ---
        if subject_name == "ALL":
            result_summary = f"### Overall Average Accuracy for {benchmark_category}: {overall_accuracy:.2f}%\n"
            result_summary += "across {:,} total samples.\n\n---\n\n**Breakdown by Subject:**\n".format(total_samples)
            result_summary += "\n".join(summary_lines)
        else:
            result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
            result_summary += "({:,}/{:,} correct)".format(total_correct, total_samples)

        # Save results for the leaderboard.
        _append_eval_record({
            "model_id": model_id,
            "benchmark": benchmark_category,
            "accuracy": overall_accuracy,
            "subject": subject_name,
            "sample_count": total_samples,
            "timestamp": pd.Timestamp.now().isoformat(),
        })

        gr.Info("Evaluation completed successfully!")
        return _result_updates(summary=result_summary, details=all_results_details)
    except Exception as e:
        # Top-level boundary: surface the failure in the error panes.
        return _result_updates(
            error_message=f"An unexpected error occurred: {e}",
            error_details=traceback.format_exc(),
        )


# --- UI Helper Functions ---
def _default_subject(choices):
    """Pick the default dropdown value: 'ALL' if offered, else the first choice."""
    if "ALL" in choices:
        return "ALL"
    return choices[0] if choices else None


def update_subject_dropdown(benchmark_category):
    """Update the subject dropdown choices for the selected benchmark."""
    choices = ALL_BENCHMARK_SUBJECTS.get(benchmark_category, [])
    return gr.update(choices=choices, value=_default_subject(choices))


def load_leaderboard(benchmark_filter):
    """Load the leaderboard table for one benchmark.

    Only evaluations run on 'ALL' subjects are listed; for each model the
    most recent such evaluation is shown, ordered by accuracy (best first).

    Args:
        benchmark_filter: Benchmark name as stored in the evaluation records.

    Returns:
        A DataFrame with the columns in ``LEADERBOARD_COLUMNS``; empty when
        there is nothing to show or the file cannot be read.
    """
    empty = pd.DataFrame(columns=LEADERBOARD_COLUMNS)
    try:
        if not os.path.exists(EVAL_FILE):
            return empty
        df = pd.read_json(EVAL_FILE, lines=True)
        if df.empty:
            return empty

        # Coerce accuracy to numeric and drop invalid entries.
        df["accuracy"] = pd.to_numeric(df["accuracy"], errors="coerce")
        df = df.dropna(subset=["accuracy"])

        selected = df[(df["benchmark"] == benchmark_filter) & (df["subject"] == "ALL")].copy()
        if selected.empty:
            return empty

        # Keep the latest evaluation of each model.
        selected["timestamp"] = pd.to_datetime(selected["timestamp"])
        latest = selected.loc[selected.groupby("model_id")["timestamp"].idxmax()]

        # Sort on the numeric value *before* formatting; sorting the formatted
        # strings would order "9.50" above "85.00".
        latest = latest.sort_values(by="accuracy", ascending=False)
        board = latest[["model_id", "accuracy", "sample_count"]].copy()
        board.columns = LEADERBOARD_COLUMNS
        board["Avg. Accuracy (%)"] = board["Avg. Accuracy (%)"].map("{:.2f}".format)
        return board.reset_index(drop=True)
    except Exception as e:
        gr.Warning(f"Error loading leaderboard: {e}")
        traceback.print_exc()
        return empty


# --- Gradio Interface Definition ---
CUSTOM_CSS = """
/* --- Global & Layout --- */
body { font-family: 'Inter', sans-serif; background-color: #f8f9fa; }
.gradio-container { max-width: 1280px !important; margin: auto; }
.gr-box { border-radius: 12px !important; box-shadow: 0 4px 12px rgba(0,0,0,0.05) !important; border: 1px solid #e9ecef !important; }

/* --- Typography --- */
h1 { text-align: center; font-size: 2.5rem !important; font-weight: 700; color: #212529; margin-bottom: 0.5rem; letter-spacing: -1px; }
.subtitle { text-align: center; color: #6c757d; font-size: 1.1rem; margin-bottom: 2.5rem; }

/* --- Buttons & Inputs --- */
.gr-button { border-radius: 8px !important; font-weight: 600 !important; padding: 10px 20px !important; transition: all 0.2s ease; }
.gr-button-primary { box-shadow: 0 4px 10px rgba(0, 123, 255, 0.2); }
.gr-button-primary:hover { transform: translateY(-2px); box-shadow: 0 6px 15px rgba(0, 123, 255, 0.3); }

/* --- Custom Radio Buttons (Segmented Control) --- */
#leaderboard-toggle, #eval-benchmark-selection { background-color: #e9ecef; padding: 5px; border-radius: 10px; display: inline-flex; margin: auto; }
#leaderboard-toggle div.gr-form, #eval-benchmark-selection div.gr-form { display: flex; gap: 5px; }
#leaderboard-toggle input[type='radio'], #eval-benchmark-selection input[type='radio'] { display: none; }
#leaderboard-toggle label, #eval-benchmark-selection label { padding: 8px 16px; border-radius: 8px; cursor: pointer; transition: background-color 0.3s, color 0.3s, box-shadow 0.3s; font-weight: 500; color: #495057; background: transparent; border: none; box-shadow: none; }
#leaderboard-toggle input[type='radio']:checked + label, #eval-benchmark-selection input[type='radio']:checked + label { background-color: white; color: #007bff; font-weight: 600; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }

/* --- Dataframe / Table Styling --- */
.leaderboard-table .gr-dataframe table { border-collapse: collapse; width: 100%; }
.leaderboard-table .gr-dataframe thead th { background-color: #f8f9fa !important; color: #495057 !important; font-weight: 600 !important; text-align: left; padding: 12px 15px; border-bottom: 2px solid #dee2e6; }
.leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) { background-color: #f8f9fa; }
.leaderboard-table .gr-dataframe tbody tr:hover { background-color: #e9ecef; }
.leaderboard-table .gr-dataframe tbody td { padding: 12px 15px; border-bottom: 1px solid #dee2e6; }

/* --- Error & Result Panes --- */
#error-display-box { background-color: #fff3f3; border-color: #ffc9c9; }
#error-display-box .gr-label { color: #d9480f !important; font-weight: 600; }
#result-summary-box { background-color: #f3f9ff; border-color: #cde4ff; }
"""

with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
    # NOTE(review): the markup of the two header blocks was lost in the source;
    # a Markdown heading and the "subtitle" CSS class are used to match the
    # h1 / .subtitle rules above.
    gr.Markdown("# 🤖 Open LLM Evaluator")
    gr.Markdown(
        "Benchmark leading models on MMLU and MMLU-Pro. Your results contribute to a live leaderboard.",
        elem_classes="subtitle",
    )

    with gr.Tabs():
        # --- Evaluation Tab ---
        with gr.TabItem("🚀 Run Evaluation"):
            with gr.Row():
                with gr.Column(scale=2):
                    # gr.Group is used for the panels instead of gr.Box.
                    # NOTE(review): gr.Box is understood to be unavailable in
                    # newer Gradio releases - confirm for the pinned version.
                    with gr.Group():
                        gr.Markdown("### 1. Configure Evaluation")
                        model_id_input = gr.Textbox(
                            label="Hugging Face Model ID",
                            placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
                            interactive=True,
                        )
                        with gr.Row():
                            benchmark_selection_radio = gr.Radio(
                                ["MMLU", "MMLU-Pro"],
                                label="Benchmark",
                                value="MMLU",
                                interactive=True,
                                elem_id="eval-benchmark-selection",
                                container=False,
                            )
                        with gr.Row():
                            _initial_subjects = ALL_BENCHMARK_SUBJECTS.get("MMLU", [])
                            benchmark_subject_dropdown = gr.Dropdown(
                                label="Subject",
                                choices=_initial_subjects,
                                # The config fetch may have failed, leaving no "ALL" entry.
                                value=_default_subject(_initial_subjects),
                                interactive=True,
                            )
                            sample_count_slider = gr.Slider(
                                label="Samples per Subject",
                                minimum=5,
                                maximum=100,
                                value=25,
                                step=5,
                                interactive=True,
                            )
                        run_button = gr.Button("Start Evaluation", variant="primary", scale=1)

                with gr.Column(scale=3):
                    gr.Markdown("### 2. View Results")

                    # Visibility is toggled on the components themselves, because
                    # run_evaluation returns one update per component.
                    with gr.Group(elem_id="result-summary-box"):
                        result_summary_output = gr.Markdown(visible=False)

                    with gr.Group(elem_id="error-display-box"):
                        error_output = gr.Textbox(
                            label="Error Message", interactive=False, visible=False
                        )
                        error_details_output = gr.Textbox(
                            label="Error Details (Traceback)",
                            interactive=False,
                            lines=8,
                            visible=False,
                        )

                    with gr.Group():
                        gr.Markdown("#### Detailed Evaluation Log")
                        detailed_results_df = gr.Dataframe(
                            headers=DETAIL_COLUMNS,
                            datatype=["str", "bool", "str", "str", "str"],
                            interactive=False,
                            row_count=10,
                            col_count=5,
                            visible=False,
                        )

        # --- Leaderboard Tab ---
        with gr.TabItem("📊 Leaderboard"):
            with gr.Column():
                gr.Markdown("", elem_id="leaderboard-toggle-container")
                leaderboard_type_toggle = gr.Radio(
                    ["MMLU", "MMLU-Pro"],
                    label="Select Benchmark",
                    value="MMLU",
                    interactive=True,
                    elem_id="leaderboard-toggle",
                    container=False,
                )
                leaderboard_table_output = gr.Dataframe(
                    headers=LEADERBOARD_COLUMNS,
                    interactive=False,
                    datatype=["str", "str", "number"],
                    row_count=15,
                    elem_classes="leaderboard-table",
                )

    # --- Event Handlers & Logic ---
    # Update the subject dropdown when the benchmark changes.
    benchmark_selection_radio.change(
        fn=update_subject_dropdown,
        inputs=[benchmark_selection_radio],
        outputs=[benchmark_subject_dropdown],
    )

    # Main evaluation trigger; the leaderboard is refreshed once it finishes
    # so the newly written record is included.
    run_button.click(
        fn=run_evaluation,
        inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider],
        outputs=[result_summary_output, error_output, error_details_output, detailed_results_df],
    ).then(
        fn=load_leaderboard,
        inputs=[leaderboard_type_toggle],
        outputs=[leaderboard_table_output],
    )

    # Leaderboard loading logic.
    demo.load(
        fn=load_leaderboard,
        inputs=[leaderboard_type_toggle],
        outputs=[leaderboard_table_output],
    )
    leaderboard_type_toggle.change(
        fn=load_leaderboard,
        inputs=[leaderboard_type_toggle],
        outputs=[leaderboard_table_output],
    )

# Launch the Gradio app.
if __name__ == "__main__":
    demo.launch(debug=True)