import os import gradio as gr from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline from datasets import load_dataset, get_dataset_config_names import torch import re import json import pandas as pd import matplotlib.pyplot as plt import traceback # Import traceback for detailed error logging import spaces # Import the spaces library # Cache to avoid reloading the model model_cache = {} HF_TOKEN = os.environ.get("HF_TOKEN") # --- Constants for Benchmarks --- MMLU_DATASET = "cais/mmlu" MMLU_PRO_DATASET = "cais/mmlu_pro" def get_all_benchmark_options(): """ Dynamically fetches all available subjects for MMLU and MMLU-Pro. Returns a dictionary mapping benchmark dataset IDs to their subjects, and a flattened list suitable for a Gradio dropdown. """ all_options = {} gr_dropdown_options = [] # This is for initial display only, not used for dynamic updates directly # Get subjects for MMLU try: mmlu_subjects = get_dataset_config_names(MMLU_DATASET, token=HF_TOKEN) all_options[MMLU_DATASET] = ["ALL"] + mmlu_subjects except Exception as e: print(f"Warning: Could not load MMLU dataset configs. Error: {e}") all_options[MMLU_DATASET] = [] # Get subjects for MMLU-Pro try: mmlu_pro_subjects = get_dataset_config_names(MMLU_PRO_DATASET, token=HF_TOKEN) all_options[MMLU_PRO_DATASET] = ["ALL"] + mmlu_pro_subjects except Exception as e: print(f"Warning: Could not load MMLU-Pro dataset configs. It might not be accessible or available. Error: {e}") all_options[MMLU_PRO_DATASET] = [] # Flattened list for the initial state of the subject dropdown (e.g., MMLU subjects) if MMLU_DATASET in all_options: gr_dropdown_options.extend(all_options[MMLU_DATASET]) return all_options, gr_dropdown_options # Initialize these once globally when the app starts ALL_BENCHMARK_SUBJECTS, INITIAL_GRADIO_DROPDOWN_OPTIONS = get_all_benchmark_options() @spaces.GPU() # Decorator to ensure this function runs on GPU if available def load_model(model_id): """ Loads a Hugging Face model and its tokenizer, then creates a text-generation pipeline. Uses a cache to avoid re-loading if the model is already in memory. Provides Gradio Info/Error messages for user feedback. Raises an exception if model loading fails. """ gr.Info(f"Attempting to load model: {model_id}...") if model_id in model_cache: gr.Info(f"Model '{model_id}' already loaded from cache.") return model_cache[model_id] try: # Load tokenizer and model, using bfloat16 if CUDA is available for efficiency tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( model_id, token=HF_TOKEN, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, trust_remote_code=True ).to("cuda" if torch.cuda.is_available() else "cpu") # Create a text-generation pipeline generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1) # Cache the loaded generator model_cache[model_id] = generator gr.Info(f"Model '{model_id}' loaded successfully.") return generator except Exception as e: # Re-raise the exception to be caught by the outer run_evaluation try-except raise ValueError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token. Error: {e}") def format_prompt(item): """ Formats a single MMLU/MMLU-Pro question item into a clear prompt for the LLM. The prompt is designed for the model to output a single letter answer (A, B, C, D). """ prompt = f"""{item['question']} A. {item['choices'][0]} B. {item['choices'][1]} C. {item['choices'][2]} D. {item['choices'][3]} Answer:""" return prompt, item['answer'] # Returns the prompt string and the correct choice index (0-3) def extract_choice_letter(output): """ Extracts the most likely choice letter (A, B, C, D) from the model's generated output. It prioritizes an exact match after "Answer:", then looks for any single capital letter. """ # Look for "Answer: X" pattern first (e.g., "Answer: A" or "Answer: B") match = re.search(r"Answer:\s*([ABCD])", output, re.IGNORECASE) if match: return match.group(1).upper() # Ensure it's uppercase # Fallback: look for a single capital letter A-D anywhere in the output match = re.search(r"\b([ABCD])\b", output.strip()) if match: return match.group(1) return None # Return None if no valid choice letter is found def get_choice_letter(index): """Converts a numerical choice index (0-3) to a capital letter (A-D).""" if 0 <= index <= 3: return chr(ord('A') + index) return None # Return None for invalid indices def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress): """ Evaluates a given model generator on a specific subject from a specified dataset. Args: generator: The Hugging Face pipeline for text generation. dataset_id (str): The ID of the dataset (e.g., "cais/mmlu", "cais/mmlu_pro"). subject (str): The specific subject/config name within the dataset. sample_count (int): The maximum number of samples to evaluate. progress (gr.Progress): Gradio progress tracker. Returns: tuple: (accuracy, list_of_detailed_results) Raises: Exception: If dataset loading fails. """ gr.Info(f"Loading dataset: {dataset_id} - {subject}...") try: # Load the "test" split of the dataset dataset = load_dataset(dataset_id, subject, token=HF_TOKEN)["test"] except Exception as e: # Re-raise the exception to be caught by the outer run_evaluation try-except raise RuntimeError(f"Failed to load dataset '{dataset_id}' for subject '{subject}'. Error: {e}") # Limit the number of samples and shuffle for consistent evaluation across runs num_samples_to_evaluate = min(sample_count, len(dataset)) dataset = dataset.shuffle(seed=42).select(range(num_samples_to_evaluate)) correct_count = 0 subject_results = [] # Iterate through the selected samples with a progress bar for i, item in enumerate(progress.tqdm(dataset, desc=f"Processing {subject} samples")): prompt, answer_idx = format_prompt(item) expected_letter = get_choice_letter(answer_idx) # Generate only 1 new token for the answer (A, B, C, D) # do_sample=False ensures deterministic output for a given prompt (greedy decoding) output_raw = generator(prompt, max_new_tokens=1, do_sample=False)[0]["generated_text"] # Check for potential reasoning model output is_reasoning_model_output = '<' in output_raw or re.search(r"\b(because|therefore|thus|reasoning)\b", output_raw, re.IGNORECASE) is not None # Extract the predicted letter from the model's raw output predicted_letter = extract_choice_letter(output_raw) is_correct = (predicted_letter == expected_letter) correct_count += is_correct # Store detailed results for logging and display subject_results.append({ "question": item['question'], "choices": item['choices'], "model_raw_output": output_raw.strip(), "expected_answer_letter": expected_letter, "predicted_answer_letter": predicted_letter, "is_correct": is_correct, "is_reasoning_model_output": is_reasoning_model_output # Store the flag }) # Calculate accuracy for the current subject accuracy = (correct_count / len(dataset)) * 100 if len(dataset) > 0 else 0 return accuracy, subject_results @spaces.GPU() # Decorator to ensure this function runs on GPU if available def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress()): """ Main function to orchestrate the evaluation process. Handles single subject or 'ALL' subjects evaluation for MMLU/MMLU-Pro. Returns Gradio.update objects to control UI component visibility and content. """ gr.Info("Starting evaluation...") if not model_id: gr.Warning("Please enter a Hugging Face Model ID before running the evaluation.") # Return updates to hide logs/debug and show empty results return "", gr.update(value="", visible=False), gr.update(visible=False), \ gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False) dataset_id_map = { "MMLU": MMLU_DATASET, "MMLU-Pro": MMLU_PRO_DATASET } current_dataset_id = dataset_id_map.get(benchmark_category) if not current_dataset_id: gr.Error(f"Unknown benchmark category selected: {benchmark_category}. This should not happen.") return "", gr.update(value="", visible=False), gr.update(visible=False), \ gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False) try: generator = load_model(model_id) # This function will raise an exception on failure all_evaluation_results = [] total_correct_overall = 0 total_samples_overall = 0 eval_summary_lines = [] if subject_name == "ALL": subjects_to_evaluate = ALL_BENCHMARK_SUBJECTS.get(current_dataset_id, []) if "ALL" in subjects_to_evaluate: subjects_to_evaluate.remove("ALL") if not subjects_to_evaluate: gr.Warning(f"No subjects found to evaluate for '{benchmark_category}'.") return "", gr.update(value="", visible=False), gr.update(visible=False), \ gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False) for i, sub in enumerate(progress.tqdm(subjects_to_evaluate, desc=f"Evaluating ALL {benchmark_category} subjects")): gr.Info(f"Evaluating {benchmark_category} - {sub} ({i+1}/{len(subjects_to_evaluate)})...") try: accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, sub, sample_count, progress) all_evaluation_results.extend(subject_details) num_evaluated_samples = len(subject_details) num_correct_in_subject = sum(d['is_correct'] for d in subject_details) total_correct_overall += num_correct_in_subject total_samples_overall += num_evaluated_samples eval_summary_lines.append(f"- {benchmark_category} - {sub}: {accuracy:.2f}% ({num_correct_in_subject}/{num_evaluated_samples} samples)") except Exception as e: gr.Error(f"Skipping {benchmark_category} - {sub} due to an error: {e}") eval_summary_lines.append(f"- {benchmark_category} - {sub}: Error during evaluation.") continue overall_accuracy = (total_correct_overall / total_samples_overall) * 100 if total_samples_overall > 0 else 0 score_string = f"Overall Average Accuracy for {benchmark_category}: {overall_accuracy:.2f}% across {total_samples_overall} total samples.\n\n" score_string += "Detailed breakdown:\n" + "\n".join(eval_summary_lines) else: accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, subject_name, sample_count, progress) all_evaluation_results.extend(subject_details) overall_accuracy = accuracy num_evaluated_samples = len(subject_details) score_string = f"Accuracy for {benchmark_category} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples." # Format detailed results for display in the text box formatted_details = "\n\n".join([ ( f"### Question:\n{item['question']}\n\n" + f"**Choices:**\n" + "\n".join([f"{get_choice_letter(i)}. {c}" for i, c in enumerate(item['choices'])]) + "\n\n" + (f"**Note:** Reasoning models are currently not fully supported for single-letter extraction. The original model output followed:\n" if item.get('is_reasoning_model_output') else "") + f"**Model Raw Output:** {item['model_raw_output']}\n" + f"**Expected Answer:** {item['expected_answer_letter']}\n" + f"**Predicted Answer:** {item['predicted_answer_letter']}\n" + f"**Correct:** {'Yes' if item['is_correct'] else 'No'}" ) for item in all_evaluation_results ]) # Record the evaluation result to a JSONL file for the leaderboard record = { "model_id": model_id, "benchmark": benchmark_category, "subject": subject_name, "accuracy": overall_accuracy, "sample_count": total_samples_overall if subject_name == "ALL" else len(all_evaluation_results), "timestamp": pd.Timestamp.now().isoformat() } with open("eval.jsonl", "a") as f: f.write(json.dumps(record) + "\n") gr.Info("Evaluation completed successfully!") return score_string, \ gr.update(value="", visible=False), gr.update(visible=False), \ gr.update(visible=True), gr.update(visible=True), gr.update(value=formatted_details, visible=False) except Exception as e: error_message = str(e) detailed_error_traceback = traceback.format_exc() gr.Error("An error occurred during evaluation.") # Return updates for error state return "Error occurred during evaluation. We'll evaluate for you! If this persists, please open a community support tab for assistance.", \ gr.update(value=detailed_error_traceback, visible=True), gr.update(visible=True), \ gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False) def save_text(text_content): """Saves the provided text content to a file and returns the file path for download.""" if not text_content: gr.Warning("No evaluation results to download.") return None file_path = "evaluation_results.txt" try: with open(file_path, "w") as f: f.write(text_content) return file_path except Exception as e: gr.Error(f"Error saving file: {e}") return None def load_leaderboard(benchmark_filter): """ Loads evaluation data from 'eval.jsonl', computes average accuracy per model for the selected benchmark, and prepares data for the leaderboard table. """ try: df = pd.read_json("eval.jsonl", lines=True) # Ensure 'accuracy' is numeric, coerce errors to NaN and drop them df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce') df = df.dropna(subset=['accuracy']) if df.empty: gr.Warning("No valid evaluation data found to populate the leaderboard.") return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records') # Filter data based on the selected benchmark df_filtered = df[df['benchmark'] == benchmark_filter] if df_filtered.empty: gr.Warning(f"No evaluation data for {benchmark_filter} found yet.") return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records') # For the leaderboard, we typically want the average across all subjects within that benchmark. # So we group by model_id and take the mean of accuracy. df_grouped = df_filtered.groupby("model_id")["accuracy"].mean().reset_index() df_grouped.columns = ["Model ID", "Average Accuracy (%)"] df_sorted = df_grouped.sort_values(by="Average Accuracy (%)", ascending=False) return df_sorted.to_dict('records') except FileNotFoundError: gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!") return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records') except Exception as e: gr.Error(f"Error loading leaderboard: {e}") traceback.print_exc() # Print full traceback for debugging return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records') def update_subject_dropdown_choices(benchmark_category): """ Updates the choices for the subject dropdown based on the selected benchmark category. """ dataset_id_map = { "MMLU": MMLU_DATASET, "MMLU-Pro": MMLU_PRO_DATASET } selected_dataset_id = dataset_id_map.get(benchmark_category) if selected_dataset_id and selected_dataset_id in ALL_BENCHMARK_SUBJECTS: new_choices = ALL_BENCHMARK_SUBJECTS[selected_dataset_id] # Set default value to "ALL" if available, otherwise the first subject default_value = "ALL" if "ALL" in new_choices else (new_choices[0] if new_choices else None) return gr.update(choices=new_choices, value=default_value) else: return gr.update(choices=[], value=None) # --- Gradio Interface Definition --- with gr.Blocks(css=""" /* Import Google Font - Inter */ @import url('https://fonts.com/css2?family=Inter:wght@300;400;500;600;700&display=swap'); /* General body and container styling */ body { font-family: 'Inter', sans-serif; background-color: #eef2f6; /* Lighter background */ margin: 0; padding: 20px; } .gradio-container { max-width: 1200px; margin: 20px auto; padding: 40px; /* Increased padding */ box-shadow: 0 10px 25px rgba(0,0,0,0.1); /* Softer, larger shadow */ border-radius: 15px; /* More rounded corners */ background-color: #ffffff; border: 1px solid #e0e6ed; /* Subtle border */ } /* Headings */ h1 { color: #1a202c; /* Darker, more professional heading color */ text-align: center; margin-bottom: 30px; font-size: 2.8em; /* Slightly larger H1 */ font-weight: 700; letter-spacing: -0.03em; text-shadow: 1px 1px 2px rgba(0,0,0,0.05); /* Subtle text shadow */ } h3 { color: #2d3748; font-size: 1.3em; /* Slightly larger H3 */ margin-bottom: 15px; font-weight: 600; } /* Markdown text */ .markdown-text { text-align: center; color: #4a5568; line-height: 1.7; font-size: 1.05em; margin-bottom: 30px; } .markdown-text div { font-size: 1.1em; max-width: 800px; /* Constrain width for readability */ margin: 0 auto; } /* Buttons */ .gr-button { background-color: #2f80ed; /* A vibrant, professional blue */ color: white; border: none; padding: 14px 30px; /* More padding */ border-radius: 10px; /* More rounded */ cursor: pointer; transition: background-color 0.3s ease, transform 0.2s ease, box-shadow 0.2s ease; font-size: 1.15em; /* Slightly larger font */ font-weight: 600; box-shadow: 0 5px 15px rgba(0, 123, 255, 0.2); /* Enhanced shadow for primary button */ margin: 5px; /* Add some margin for spacing between buttons */ } .gr-button:hover { background-color: #1a6dcd; /* Darker blue on hover */ transform: translateY(-3px); /* More pronounced lift effect */ box-shadow: 0 8px 20px rgba(0, 123, 255, 0.3); } .gr-button:active { transform: translateY(0); box-shadow: 0 2px 5px rgba(0,0,0,0.1); } /* Specific button styling for debug/show details */ #debug-button, #show-details-button { background-color: #718096; /* Professional grey */ box-shadow: 0 3px 10px rgba(113, 128, 150, 0.2); } #debug-button:hover, #show-details-button:hover { background-color: #5d6d81; box-shadow: 0 5px 12px rgba(113, 128, 150, 0.3); } #download-button { background-color: #38a169; /* Muted green for download */ box-shadow: 0 3px 10px rgba(56, 161, 105, 0.2); } #download-button:hover { background-color: #277e50; box-shadow: 0 5px 12px rgba(56, 161, 105, 0.3); } /* Input/Output Boxes (Containers) */ .gr-box { border: 1px solid #cbd5e0; /* Lighter, subtle border */ border-radius: 12px; padding: 25px; /* Increased padding */ margin-bottom: 25px; background-color: #f8fafc; /* Very light background */ box-shadow: inset 0 2px 5px rgba(0,0,0,0.03); /* Subtle inner shadow */ } /* Specific text output boxes (the content inside the containers) */ .gr-output-text { white-space: pre-wrap; word-wrap: break-word; background-color: #ffffff; /* White background for readability */ border: 1px solid #e2e8f0; border-radius: 8px; padding: 18px; /* More padding */ min-height: 120px; /* Ensure a minimum height */ box-shadow: 0 2px 8px rgba(0,0,0,0.05); /* Small shadow for depth */ color: #2d3748; /* Darker text for readability */ font-size: 0.95em; line-height: 1.6; } /* Specific error output style */ #error-message-output { background-color: #ffe0e6; /* Light red */ border-color: #ff99aa; /* Slightly darker red border */ color: #c53030; /* Stronger red text */ font-weight: 500; padding: 20px; } /* Labels for inputs */ .gr-textbox label, .gr-dropdown label, .gr-slider label { font-weight: 600; color: #2d3748; /* Darker label text */ margin-bottom: 10px; display: block; font-size: 1.05em; /* Slightly larger label font */ } /* Tabs styling */ .gr-tabs-nav button { font-weight: 600; font-size: 1.1em; padding: 12px 25px; /* More padding for tabs */ border-top-left-radius: 10px; border-top-right-radius: 10px; background-color: #ebf4f8; /* Light blueish tab background */ color: #4a5568; border: 1px solid #cce0eb; /* Subtle border for tabs */ border-bottom: none; transition: background-color 0.3s ease, color 0.3s ease; } .gr-tabs-nav button.selected { background-color: #ffffff; /* White for selected tab */ color: #2f80ed; /* Blue for selected text */ border-color: #2f80ed; border-bottom: 1px solid #ffffff; /* Hide bottom border to merge with content */ } /* Leaderboard specific table styling (general for all leaderboard tables) */ .leaderboard-table { border-radius: 12px; box-shadow: 0 4px 15px rgba(0,0,0,0.08); overflow: hidden; margin-bottom: 25px; /* Space between tables */ } .leaderboard-table table { border-collapse: separate; border-spacing: 0; width: 100%; background-color: #ffffff; } .leaderboard-table thead th { background-color: #edf2f7; /* Light grey header */ color: #2d3748; font-weight: 700; padding: 15px 20px; text-align: left; border-bottom: 2px solid #e2e8f0; } .leaderboard-table tbody tr { transition: background-color 0.2s ease; } .leaderboard-table tbody tr:nth-child(odd) { background-color: #f7fafc; /* Zebra striping */ } .leaderboard-table tbody tr:hover { background-color: #e6fffa; /* Light teal on hover for rows */ } .leaderboard-table tbody td { padding: 12px 20px; border-bottom: 1px solid #ebf4f8; color: #4a5568; } .leaderboard-table tbody tr:last-child td { border-bottom: none; } .leaderboard-table tbody tr:first-child td { border-top-left-radius: 12px; border-top-right-radius: 12px; } .leaderboard-table tbody tr:last-child td { border-bottom-left-radius: 12px; border-bottom-right-radius: 12px; } /* Radio button group for leaderboard */ #leaderboard-toggle.gr-form { display: flex; justify-content: center; padding: 0px 0px 20px 0px; /* Reduced padding for more compact look */ } #leaderboard-toggle label.gr-radio-label { font-size: 1.1em; font-weight: 600; color: #2d3748; padding: 10px 20px; border-radius: 8px; background-color: #edf2f7; /* Light background for unselected */ border: 1px solid #e2e8f0; cursor: pointer; transition: all 0.3s ease; margin: 0 5px; /* Spacing between radio buttons */ } #leaderboard-toggle input[type="radio"]:checked + label.gr-radio-label { background-color: #2f80ed; /* Blue for selected */ color: white; border-color: #2f80ed; box-shadow: 0 3px 10px rgba(47, 128, 237, 0.3); } #leaderboard-toggle input[type="radio"]:checked + label.gr-radio-label:hover { background-color: #1a6dcd; /* Darker blue on hover */ } #leaderboard-toggle label.gr-radio-label:hover { background-color: #e2e8f0; /* Lighter grey on hover */ } /* Radio button group for evaluation benchmark selection */ #eval-benchmark-selection { display: flex; justify-content: center; margin-bottom: 20px; /* Space above dropdown */ } #eval-benchmark-selection label.gr-radio-label { font-size: 1.05em; font-weight: 500; color: #4a5568; padding: 8px 15px; border-radius: 6px; background-color: #f0f4f7; border: 1px solid #d9e3ed; cursor: pointer; transition: all 0.3s ease; margin: 0 5px; } #eval-benchmark-selection input[type="radio"]:checked + label.gr-radio-label { background-color: #48bb78; /* A pleasant green for evaluation selection */ color: white; border-color: #48bb78; box-shadow: 0 2px 8px rgba(72, 187, 120, 0.2); } #eval-benchmark-selection input[type="radio"]:checked + label.gr-radio-label:hover { background-color: #38a169; } #eval-benchmark-selection label.gr-radio-label:hover { background-color: #e5edf2; } """) as demo: gr.Markdown(""" # 🤖 LLM Benchmark Evaluator """) with gr.Tabs(): with gr.TabItem("🚀 Run Evaluation"): gr.Markdown("""
Enter your Hugging Face Model ID, choose a benchmark (MMLU or MMLU-Pro), select a subject (or 'ALL' for a comprehensive evaluation), and specify the number of samples per subject. Ensure your Hugging Face token is set as an environment variable for private models.
""") with gr.Column(elem_classes="gr-box"): model_id_input = gr.Textbox( label="Your Hugging Face Model ID", placeholder="e.g., mistralai/Mistral-7B-Instruct-v0.2", interactive=True ) # New Radio button for benchmark selection for evaluation benchmark_selection_radio = gr.Radio( ["MMLU", "MMLU-Pro"], label="Select Benchmark Type", value="MMLU", # Default selection interactive=True, container=False, # Important for custom styling placement elem_id="eval-benchmark-selection" ) with gr.Row(): benchmark_subject_dropdown = gr.Dropdown( label="Choose Subject", # Label changed to be more concise choices=INITIAL_GRADIO_DROPDOWN_OPTIONS, # Initial choices (MMLU subjects) value="ALL", # Default to ALL for MMLU initially interactive=True, min_width=400 ) sample_count_slider = gr.Slider( label="Number of Samples per Subject (1-100)", minimum=1, maximum=100, value=10, step=1, interactive=True, min_width=200 ) run_button = gr.Button("🚀 Run Evaluation", elem_classes="gr-button") gr.Markdown("
") # Visual separator with gr.Column(elem_classes="gr-box"): acc_output = gr.Textbox( label="Benchmark Accuracy Results", interactive=False, elem_classes="gr-output-text", lines=5, placeholder="Evaluation results will appear here." ) # Container for debug info, initially hidden with gr.Column(visible=False, elem_id="debug-error-column") as debug_error_column: error_message_output = gr.Textbox( label="Debug Information (Error Details)", lines=10, interactive=False, elem_classes="gr-output-text", elem_id="error-message-output", placeholder="Error details will appear here if an error occurs." ) debug_button = gr.Button("🐛 Hide Debug Info", visible=True, elem_id="debug-button", elem_classes="gr-button") with gr.Row(): show_details_button = gr.Button("🔍 Show Detailed Logs", visible=False, elem_id="show-details-button", elem_classes="gr-button") download_button = gr.Button("📥 Download Full Evaluation Logs", visible=False, elem_id="download-button", elem_classes="gr-button") # Detailed output, initially hidden detail_output = gr.Textbox( label="Detailed Evaluation Logs", lines=20, interactive=False, elem_classes="gr-output-text", placeholder="Detailed logs for each question will appear here upon successful evaluation.", visible=False # Initially hidden ) # Define button click actions run_button.click( run_evaluation, inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider], # Updated inputs outputs=[ acc_output, error_message_output, debug_error_column, # For error state show_details_button, download_button, detail_output # For success state ] ) # Link benchmark selection radio to subject dropdown benchmark_selection_radio.change( update_subject_dropdown_choices, inputs=[benchmark_selection_radio], outputs=[benchmark_subject_dropdown] ) # Toggle visibility of detail_output show_details_button.click( lambda s: gr.update(visible=not s), # Toggle visibility inputs=[detail_output], # Pass the component itself as input outputs=[detail_output] # The component to update ) # Change button text based on visibility show_details_button.click( lambda s: "🙈 Hide Detailed Logs" if not s else "🔍 Show Detailed Logs", inputs=[detail_output], outputs=[show_details_button] ) # Toggle visibility of debug error column debug_button.click( lambda s: gr.update(visible=not s), # Toggle visibility inputs=[debug_error_column], # Pass the component itself as input outputs=[debug_error_column] # The component to update ) # Change debug button text based on visibility debug_button.click( lambda s: "🐛 Show Debug Info" if not s else "🐛 Hide Debug Info", inputs=[debug_error_column], outputs=[debug_button] ) download_button.click( save_text, inputs=[detail_output], outputs=gr.File(label="Download Evaluation Results", file_count="single", type="filepath") ) with gr.TabItem("📊 Leaderboard"): gr.Markdown("""
Explore the performance of various LLMs on the MMLU and MMLU-Pro benchmarks. This leaderboard is updated automatically with each new evaluation.
""") # Leaderboard Type Toggle leaderboard_type_toggle = gr.Radio( ["MMLU", "MMLU-Pro"], label="Select Benchmark for Leaderboard", value="MMLU", # Default to MMLU interactive=True, container=False, # Make it inline with content elem_id="leaderboard-toggle" ) # Leaderboard Table leaderboard_table_output = gr.Dataframe( headers=["Model ID", "Average Accuracy (%)"], interactive=False, datatype=["str", "number"], row_count=10, col_count=2, label="Benchmark Leaderboard Data", elem_classes="leaderboard-table" # Apply custom class for styling ) # Initial load and dynamic update for the leaderboard demo.load(load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output]) leaderboard_type_toggle.change(load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output]) # Launch the Gradio app demo.launch()