import gradio as gr
import pandas as pd
from huggingface_hub import snapshot_download, create_repo
from huggingface_hub.utils import RepositoryNotFoundError
import os

from src.about import (
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    AutoEvalColumn,
    fields,
)
from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN, OWNER
from src.populate import get_leaderboard_df
from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval


def create_results_dataframe():
    """Create and return the results DataFrame for display."""
    import sys

    sys.stderr.write("\n📊 CREATE_RESULTS_DATAFRAME CALLED\n")
    sys.stderr.flush()

    df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
    sys.stderr.write(f"📋 Retrieved leaderboard df: {df.shape if df is not None else 'None'}\n")
    sys.stderr.flush()

    if df is None or df.empty:
        sys.stderr.write("⚠️ DataFrame is None or empty, returning empty DataFrame\n")
        sys.stderr.flush()
        # Return empty DataFrame with proper columns
        return pd.DataFrame(columns=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"])

    sys.stderr.write(f"📊 Original DataFrame columns: {list(df.columns)}\n")
    sys.stderr.flush()

    # Check if required columns exist
    required_cols = [
        AutoEvalColumn.model.name,
        "Perplexity",
        AutoEvalColumn.model_trace_p_value.name,
        AutoEvalColumn.average.name,
        AutoEvalColumn.model_type.name,
        AutoEvalColumn.precision.name,
    ]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        sys.stderr.write(f"⚠️ Missing columns in DataFrame: {missing_cols}\n")
        sys.stderr.flush()
        # Add missing columns with default values
        for col in missing_cols:
            if col == AutoEvalColumn.model_trace_p_value.name:
                df[col] = None
                sys.stderr.write(f"➕ Added {col} column with None values\n")

    # Select and rename columns for display
    try:
        display_df = df[required_cols].copy()
        sys.stderr.write(f"✅ Selected columns successfully: {list(display_df.columns)}\n")
    except Exception as e:
        sys.stderr.write(f"💥 Error selecting columns: {e}\n")
        sys.stderr.flush()
        return pd.DataFrame(columns=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"])

    # Rename columns for better display
    display_df.columns = ["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"]

    sys.stderr.write(f"🎯 Final display DataFrame shape: {display_df.shape}\n")
    sys.stderr.write(f"🎯 Final columns: {list(display_df.columns)}\n")

    # Check p-value column
    if "Match P-Value" in display_df.columns:
        p_value_stats = display_df["Match P-Value"].describe()
        sys.stderr.write(f"📈 P-Value column stats:\n{p_value_stats}\n")
        sys.stderr.flush()

    return display_df


def run_perplexity_test(model_name, revision, precision):
    """Run perplexity evaluation on demand."""
    import sys
    import traceback
    import gradio as gr

    if not model_name:
        return "Please enter a model name.", gr.update(), gr.update()

    try:
        # Use stderr for more reliable logging in HF Spaces
        sys.stderr.write("\n=== RUNNING PERPLEXITY TEST ===\n")
        sys.stderr.write(f"Model: {model_name}\n")
        sys.stderr.write(f"Revision: {revision}\n")
        sys.stderr.write(f"Precision: {precision}\n")
        sys.stderr.flush()

        success, result = run_dynamic_perplexity_eval(model_name, revision, precision)
        sys.stderr.write(f"Evaluation result - Success: {success}, Result: {result}\n")
        sys.stderr.flush()

        if success:
            sys.stderr.write("Evaluation succeeded - updating both results tables\n")
            sys.stderr.flush()
            # Get updated results (this will trigger model trace p-value computation for the new model)
            sys.stderr.write("🔄 Creating updated results DataFrame (may compute model trace p-values)...\n")
            sys.stderr.flush()

            updated_df = create_results_dataframe()
            sys.stderr.write("✅ Updated DataFrame created successfully\n")
            sys.stderr.flush()

            success_msg = f"""✅ **Perplexity evaluation completed successfully!**

**Model**: {model_name}
**Perplexity Score**: {result:.4f}

🎉 **Results have been saved and both tables have been updated!**

Note: Model trace p-value computation may take additional time and will appear in the logs."""

            return success_msg, gr.update(value=updated_df), gr.update(value=updated_df)
        else:
            return f"❌ **Evaluation failed**: {result}", gr.update(), gr.update()

    except Exception as e:
        error_msg = str(e)
        traceback_str = traceback.format_exc()
        sys.stderr.write(f"Critical error in run_perplexity_test: {error_msg}\n")
        sys.stderr.write(f"Traceback: {traceback_str}\n")
        sys.stderr.flush()
        return f"❌ **Critical error**: {error_msg}", gr.update(), gr.update()


# Initialize results repository and directory
try:
    # Try to download existing repository
    try:
        snapshot_download(
            repo_id=RESULTS_REPO,
            local_dir=EVAL_RESULTS_PATH,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except RepositoryNotFoundError:
        # Create the repository if it doesn't exist
        print(f"Creating new results repository: {RESULTS_REPO}")
        create_repo(
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            private=False,
            token=TOKEN,
        )
        # Create local directory
        os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
except Exception as e:
    print(f"Error initializing results: {e}")
    # Ensure local directory exists even if repo operations fail
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)

# Get initial results data
import sys

sys.stderr.write("\n🚀 STARTING GRADIO APP INITIALIZATION\n")
sys.stderr.write("📊 Creating initial results DataFrame...\n")
sys.stderr.flush()

RESULTS_DF = create_results_dataframe()

sys.stderr.write(f"✅ Initial DataFrame created with shape: {RESULTS_DF.shape}\n")
sys.stderr.write(f"📋 Columns: {list(RESULTS_DF.columns)}\n")
sys.stderr.flush()

# Create the Gradio interface
sys.stderr.write("🎨 Creating Gradio interface...\n")
sys.stderr.flush()

demo = gr.Blocks(css=custom_css)

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Results", elem_id="results-tab", id=0):
            gr.Markdown("## Model Evaluation Results")
            results_table = gr.DataFrame(
                value=RESULTS_DF,
                headers=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"],
                interactive=False,
                wrap=False,
            )

        with gr.TabItem("📝 About", elem_id="about-tab", id=1):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🧪 Test Model", elem_id="test-model-tab", id=2):
            gr.Markdown("## Run Perplexity Test\n\nTest any Hugging Face model for perplexity evaluation.")

            with gr.Row():
                with gr.Column():
                    model_name = gr.Textbox(label="Model name", placeholder="openai-community/gpt2")
                    revision = gr.Textbox(label="Revision", placeholder="main", value="main")
                    precision = gr.Dropdown(
                        choices=["float16", "bfloat16"],
                        label="Precision",
                        value="float16",
                    )
                    debug_mode = gr.Checkbox(label="Enable debug mode (more verbose logging)", value=True)
                with gr.Column():
                    test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
                    result = gr.Markdown()

            gr.Markdown("## Live Results")
            live_results_table = gr.DataFrame(
                value=RESULTS_DF,
                headers=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"],
                interactive=False,
                wrap=False,
            )

            gr.Markdown("""
### Tips:
- **Check stderr logs** in HF Spaces for detailed debugging information
- **Results will update automatically** in the table above after evaluation completes
- **Example models to test**: `openai-community/gpt2`, `EleutherAI/gpt-neo-1.3B`, `openai-community/gpt2-large`
- **Lower perplexity scores = better performance** (better at predicting text)

### How it works:
1. Enter a model name from Hugging Face Hub
2. Click "Run Perplexity Test"
3. Wait for evaluation to complete (may take a few minutes for large models)
4. Results will appear automatically in the table above!
""")

            test_button.click(
                run_perplexity_test,
                [model_name, revision, precision],
                [result, live_results_table, results_table],
            )

sys.stderr.write("🎯 GRADIO INTERFACE SETUP COMPLETE\n")
sys.stderr.write("🚀 LAUNCHING GRADIO APP WITH MODEL TRACING INTEGRATION\n")
sys.stderr.write("📊 Features enabled:\n")
sys.stderr.write(" - Perplexity evaluation\n")
sys.stderr.write(" - Model trace p-value computation (vs GPT-2 base)\n")
sys.stderr.write(" - Match statistic with alignment\n")
sys.stderr.write("🎉 Ready to accept requests!\n")
sys.stderr.flush()

demo.queue(default_concurrency_limit=5).launch()
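
# A minimal local-debugging sketch (an assumption, not part of the Space's runtime path):
# the same evaluator behind the "Test Model" tab can be exercised without the UI,
# provided the src package is importable and any required HF token is set in the environment.
#
#   from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval
#
#   ok, value = run_dynamic_perplexity_eval("openai-community/gpt2", "main", "float16")
#   if ok:
#       print(f"Perplexity: {value:.4f}")
#   else:
#       print(f"Evaluation failed: {value}")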