import os
import sys

import gradio as gr
import pandas as pd
from huggingface_hub import snapshot_download, create_repo
from huggingface_hub.utils import RepositoryNotFoundError

from src.about import (
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    AutoEvalColumn,
    fields,
)
from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN, OWNER
from src.populate import get_leaderboard_df


def create_results_dataframe():
    """Create and return the results DataFrame for display."""
    sys.stderr.write("\n🔍 CREATE_RESULTS_DATAFRAME CALLED\n")
    sys.stderr.flush()

    df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
    sys.stderr.write(f"📊 Retrieved leaderboard df: {df.shape if df is not None else 'None'}\n")
    sys.stderr.flush()

    if df is None or df.empty:
        sys.stderr.write("⚠️ DataFrame is None or empty, returning empty DataFrame\n")
        sys.stderr.flush()
        # Return an empty DataFrame with the expected display columns
        return pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])

    sys.stderr.write(f"📊 Original DataFrame columns: {list(df.columns)}\n")
    sys.stderr.flush()

    # Check that the required columns exist - only p-values matter here
    required_cols = [
        AutoEvalColumn.model.name,
        AutoEvalColumn.model_trace_p_value.name,
        AutoEvalColumn.model_type.name,
        AutoEvalColumn.precision.name,
    ]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        sys.stderr.write(f"⚠️ Missing columns in DataFrame: {missing_cols}\n")
        sys.stderr.flush()
        # Add missing columns with default values
        for col in missing_cols:
            if col == AutoEvalColumn.model_trace_p_value.name:
                df[col] = None
                sys.stderr.write(f"✅ Added {col} column with None values\n")

    # Select and rename columns for display
    try:
        display_df = df[required_cols].copy()
        sys.stderr.write(f"✅ Selected columns successfully: {list(display_df.columns)}\n")
    except Exception as e:
        sys.stderr.write(f"💥 Error selecting columns: {e}\n")
        sys.stderr.flush()
        return pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])

    # Rename columns for better display
    display_df.columns = ["Model", "Match P-Value", "Type", "Precision"]
    sys.stderr.write(f"🎯 Final display DataFrame shape: {display_df.shape}\n")
    sys.stderr.write(f"🎯 Final columns: {list(display_df.columns)}\n")

    # Check the p-value column
    if "Match P-Value" in display_df.columns:
        p_value_stats = display_df["Match P-Value"].describe()
        sys.stderr.write(f"📊 P-Value column stats:\n{p_value_stats}\n")
        sys.stderr.flush()

    return display_df
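
# Quick, hypothetical sanity check (not wired into the app): once the results
# repo has been synced locally, the builder can be exercised directly, e.g.
#
#     df = create_results_dataframe()
#     assert list(df.columns) == ["Model", "Match P-Value", "Type", "Precision"]
#     print(df.head())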

# Perplexity testing removed - we only focus on p-values now.

# Initialize the results repository and local directory
try:
    # Try to download the existing results repository
    try:
        snapshot_download(
            repo_id=RESULTS_REPO,
            local_dir=EVAL_RESULTS_PATH,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except RepositoryNotFoundError:
        # Create the repository if it doesn't exist
        print(f"Creating new results repository: {RESULTS_REPO}")
        create_repo(
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            private=False,
            token=TOKEN,
        )
    # Create the local directory
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
except Exception as e:
    print(f"Error initializing results: {e}")
    # Ensure the local directory exists even if the repo operations fail
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)

# Initialize allowed models
from src.evaluation.initialize_models import initialize_allowed_models

sys.stderr.write("\n🚀 STARTING GRADIO APP INITIALIZATION\n")
sys.stderr.write("📋 Initializing allowed models...\n")
sys.stderr.flush()

initialize_allowed_models()

sys.stderr.write("📊 Creating initial results DataFrame...\n")
sys.stderr.flush()

RESULTS_DF = create_results_dataframe()
sys.stderr.write(f"✅ Initial DataFrame created with shape: {RESULTS_DF.shape}\n")
sys.stderr.write(f"📋 Columns: {list(RESULTS_DF.columns)}\n")
sys.stderr.flush()

# Create the Gradio interface
sys.stderr.write("🎨 Creating Gradio interface...\n")
sys.stderr.flush()

demo = gr.Blocks(css=custom_css)

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 Results", elem_id="results-tab", id=0):
            gr.Markdown("## Model Evaluation Results")
            results_table = gr.DataFrame(
                value=RESULTS_DF,
                headers=["Model", "Match P-Value", "Type", "Precision"],
                interactive=False,
                wrap=False,
            )
with gr.TabItem("π About", elem_id="about-tab", id=1):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.TabItem("π¬ Analysis", elem_id="analysis-tab", id=2):
gr.Markdown("## Model Tracing Analysis\n\nP-values are computed automatically for all supported models.")
gr.Markdown("""
### Current Analysis Status:
- **P-values are computed automatically** using the model tracing pipeline
- **Lower p-values indicate higher structural similarity** to Llama-2-7B
- **Analysis compares neuron organization** across transformer layers
- **Results appear in the main table** once computation is complete
### Supported Models:
- `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
- `ibm-granite/granite-7b-base` - IBM Granite 7B Base
- `EleutherAI/llemma_7b` - LLeMa 7B
### How it works:
1. Models are automatically analyzed against Llama-2-7B base
2. Match statistic with alignment is computed
3. P-values indicate structural similarity preservation
4. Results appear in the main Results tab
""")
sys.stderr.write("π― GRADIO INTERFACE SETUP COMPLETE\n")
sys.stderr.write("π LAUNCHING GRADIO APP WITH MODEL TRACING ANALYSIS\n")
sys.stderr.write("π Features enabled:\n")
sys.stderr.write(" - Model trace p-value computation (vs Llama-2-7B base)\n")
sys.stderr.write(" - Match statistic with alignment\n")
sys.stderr.write(" - Structural similarity analysis\n")
sys.stderr.write("π Ready to display p-values!\n")
sys.stderr.flush()
demo.queue(default_concurrency_limit=5).launch() |