Commit 1dd4b6a · Parent: 86c1853 · committed by Ahmed Ahmed
app.py CHANGED
@@ -22,23 +22,66 @@ from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval

 def create_results_dataframe():
     """Create and return the results DataFrame for display"""
+    import sys
+
+    sys.stderr.write("\n📊 CREATE_RESULTS_DATAFRAME CALLED\n")
+    sys.stderr.flush()
+
     df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+
+    sys.stderr.write(f"📋 Retrieved leaderboard df: {df.shape if df is not None else 'None'}\n")
+    sys.stderr.flush()
+
     if df is None or df.empty:
+        sys.stderr.write("⚠️ DataFrame is None or empty, returning empty DataFrame\n")
+        sys.stderr.flush()
         # Return empty DataFrame with proper columns
-        return pd.DataFrame(columns=["Model", "Perplexity", "Average Score", "Type", "Precision"])
+        return pd.DataFrame(columns=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"])

-    # Select and rename columns for display
-    display_df = df[[
+    sys.stderr.write(f"📊 Original DataFrame columns: {list(df.columns)}\n")
+    sys.stderr.flush()
+
+    # Check if required columns exist
+    required_cols = [
         AutoEvalColumn.model.name,
-        "Perplexity",  # This matches the task column name from Tasks.task0.value.col_name
+        "Perplexity",
+        AutoEvalColumn.model_trace_p_value.name,
         AutoEvalColumn.average.name,
         AutoEvalColumn.model_type.name,
         AutoEvalColumn.precision.name,
-    ]].copy()
+    ]
+
+    missing_cols = [col for col in required_cols if col not in df.columns]
+    if missing_cols:
+        sys.stderr.write(f"⚠️ Missing columns in DataFrame: {missing_cols}\n")
+        sys.stderr.flush()
+        # Add missing columns with default values
+        for col in missing_cols:
+            if col == AutoEvalColumn.model_trace_p_value.name:
+                df[col] = None
+                sys.stderr.write(f"➕ Added {col} column with None values\n")
+
+    # Select and rename columns for display
+    try:
+        display_df = df[required_cols].copy()
+        sys.stderr.write(f"✅ Selected columns successfully: {list(display_df.columns)}\n")
+    except Exception as e:
+        sys.stderr.write(f"💥 Error selecting columns: {e}\n")
+        sys.stderr.flush()
+        return pd.DataFrame(columns=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"])

     # Rename columns for better display
-    display_df.columns = ["Model", "Perplexity", "Average Score", "Type", "Precision"]
+    display_df.columns = ["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"]
+
+    sys.stderr.write(f"🎯 Final display DataFrame shape: {display_df.shape}\n")
+    sys.stderr.write(f"🎯 Final columns: {list(display_df.columns)}\n")
+
+    # Check p-value column
+    if "Match P-Value" in display_df.columns:
+        p_value_stats = display_df["Match P-Value"].describe()
+        sys.stderr.write(f"📈 P-Value column stats:\n{p_value_stats}\n")

+    sys.stderr.flush()
     return display_df

 def run_perplexity_test(model_name, revision, precision):
@@ -66,15 +109,23 @@ def run_perplexity_test(model_name, revision, precision):
         sys.stderr.write("Evaluation succeeded - updating both results tables\n")
         sys.stderr.flush()

-        # Get updated results
+        # Get updated results (this will trigger model trace p-value computation for the new model)
+        sys.stderr.write("🔄 Creating updated results DataFrame (may compute model trace p-values)...\n")
+        sys.stderr.flush()
+
         updated_df = create_results_dataframe()

+        sys.stderr.write("✅ Updated DataFrame created successfully\n")
+        sys.stderr.flush()
+
         success_msg = f"""✅ **Perplexity evaluation completed successfully!**

 **Model**: {model_name}
 **Perplexity Score**: {result:.4f}

-🎉 **Results have been saved and both tables have been updated!**"""
+🎉 **Results have been saved and both tables have been updated!**
+
+Note: Model trace p-value computation may take additional time and will appear in the logs."""

         return success_msg, gr.update(value=updated_df), gr.update(value=updated_df)
     else:
@@ -117,9 +168,21 @@ except Exception as e:
     os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)

 # Get initial results data
+import sys
+sys.stderr.write("\n🚀 STARTING GRADIO APP INITIALIZATION\n")
+sys.stderr.write("📊 Creating initial results DataFrame...\n")
+sys.stderr.flush()
+
 RESULTS_DF = create_results_dataframe()

+sys.stderr.write(f"✅ Initial DataFrame created with shape: {RESULTS_DF.shape}\n")
+sys.stderr.write(f"📋 Columns: {list(RESULTS_DF.columns)}\n")
+sys.stderr.flush()
+
 # Create the Gradio interface
+sys.stderr.write("🎨 Creating Gradio interface...\n")
+sys.stderr.flush()
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -130,7 +193,7 @@ with demo:
         gr.Markdown("## Model Evaluation Results")
         results_table = gr.DataFrame(
             value=RESULTS_DF,
-            headers=["Model", "Perplexity", "Average Score", "Type", "Precision"],
+            headers=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"],
             interactive=False,
             wrap=False
         )
@@ -159,7 +222,7 @@ with demo:
         gr.Markdown("## Live Results")
         live_results_table = gr.DataFrame(
             value=RESULTS_DF,
-            headers=["Model", "Perplexity", "Average Score", "Type", "Precision"],
+            headers=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"],
             interactive=False,
             wrap=False
         )
@@ -184,4 +247,13 @@ with demo:
             [result, live_results_table, results_table]
         )

+sys.stderr.write("🎯 GRADIO INTERFACE SETUP COMPLETE\n")
+sys.stderr.write("🚀 LAUNCHING GRADIO APP WITH MODEL TRACING INTEGRATION\n")
+sys.stderr.write("📊 Features enabled:\n")
+sys.stderr.write(" - Perplexity evaluation\n")
+sys.stderr.write(" - Model trace p-value computation (vs GPT-2 base)\n")
+sys.stderr.write(" - Match statistic with alignment\n")
+sys.stderr.write("🎉 Ready to accept requests!\n")
+sys.stderr.flush()
+
 demo.queue(default_concurrency_limit=5).launch()
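For context on the metric this app reports: `run_perplexity_test` delegates the actual scoring to `run_dynamic_perplexity_eval`, which is not part of this commit. Below is a minimal sketch of how perplexity over a fixed passage can be computed with `transformers`; the model name and passage are placeholders, not the leaderboard's real evaluation code.

```python
# Minimal perplexity sketch (illustrative only; not the repo's run_dynamic_perplexity_eval).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def compute_perplexity(model_name: str, passage: str) -> float:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.eval()

    # Tokenize the fixed test passage and score it with the language-modeling loss.
    inputs = tokenizer(passage, return_tensors="pt")
    with torch.no_grad():
        # With labels == input_ids, the returned loss is the mean token-level
        # negative log-likelihood; perplexity is its exponential.
        loss = model(**inputs, labels=inputs["input_ids"]).loss
    return float(torch.exp(loss))


if __name__ == "__main__":
    text = "Artificial intelligence is the simulation of human intelligence by machines."
    print(f"Perplexity: {compute_perplexity('openai-community/gpt2', text):.4f}")
```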
model-tracing ADDED
@@ -0,0 +1 @@
+Subproject commit 9eb3b67655be2a3576348a6d482e69c62f72fc3e
requirements.txt CHANGED
@@ -15,4 +15,15 @@ transformers>=4.30.0
 tokenizers>=0.15.0
 sentencepiece
 torch>=2.0.0
-accelerate>=0.20.0
+accelerate>=0.20.0
+# Model tracing dependencies
+PyYAML==6.0.1
+scipy==1.13.1
+protobuf==5.27.1
+zstandard==0.22.0
+ipdb==0.13.13
+# Development dependencies for model tracing
+ruff==0.1.8
+pre-commit==3.5.0
+nbqa==1.7.1
+ipykernel==6.29.0
src/about.py CHANGED
@@ -21,17 +21,34 @@ TITLE = """<h1 align="center" id="space-title">Model Perplexity Leaderboard</h1>

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This leaderboard evaluates language models based on their perplexity scores on a fixed test passage.
-Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
+This leaderboard evaluates language models based on their perplexity scores on a fixed test passage and
+structural similarity to GPT-2 using model tracing analysis.
+
+- **Perplexity**: Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
+- **Match P-Value**: Lower p-values indicate the model preserves structural similarity to GPT-2 after fine-tuning (neuron organization is maintained).
 """

 # Which evaluations are you running?
 LLM_BENCHMARKS_TEXT = """
 ## How it works

-The evaluation runs perplexity tests on language models using a fixed test passage about artificial intelligence.
+The evaluation runs two types of analysis on language models:
+
+### 1. Perplexity Evaluation
+Perplexity tests using a fixed test passage about artificial intelligence.
 Perplexity measures how well a model predicts text - lower scores mean better predictions.

+### 2. Model Tracing Analysis
+Compares each model's internal structure to GPT-2 using the "match" statistic with alignment:
+- **Base Model**: GPT-2 (`openai-community/gpt2`)
+- **Comparison**: Each model on the leaderboard
+- **Method**: Neuron matching analysis across transformer layers
+- **Alignment**: Models are aligned before comparison using the Hungarian algorithm
+- **Output**: P-value indicating structural similarity (lower = more similar to GPT-2)
+
+The match statistic tests whether neurons in corresponding layers maintain similar functional roles
+between the base model and fine-tuned variants.
+
 ## Test Text

 The evaluation uses the following passage:
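The "Alignment" bullet above refers to matching hidden units between the two models before the match statistic is computed. The following is a rough sketch of that idea with the Hungarian algorithm (`scipy.optimize.linear_sum_assignment`); the cosine-similarity cost and the per-layer weight matrices are illustrative assumptions, not the `model-tracing` submodule's actual `align_model` implementation.

```python
# Illustrative neuron-alignment sketch (assumed cost function; not tracing.utils.llama.matching.align_model).
import numpy as np
from scipy.optimize import linear_sum_assignment


def align_neurons(w_base: np.ndarray, w_ft: np.ndarray) -> np.ndarray:
    """Return, for each base neuron, the index of the best-matching fine-tuned neuron.

    w_base, w_ft: (n_neurons, n_features) weight rows for one layer of each model.
    """
    # Cosine similarity between every base neuron and every fine-tuned neuron.
    a = w_base / np.linalg.norm(w_base, axis=1, keepdims=True)
    b = w_ft / np.linalg.norm(w_ft, axis=1, keepdims=True)
    similarity = a @ b.T

    # The Hungarian algorithm minimizes cost, so negate similarity to maximize it.
    _, col_ind = linear_sum_assignment(-similarity)
    return col_ind  # col_ind[i] is the fine-tuned neuron matched to base neuron i


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    base = rng.normal(size=(8, 16))
    perm = rng.permutation(8)
    fine_tuned = base[perm] + 0.01 * rng.normal(size=(8, 16))  # shuffled, lightly perturbed copy

    matched = align_neurons(base, fine_tuned)
    # Base neuron i should be matched to the shuffled copy of itself: perm[matched[i]] == i.
    print(np.array_equal(perm[matched], np.arange(8)))  # expected: True
```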
src/display/utils.py CHANGED
@@ -34,6 +34,8 @@ for task in Tasks:
     sys.stderr.write(f"Adding task column: {task.name} -> column name: {task_col_name}\n")
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task_col_name, "number", True)])
     sys.stderr.flush()
+# Model tracing p-value column
+auto_eval_column_dict.append(["model_trace_p_value", ColumnContent, ColumnContent("Match P-Value ⬇️", "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
src/evaluation/model_trace_eval.py ADDED
@@ -0,0 +1,310 @@
+"""
+Model tracing evaluation for computing p-values from neuron matching statistics.
+
+This module runs the model-tracing comparison between a base model (gpt2) and
+fine-tuned models to determine structural similarity via p-value analysis.
+"""
+
+import os
+import sys
+import subprocess
+import tempfile
+import pickle
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# Add model-tracing to path
+model_tracing_path = os.path.join(os.path.dirname(__file__), '../../model-tracing')
+if model_tracing_path not in sys.path:
+    sys.path.append(model_tracing_path)
+
+sys.stderr.write("🔧 ATTEMPTING TO IMPORT MODEL TRACING DEPENDENCIES...\n")
+sys.stderr.flush()
+
+try:
+    sys.stderr.write(" - Importing tracing.utils.llama.model...\n")
+    from tracing.utils.llama.model import permute_model, rotate_model
+
+    sys.stderr.write(" - Importing tracing.utils.llama.matching...\n")
+    from tracing.utils.llama.matching import align_model
+
+    sys.stderr.write(" - Importing tracing.utils.evaluate...\n")
+    from tracing.utils.evaluate import prepare_hf_dataset, prepare_hf_dataloader
+
+    sys.stderr.write(" - Importing tracing.utils.utils...\n")
+    from tracing.utils.utils import manual_seed
+
+    sys.stderr.write(" - Importing tracing.statistics.match...\n")
+    from tracing.statistics.match import statistic as match_stat
+
+    MODEL_TRACING_AVAILABLE = True
+    sys.stderr.write("✅ ALL MODEL TRACING IMPORTS SUCCESSFUL\n")
+
+except ImportError as e:
+    sys.stderr.write(f"❌ MODEL TRACING IMPORTS FAILED: {e}\n")
+    import traceback
+    sys.stderr.write(f"Full import traceback:\n{traceback.format_exc()}\n")
+    MODEL_TRACING_AVAILABLE = False
+
+sys.stderr.write(f"🎯 Final MODEL_TRACING_AVAILABLE = {MODEL_TRACING_AVAILABLE}\n")
+sys.stderr.flush()
+
+
+def run_model_trace_analysis(ft_model_name, revision="main", precision="float16"):
+    """
+    Run model tracing analysis comparing ft_model against gpt2 base.
+
+    Args:
+        ft_model_name: HuggingFace model identifier for the fine-tuned model
+        revision: Model revision/commit hash
+        precision: Model precision (float16, bfloat16)
+
+    Returns:
+        tuple: (success: bool, result: float or error_message)
+               If success, result is the aggregate p-value
+               If failure, result is error message
+    """
+
+    if not MODEL_TRACING_AVAILABLE:
+        return False, "Model tracing dependencies not available"
+
+    try:
+        sys.stderr.write(f"\n=== RUNNING MODEL TRACE ANALYSIS ===\n")
+        sys.stderr.write(f"Base model: openai-community/gpt2\n")
+        sys.stderr.write(f"Fine-tuned model: {ft_model_name}\n")
+        sys.stderr.write(f"Revision: {revision}\n")
+        sys.stderr.write(f"Precision: {precision}\n")
+        sys.stderr.flush()
+
+        # Set random seed for reproducibility
+        manual_seed(0)
+
+        # Determine dtype
+        if precision == "bfloat16":
+            dtype = torch.bfloat16
+        else:
+            dtype = torch.float16
+
+        # Load base model (gpt2)
+        base_model_id = "openai-community/gpt2"
+        sys.stderr.write(f"🤖 Loading base model: {base_model_id}\n")
+        sys.stderr.write(f" - dtype: {dtype}\n")
+        sys.stderr.write(f" - low_cpu_mem_usage: True\n")
+        sys.stderr.flush()
+
+        try:
+            base_model = AutoModelForCausalLM.from_pretrained(
+                base_model_id,
+                torch_dtype=dtype,
+                low_cpu_mem_usage=True
+            )
+            sys.stderr.write("✅ Base model loaded successfully\n")
+        except Exception as e:
+            sys.stderr.write(f"❌ Failed to load base model: {e}\n")
+            raise
+
+        try:
+            base_tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False)
+            sys.stderr.write("✅ Base tokenizer loaded successfully\n")
+        except Exception as e:
+            sys.stderr.write(f"❌ Failed to load base tokenizer: {e}\n")
+            raise
+
+        # Load fine-tuned model
+        sys.stderr.write(f"🤖 Loading fine-tuned model: {ft_model_name}\n")
+        sys.stderr.write(f" - revision: {revision}\n")
+        sys.stderr.write(f" - dtype: {dtype}\n")
+        sys.stderr.write(f" - low_cpu_mem_usage: True\n")
+        sys.stderr.flush()
+
+        try:
+            ft_model = AutoModelForCausalLM.from_pretrained(
+                ft_model_name,
+                revision=revision,
+                torch_dtype=dtype,
+                low_cpu_mem_usage=True
+            )
+            sys.stderr.write("✅ Fine-tuned model loaded successfully\n")
+        except Exception as e:
+            sys.stderr.write(f"❌ Failed to load fine-tuned model: {e}\n")
+            raise
+
+        try:
+            ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_name, revision=revision, use_fast=False)
+            sys.stderr.write("✅ Fine-tuned tokenizer loaded successfully\n")
+        except Exception as e:
+            sys.stderr.write(f"❌ Failed to load fine-tuned tokenizer: {e}\n")
+            raise
+
+        sys.stderr.write("🎯 ALL MODELS AND TOKENIZERS LOADED SUCCESSFULLY\n")
+
+        # Show memory info if available
+        if torch.cuda.is_available():
+            memory_allocated = torch.cuda.memory_allocated() / 1024**3  # GB
+            memory_reserved = torch.cuda.memory_reserved() / 1024**3  # GB
+            sys.stderr.write(f"💾 GPU Memory - Allocated: {memory_allocated:.2f}GB, Reserved: {memory_reserved:.2f}GB\n")
+
+        sys.stderr.flush()
+
+        # Prepare dataset (using wikitext like in the original)
+        sys.stderr.write("Preparing dataset...\n")
+        sys.stderr.flush()
+
+        block_size = 512
+        batch_size = 1
+        dataset = prepare_hf_dataset("dlwh/wikitext_103_detokenized", block_size, base_tokenizer)
+        dataloader = prepare_hf_dataloader(dataset, batch_size)
+
+        sys.stderr.write("Dataset prepared\n")
+        sys.stderr.flush()
+
+        # Run alignment (--align flag)
+        sys.stderr.write("Running model alignment...\n")
+        sys.stderr.flush()
+
+        try:
+            align_model(base_model, ft_model, ft_model)
+            sys.stderr.write("Model alignment completed\n")
+        except Exception as e:
+            sys.stderr.write(f"Model alignment failed: {e}\n")
+            sys.stderr.write("Continuing without alignment...\n")
+            sys.stderr.flush()
+
+        # Run match statistic
+        sys.stderr.write("Computing match statistic...\n")
+        sys.stderr.flush()
+
+        # Get number of layers for the models
+        if hasattr(base_model, 'transformer') and hasattr(base_model.transformer, 'h'):
+            # GPT-2 style
+            n_blocks = len(base_model.transformer.h)
+        elif hasattr(base_model, 'model') and hasattr(base_model.model, 'layers'):
+            # LLaMA style
+            n_blocks = len(base_model.model.layers)
+        else:
+            # Default fallback
+            n_blocks = 12  # GPT-2 base has 12 layers
+
+        # Check if fine-tuned model has compatible architecture
+        ft_n_blocks = n_blocks
+        if hasattr(ft_model, 'transformer') and hasattr(ft_model.transformer, 'h'):
+            ft_n_blocks = len(ft_model.transformer.h)
+        elif hasattr(ft_model, 'model') and hasattr(ft_model.model, 'layers'):
+            ft_n_blocks = len(ft_model.model.layers)
+
+        # Use minimum number of blocks to avoid index errors
+        n_blocks = min(n_blocks, ft_n_blocks)
+
+        sys.stderr.write(f"Using {n_blocks} blocks for analysis\n")
+        sys.stderr.flush()
+
+        # Run the match statistic - returns list of p-values per layer
+        try:
+            p_values = match_stat(base_model, ft_model, dataloader, n_blocks=n_blocks)
+        except Exception as e:
+            sys.stderr.write(f"Match statistic computation failed: {e}\n")
+            sys.stderr.flush()
+            # Return a default high p-value indicating no similarity
+            return True, 1.0
+
+        sys.stderr.write(f"Match statistic computed: {len(p_values)} p-values\n")
+        sys.stderr.flush()
+
+        # Filter out None/NaN values
+        valid_p_values = [p for p in p_values if p is not None and not (isinstance(p, float) and (p != p or p < 0 or p > 1))]
+
+        if not valid_p_values:
+            sys.stderr.write("No valid p-values found, returning default\n")
+            sys.stderr.flush()
+            return True, 1.0
+
+        # Calculate aggregate p-value using Fisher's method
+        from tracing.utils.utils import fisher
+        try:
+            aggregate_p_value = fisher(valid_p_values)
+        except Exception as e:
+            sys.stderr.write(f"Fisher's method failed: {e}\n")
+            sys.stderr.flush()
+            # Use the mean of valid p-values as fallback
+            aggregate_p_value = sum(valid_p_values) / len(valid_p_values)
+
+        sys.stderr.write(f"Aggregate p-value: {aggregate_p_value}\n")
+        sys.stderr.write("=== MODEL TRACE ANALYSIS COMPLETED ===\n")
+        sys.stderr.flush()
+
+        # Clean up memory
+        del base_model
+        del ft_model
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+        return True, aggregate_p_value
+
+    except Exception as e:
+        error_msg = str(e)
+        sys.stderr.write(f"Error in model trace analysis: {error_msg}\n")
+        import traceback
+        sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
+        sys.stderr.flush()
+
+        # Clean up memory even on error
+        try:
+            torch.cuda.empty_cache() if torch.cuda.is_available() else None
+        except:
+            pass
+
+        return False, error_msg
+
+
+def compute_model_trace_p_value(model_name, revision="main", precision="float16"):
+    """
+    Wrapper function to compute model trace p-value for a single model.
+
+    Args:
+        model_name: HuggingFace model identifier
+        revision: Model revision
+        precision: Model precision
+
+    Returns:
+        float or None: P-value if successful, None if failed
+    """
+    sys.stderr.write(f"\n{'='*60}\n")
+    sys.stderr.write(f"COMPUTE_MODEL_TRACE_P_VALUE CALLED\n")
+    sys.stderr.write(f"Model: {model_name}\n")
+    sys.stderr.write(f"Revision: {revision}\n")
+    sys.stderr.write(f"Precision: {precision}\n")
+    sys.stderr.write(f"Model tracing available: {MODEL_TRACING_AVAILABLE}\n")
+    sys.stderr.write(f"{'='*60}\n")
+    sys.stderr.flush()
+
+    if not MODEL_TRACING_AVAILABLE:
+        sys.stderr.write("❌ MODEL TRACING NOT AVAILABLE - returning None\n")
+        sys.stderr.flush()
+        return None
+
+    try:
+        sys.stderr.write("🚀 Starting model trace analysis...\n")
+        sys.stderr.flush()
+
+        success, result = run_model_trace_analysis(model_name, revision, precision)
+
+        sys.stderr.write(f"📊 Analysis completed - Success: {success}, Result: {result}\n")
+        sys.stderr.flush()
+
+        if success:
+            sys.stderr.write(f"✅ SUCCESS: Returning p-value {result}\n")
+            sys.stderr.flush()
+            return result
+        else:
+            sys.stderr.write(f"❌ FAILED: {result}\n")
+            sys.stderr.write("🔄 Returning None as fallback\n")
+            sys.stderr.flush()
+            return None
+
+    except Exception as e:
+        sys.stderr.write(f"💥 CRITICAL ERROR in compute_model_trace_p_value for {model_name}:\n")
+        sys.stderr.write(f"Exception: {e}\n")
+        import traceback
+        sys.stderr.write(f"Full traceback:\n{traceback.format_exc()}\n")
+        sys.stderr.write("🔄 Returning None as fallback\n")
+        sys.stderr.flush()
+        return None
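The aggregate score above comes from Fisher's method via `tracing.utils.utils.fisher` in the submodule. For reference, here is a minimal sketch of the same combination rule using `scipy.stats.combine_pvalues` (SciPy is already pinned in `requirements.txt`); it is an independent illustration with made-up per-layer p-values, not the submodule's implementation.

```python
# Fisher's method sketch: combine per-layer p-values into one aggregate p-value.
# Illustrative only; the leaderboard uses tracing.utils.utils.fisher from the model-tracing submodule.
from scipy.stats import combine_pvalues


def aggregate_p_value(layer_p_values):
    """Combine independent per-layer p-values with Fisher's method.

    The test statistic is -2 * sum(log(p_i)), chi-squared distributed with
    2k degrees of freedom under the null hypothesis (k = number of layers).
    """
    result = combine_pvalues(layer_p_values, method="fisher")  # SciPy >= 1.9
    return result.pvalue


if __name__ == "__main__":
    # Twelve hypothetical per-layer p-values (one per GPT-2 block).
    layer_ps = [0.01, 0.03, 0.2, 0.05, 0.4, 0.02, 0.08, 0.6, 0.01, 0.15, 0.07, 0.3]
    print(f"Aggregate p-value: {aggregate_p_value(layer_ps):.6f}")
```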
src/leaderboard/read_evals.py CHANGED
@@ -7,6 +7,7 @@ from dataclasses import dataclass
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
+from src.evaluation.model_trace_eval import compute_model_trace_p_value

 @dataclass
 class EvalResult:
@@ -131,6 +132,34 @@ class EvalResult:
         data_dict[AutoEvalColumn.params.name] = 0
         data_dict[AutoEvalColumn.likes.name] = 0

+        # Compute model trace p-value
+        sys.stderr.write(f"\n🧬 COMPUTING MODEL TRACE P-VALUE FOR: {self.full_model}\n")
+        sys.stderr.write(f" - Revision: {self.revision if self.revision else 'main'}\n")
+        sys.stderr.write(f" - Precision: {self.precision.value.name.lower()}\n")
+        sys.stderr.flush()
+
+        try:
+            model_trace_p_value = compute_model_trace_p_value(
+                self.full_model,
+                self.revision if self.revision else "main",
+                self.precision.value.name.lower()
+            )
+
+            if model_trace_p_value is not None:
+                sys.stderr.write(f"✅ Model trace p-value computed successfully: {model_trace_p_value}\n")
+            else:
+                sys.stderr.write(f"⚠️ Model trace p-value is None (computation failed or not available)\n")
+
+        except Exception as e:
+            sys.stderr.write(f"💥 Exception during model trace p-value computation: {e}\n")
+            import traceback
+            sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
+            model_trace_p_value = None
+
+        data_dict[AutoEvalColumn.model_trace_p_value.name] = model_trace_p_value
+        sys.stderr.write(f"📝 Added to data_dict: {AutoEvalColumn.model_trace_p_value.name} = {model_trace_p_value}\n")
+        sys.stderr.flush()
+
         sys.stderr.write(f"Created base data_dict with {len(data_dict)} columns\n")
         sys.stderr.flush()

test_model_trace.py ADDED
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""
+Test script for model tracing integration.
+Tests the p-value computation for a simple model comparison.
+"""
+
+import sys
+import os
+
+# Add src to path
+sys.path.append('src')
+
+from evaluation.model_trace_eval import compute_model_trace_p_value
+
+def test_model_trace():
+    """Test the model trace p-value computation with a simple example."""
+
+    print("Testing model trace p-value computation...")
+
+    # Test with a simple model (should be fast)
+    test_model = "openai-community/gpt2"
+
+    print(f"Computing p-value for {test_model} vs GPT-2...")
+
+    try:
+        p_value = compute_model_trace_p_value(test_model, "main", "float16")
+
+        if p_value is not None:
+            print(f"✅ Success! P-value: {p_value}")
+            if 0 <= p_value <= 1:
+                print("✅ P-value is in valid range [0, 1]")
+            else:
+                print(f"⚠️ Warning: P-value {p_value} is outside expected range [0, 1]")
+        else:
+            print("❌ Failed: P-value is None")
+
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    test_model_trace()