Ahmed Ahmed committed
Commit 4864926 · 1 Parent(s): de071e9
app.py CHANGED
@@ -89,9 +89,13 @@ def run_perplexity_test(model_name, revision, precision):
      import sys
      import traceback
      import gradio as gr
+     from src.evaluation.initialize_models import is_model_allowed

      if not model_name:
-         return "Please enter a model name.", gr.update(), gr.update()
+         return "Please select a model.", gr.update(), gr.update()
+
+     if not is_model_allowed(model_name):
+         return f"❌ Model '{model_name}' is not in the allowed list. Please select from the dropdown.", gr.update(), gr.update()

      try:
          # Use stderr for more reliable logging in HF Spaces
@@ -125,7 +129,7 @@ def run_perplexity_test(model_name, revision, precision):

      πŸŽ‰ **Results have been saved and both tables have been updated!**

-     Note: Model trace p-value computation may take additional time and will appear in the logs."""
+     ⏰ **Note**: Model trace p-value computation runs a full model comparison analysis and may take 10-30 minutes per model. Progress will appear in the logs."""

              return success_msg, gr.update(value=updated_df), gr.update(value=updated_df)
          else:
@@ -167,9 +171,17 @@ except Exception as e:
      # Ensure local directory exists even if repo operations fail
      os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)

- # Get initial results data
+ # Initialize allowed models
  import sys
+ from src.evaluation.initialize_models import initialize_allowed_models, get_allowed_models
+
  sys.stderr.write("\nπŸš€ STARTING GRADIO APP INITIALIZATION\n")
+ sys.stderr.write("πŸ“Š Initializing allowed models...\n")
+ sys.stderr.flush()
+
+ # Initialize the allowed models
+ initialize_allowed_models()
+
  sys.stderr.write("πŸ“Š Creating initial results DataFrame...\n")
  sys.stderr.flush()

@@ -202,11 +214,17 @@ with demo:
              gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

          with gr.TabItem("πŸ§ͺ Test Model", elem_id="test-model-tab", id=2):
-             gr.Markdown("## Run Perplexity Test\n\nTest any Hugging Face model for perplexity evaluation.")
+             gr.Markdown("## Run Perplexity Test\n\nTest one of the supported models for perplexity evaluation.")
+
+             allowed_models = get_allowed_models()

              with gr.Row():
                  with gr.Column():
-                     model_name = gr.Textbox(label="Model name", placeholder="openai-community/gpt2")
+                     model_name = gr.Dropdown(
+                         choices=allowed_models,
+                         label="Model name",
+                         value=allowed_models[0] if allowed_models else None
+                     )
                      revision = gr.Textbox(label="Revision", placeholder="main", value="main")
                      precision = gr.Dropdown(
                          choices=["float16", "bfloat16"],
@@ -231,13 +249,14 @@ with demo:
      ### Tips:
      - **Check stderr logs** in HF Spaces for detailed debugging information
      - **Results will update automatically** in the table above after evaluation completes
-     - **Example models to test**: `openai-community/gpt2`, `EleutherAI/gpt-neo-1.3B`, `openai-community/gpt2-large`
+     - **Available models**: Vicuna 7B v1.5, IBM Granite 7B Base, LLeMa 7B
      - **Lower perplexity scores = better performance** (better at predicting text)
+     - **Model trace p-values are computed automatically** (may take 10-30 minutes)

      ### How it works:
-     1. Enter a model name from Hugging Face Hub
+     1. Select a model from the dropdown
      2. Click "Run Perplexity Test"
-     3. Wait for evaluation to complete (may take a few minutes for large models)
+     3. Wait for evaluation to complete (may take a few minutes for perplexity + longer for p-value)
      4. Results will appear automatically in the table above!
      """)

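The app.py change above replaces the free-form model textbox with a dropdown backed by an allow-list and adds a server-side `is_model_allowed` check. For context, here is a minimal, self-contained sketch of that pattern; the `ALLOWED_MODELS` list and the `evaluate` stub are illustrative stand-ins, not the Space's actual `run_perplexity_test` handler.

```python
# Minimal sketch of a dropdown restricted to an allow-list (illustrative only).
import gradio as gr

ALLOWED_MODELS = [
    "lmsys/vicuna-7b-v1.5",
    "ibm-granite/granite-7b-base",
    "EleutherAI/llemma_7b",
]

def evaluate(model_name: str) -> str:
    # Re-check server-side: the dropdown constrains the UI, but the handler
    # should still validate its input (the real app calls is_model_allowed).
    if model_name not in ALLOWED_MODELS:
        return f"Model '{model_name}' is not in the allowed list."
    return f"Would run the perplexity test for {model_name} here."

with gr.Blocks() as demo:
    model = gr.Dropdown(choices=ALLOWED_MODELS, value=ALLOWED_MODELS[0], label="Model name")
    output = gr.Textbox(label="Status")
    gr.Button("Run Perplexity Test").click(fn=evaluate, inputs=model, outputs=output)

if __name__ == "__main__":
    demo.launch()
```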
src/about.py CHANGED
@@ -17,37 +17,48 @@ NUM_FEWSHOT = 0 # Not used for perplexity
  # ---------------------------------------------------

  # Your leaderboard name
- TITLE = """<h1 align="center" id="space-title">Model Perplexity Leaderboard</h1>"""
+ TITLE = """<h1 align="center" id="space-title">Model Tracing Leaderboard</h1>"""

  # What does your leaderboard evaluate?
  INTRODUCTION_TEXT = """
- This leaderboard evaluates language models based on their perplexity scores on a fixed test passage and
- structural similarity to GPT-2 using model tracing analysis.
+ This leaderboard evaluates specific language models based on their perplexity scores and
+ structural similarity to Llama-2-7B using model tracing analysis.

+ **Models Evaluated:**
+ - `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
+ - `ibm-granite/granite-7b-base` - IBM Granite 7B Base
+ - `EleutherAI/llemma_7b` - LLeMa 7B
+
+ **Metrics:**
  - **Perplexity**: Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
- - **Match P-Value**: Lower p-values indicate the model preserves structural similarity to GPT-2 after fine-tuning (neuron organization is maintained).
+ - **Match P-Value**: Lower p-values indicate the model preserves structural similarity to Llama-2-7B after fine-tuning (neuron organization is maintained).
  """

  # Which evaluations are you running?
  LLM_BENCHMARKS_TEXT = """
  ## How it works

- The evaluation runs two types of analysis on language models:
+ The evaluation runs two types of analysis on the supported language models:
+
+ ### Supported Models
+ - **Vicuna 7B v1.5** (`lmsys/vicuna-7b-v1.5`) - Chat-optimized LLaMA variant
+ - **IBM Granite 7B** (`ibm-granite/granite-7b-base`) - IBM's foundational language model
+ - **LLeMa 7B** (`EleutherAI/llemma_7b`) - EleutherAI's mathematical language model

  ### 1. Perplexity Evaluation
  Perplexity tests using a fixed test passage about artificial intelligence.
  Perplexity measures how well a model predicts text - lower scores mean better predictions.

  ### 2. Model Tracing Analysis
- Compares each model's internal structure to GPT-2 using the "match" statistic with alignment:
- - **Base Model**: GPT-2 (`openai-community/gpt2`)
- - **Comparison**: Each model on the leaderboard
+ Compares each model's internal structure to Llama-2-7B using the "match" statistic:
+ - **Base Model**: Llama-2-7B (`meta-llama/Llama-2-7b-hf`)
+ - **Comparison Models**: The 3 supported models listed above
  - **Method**: Neuron matching analysis across transformer layers
  - **Alignment**: Models are aligned before comparison using the Hungarian algorithm
- - **Output**: P-value indicating structural similarity (lower = more similar to GPT-2)
+ - **Output**: P-value indicating structural similarity (lower = more similar to Llama-2-7B)

  The match statistic tests whether neurons in corresponding layers maintain similar functional roles
- between the base model and fine-tuned variants.
+ between the base model and the comparison models.

  ## Test Text

@@ -62,11 +73,15 @@ with these important social considerations.
  """

  EVALUATION_QUEUE_TEXT = """
- ## Before submitting a model
+ ## Testing Models
+
+ This leaderboard focuses on comparing specific models:
+
+ 1. **Vicuna 7B v1.5** - Chat-optimized variant of LLaMA
+ 2. **IBM Granite 7B Base** - IBM's foundational language model
+ 3. **LLeMa 7B** - EleutherAI's mathematical language model

- 1. Make sure your model is public on the Hugging Face Hub
- 2. The model should be loadable with AutoModelForCausalLM
- 3. The model should support text generation tasks
+ Use the "Test Model" tab to run perplexity evaluation on any of these models.
  """

  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
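The about.py text above describes the perplexity half of the evaluation: score each model on a fixed passage, where lower is better. A generic sketch of how such a score can be computed with `transformers` follows; it uses `openai-community/gpt2` only because it is small enough to run quickly, and it is not the Space's exact evaluator.

```python
# Generic perplexity-on-a-fixed-passage sketch (not the Space's exact evaluator).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def perplexity(model_name: str, text: str) -> float:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.eval()
    enc = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        # For causal LMs, passing labels=input_ids returns the mean
        # next-token cross-entropy; perplexity is its exponential.
        loss = model(**enc, labels=enc["input_ids"]).loss
    return float(torch.exp(loss))

if __name__ == "__main__":
    passage = "Artificial intelligence systems are increasingly used in daily life."
    print(perplexity("openai-community/gpt2", passage))
```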
src/evaluation/initialize_models.py ADDED
@@ -0,0 +1,121 @@
+ """
+ Initialize the leaderboard with specific models and compute their p-values.
+
+ This module ensures only the specified models are included in the leaderboard
+ and their model trace p-values are computed.
+ """
+
+ import os
+ import json
+ import sys
+ from src.evaluation.model_trace_eval import compute_model_trace_p_value
+ from src.envs import EVAL_RESULTS_PATH
+
+ # The specific models we want to include
+ ALLOWED_MODELS = [
+     "lmsys/vicuna-7b-v1.5",
+     "ibm-granite/granite-7b-base",
+     "EleutherAI/llemma_7b"
+ ]
+
+ def create_model_result_file(model_name, precision="float16"):
+     """
+     Create a result file for a model with computed p-value.
+
+     Args:
+         model_name: HuggingFace model identifier
+         precision: Model precision
+     """
+     sys.stderr.write(f"\nπŸ”§ CREATING RESULT FILE FOR: {model_name}\n")
+     sys.stderr.flush()
+
+     # Create the results directory if it doesn't exist
+     os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
+
+     # Generate a safe filename
+     safe_name = model_name.replace("/", "_").replace("-", "_")
+     result_file = os.path.join(EVAL_RESULTS_PATH, f"{safe_name}_{precision}.json")
+
+     sys.stderr.write(f"πŸ“ Result file path: {result_file}\n")
+     sys.stderr.flush()
+
+     # Check if file already exists
+     if os.path.exists(result_file):
+         sys.stderr.write(f"βœ… Result file already exists: {result_file}\n")
+         sys.stderr.flush()
+         return result_file
+
+     # Create basic result structure
+     result_data = {
+         "config": {
+             "model_dtype": f"torch.{precision}",
+             "model_name": model_name,
+             "model_sha": "main"
+         },
+         "results": {
+             "perplexity": {
+                 "perplexity": None  # Will be populated when user tests
+             }
+         }
+     }
+
+     # Save the result file
+     try:
+         with open(result_file, 'w') as f:
+             json.dump(result_data, f, indent=2)
+
+         sys.stderr.write(f"βœ… Created result file: {result_file}\n")
+         sys.stderr.flush()
+         return result_file
+
+     except Exception as e:
+         sys.stderr.write(f"❌ Failed to create result file: {e}\n")
+         sys.stderr.flush()
+         return None
+
+ def initialize_allowed_models():
+     """
+     Initialize result files for all allowed models.
+     """
+     sys.stderr.write(f"\nπŸš€ INITIALIZING ALLOWED MODELS\n")
+     sys.stderr.write(f"πŸ“‹ Models to initialize: {ALLOWED_MODELS}\n")
+     sys.stderr.flush()
+
+     created_files = []
+
+     for model_name in ALLOWED_MODELS:
+         try:
+             result_file = create_model_result_file(model_name)
+             if result_file:
+                 created_files.append(result_file)
+
+         except Exception as e:
+             sys.stderr.write(f"❌ Failed to initialize {model_name}: {e}\n")
+             sys.stderr.flush()
+             continue
+
+     sys.stderr.write(f"βœ… Initialized {len(created_files)} model result files\n")
+     sys.stderr.flush()
+
+     return created_files
+
+ def is_model_allowed(model_name):
+     """
+     Check if a model is in the allowed list.
+
+     Args:
+         model_name: HuggingFace model identifier
+
+     Returns:
+         bool: True if model is allowed
+     """
+     return model_name in ALLOWED_MODELS
+
+ def get_allowed_models():
+     """
+     Get the list of allowed models.
+
+     Returns:
+         list: List of allowed model names
+     """
+     return ALLOWED_MODELS.copy()
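For reference, a short usage sketch of the helpers this new module exposes, mirroring how app.py calls them at startup; the printed values are what the code above implies, not captured output, and `initialize_allowed_models()` writes placeholder JSON files under `EVAL_RESULTS_PATH`.

```python
# Example usage of the helpers defined in src/evaluation/initialize_models.py.
from src.evaluation.initialize_models import (
    get_allowed_models,
    initialize_allowed_models,
    is_model_allowed,
)

initialize_allowed_models()                       # create placeholder result files
print(get_allowed_models())                       # the three supported models
print(is_model_allowed("lmsys/vicuna-7b-v1.5"))   # True
print(is_model_allowed("openai-community/gpt2"))  # False
```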
src/evaluation/model_trace_eval.py CHANGED
@@ -1,8 +1,8 @@
  """
  Model tracing evaluation for computing p-values from neuron matching statistics.

- This module runs the model-tracing comparison between a base model (gpt2) and
- fine-tuned models to determine structural similarity via p-value analysis.
+ This module runs the model-tracing comparison using the main.py script from model-tracing
+ to determine structural similarity via p-value analysis.
  """

  import os
@@ -10,49 +10,26 @@ import sys
  import subprocess
  import tempfile
  import pickle
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
+ import statistics

- # Add model-tracing to path
+ # Check if model-tracing directory exists
  model_tracing_path = os.path.join(os.path.dirname(__file__), '../../model-tracing')
- if model_tracing_path not in sys.path:
-     sys.path.append(model_tracing_path)
+ MODEL_TRACING_AVAILABLE = os.path.exists(model_tracing_path) and os.path.exists(os.path.join(model_tracing_path, 'main.py'))

- sys.stderr.write("πŸ”§ ATTEMPTING TO IMPORT MODEL TRACING DEPENDENCIES...\n")
- sys.stderr.flush()
-
- try:
-     sys.stderr.write("  - Importing tracing.utils.llama.model...\n")
-     from tracing.utils.llama.model import permute_model, rotate_model
-
-     sys.stderr.write("  - Importing tracing.utils.llama.matching...\n")
-     from tracing.utils.llama.matching import align_model
-
-     sys.stderr.write("  - Importing tracing.utils.evaluate...\n")
-     from tracing.utils.evaluate import prepare_hf_dataset, prepare_hf_dataloader
-
-     sys.stderr.write("  - Importing tracing.utils.utils...\n")
-     from tracing.utils.utils import manual_seed
-
-     sys.stderr.write("  - Importing tracing.statistics.match...\n")
-     from tracing.statistics.match import statistic as match_stat
-
-     MODEL_TRACING_AVAILABLE = True
-     sys.stderr.write("βœ… ALL MODEL TRACING IMPORTS SUCCESSFUL\n")
-
- except ImportError as e:
-     sys.stderr.write(f"❌ MODEL TRACING IMPORTS FAILED: {e}\n")
-     import traceback
-     sys.stderr.write(f"Full import traceback:\n{traceback.format_exc()}\n")
-     MODEL_TRACING_AVAILABLE = False
-
+ sys.stderr.write("πŸ”§ CHECKING MODEL TRACING AVAILABILITY...\n")
+ sys.stderr.write(f"  - Model tracing path: {model_tracing_path}\n")
+ sys.stderr.write(f"  - Path exists: {os.path.exists(model_tracing_path)}\n")
+ sys.stderr.write(f"  - main.py exists: {os.path.exists(os.path.join(model_tracing_path, 'main.py'))}\n")
  sys.stderr.write(f"🎯 Final MODEL_TRACING_AVAILABLE = {MODEL_TRACING_AVAILABLE}\n")
  sys.stderr.flush()


  def run_model_trace_analysis(ft_model_name, revision="main", precision="float16"):
      """
-     Run model tracing analysis comparing ft_model against gpt2 base.
+     Run model tracing analysis using the main.py script from model-tracing directory.
+
+     Runs the exact command:
+     python main.py --base_model_id meta-llama/Llama-2-7b-hf --ft_model_id <ft_model_name> --stat match --align

      Args:
          ft_model_name: HuggingFace model identifier for the fine-tuned model
@@ -61,197 +38,175 @@ def run_model_trace_analysis(ft_model_name, revision="main", precision="float16"

      Returns:
          tuple: (success: bool, result: float or error_message)
-             If success, result is the aggregate p-value
+             If success, result is the aggregate p-value from aligned test stat
              If failure, result is error message
      """

      if not MODEL_TRACING_AVAILABLE:
-         return False, "Model tracing dependencies not available"
+         return False, "Model tracing main.py script not available"

      try:
-         sys.stderr.write(f"\n=== RUNNING MODEL TRACE ANALYSIS ===\n")
-         sys.stderr.write(f"Base model: openai-community/gpt2\n")
+         sys.stderr.write(f"\n=== RUNNING MODEL TRACE ANALYSIS VIA SUBPROCESS ===\n")
+         sys.stderr.write(f"Base model: meta-llama/Llama-2-7b-hf\n")
          sys.stderr.write(f"Fine-tuned model: {ft_model_name}\n")
          sys.stderr.write(f"Revision: {revision}\n")
          sys.stderr.write(f"Precision: {precision}\n")
          sys.stderr.flush()

-         # Set random seed for reproducibility
-         manual_seed(0)
-
-         # Determine dtype
-         if precision == "bfloat16":
-             dtype = torch.bfloat16
-         else:
-             dtype = torch.float16
-
-         # Load base model (gpt2)
-         base_model_id = "openai-community/gpt2"
-         sys.stderr.write(f"πŸ€– Loading base model: {base_model_id}\n")
-         sys.stderr.write(f"  - dtype: {dtype}\n")
-         sys.stderr.write(f"  - low_cpu_mem_usage: True\n")
-         sys.stderr.flush()
-
-         try:
-             base_model = AutoModelForCausalLM.from_pretrained(
-                 base_model_id,
-                 torch_dtype=dtype,
-                 low_cpu_mem_usage=True
-             )
-             sys.stderr.write("βœ… Base model loaded successfully\n")
-         except Exception as e:
-             sys.stderr.write(f"❌ Failed to load base model: {e}\n")
-             raise
-
-         try:
-             base_tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False)
-             sys.stderr.write("βœ… Base tokenizer loaded successfully\n")
-         except Exception as e:
-             sys.stderr.write(f"❌ Failed to load base tokenizer: {e}\n")
-             raise
-
-         # Load fine-tuned model
-         sys.stderr.write(f"πŸ€– Loading fine-tuned model: {ft_model_name}\n")
-         sys.stderr.write(f"  - revision: {revision}\n")
-         sys.stderr.write(f"  - dtype: {dtype}\n")
-         sys.stderr.write(f"  - low_cpu_mem_usage: True\n")
-         sys.stderr.flush()
-
-         try:
-             ft_model = AutoModelForCausalLM.from_pretrained(
-                 ft_model_name,
-                 revision=revision,
-                 torch_dtype=dtype,
-                 low_cpu_mem_usage=True
-             )
-             sys.stderr.write("βœ… Fine-tuned model loaded successfully\n")
-         except Exception as e:
-             sys.stderr.write(f"❌ Failed to load fine-tuned model: {e}\n")
-             raise
-
-         try:
-             ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_name, revision=revision, use_fast=False)
-             sys.stderr.write("βœ… Fine-tuned tokenizer loaded successfully\n")
-         except Exception as e:
-             sys.stderr.write(f"❌ Failed to load fine-tuned tokenizer: {e}\n")
-             raise
-
-         sys.stderr.write("🎯 ALL MODELS AND TOKENIZERS LOADED SUCCESSFULLY\n")
-
-         # Show memory info if available
-         if torch.cuda.is_available():
-             memory_allocated = torch.cuda.memory_allocated() / 1024**3  # GB
-             memory_reserved = torch.cuda.memory_reserved() / 1024**3  # GB
-             sys.stderr.write(f"πŸ’Ύ GPU Memory - Allocated: {memory_allocated:.2f}GB, Reserved: {memory_reserved:.2f}GB\n")
+         # Create a temporary file for results
+         with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as tmp_file:
+             tmp_results_path = tmp_file.name

+         sys.stderr.write(f"πŸ“ Temporary results file: {tmp_results_path}\n")
          sys.stderr.flush()

-         # Prepare dataset (using wikitext like in the original)
-         sys.stderr.write("Preparing dataset...\n")
-         sys.stderr.flush()
+         # Build the command exactly as user specified
+         base_model_id = "meta-llama/Llama-2-7b-hf"

-         block_size = 512
-         batch_size = 1
-         dataset = prepare_hf_dataset("dlwh/wikitext_103_detokenized", block_size, base_tokenizer)
-         dataloader = prepare_hf_dataloader(dataset, batch_size)
+         # Build the command
+         cmd = [
+             "python", "main.py",
+             "--base_model_id", base_model_id,
+             "--ft_model_id", ft_model_name,
+             "--stat", "match",
+             "--save", tmp_results_path
+         ]

-         sys.stderr.write("Dataset prepared\n")
-         sys.stderr.flush()
+         # Add revision if not main/default
+         if revision and revision != "main":
+             # Note: main.py doesn't seem to have a revision flag, but we log it for reference
+             sys.stderr.write(f"⚠️ Note: Revision '{revision}' specified but main.py doesn't support --revision flag\n")
+             sys.stderr.flush()

-         # Run alignment (--align flag)
-         sys.stderr.write("Running model alignment...\n")
+         sys.stderr.write(f"πŸš€ Running command: {' '.join(cmd)}\n")
          sys.stderr.flush()

+         # Change to model-tracing directory and run the command
+         original_cwd = os.getcwd()
          try:
-             align_model(base_model, ft_model, ft_model)
-             sys.stderr.write("Model alignment completed\n")
-         except Exception as e:
-             sys.stderr.write(f"Model alignment failed: {e}\n")
-             sys.stderr.write("Continuing without alignment...\n")
-             sys.stderr.flush()
-
-         # Run match statistic
-         sys.stderr.write("Computing match statistic...\n")
-         sys.stderr.flush()
-
-         # Get number of layers for the models
-         if hasattr(base_model, 'transformer') and hasattr(base_model.transformer, 'h'):
-             # GPT-2 style
-             n_blocks = len(base_model.transformer.h)
-         elif hasattr(base_model, 'model') and hasattr(base_model.model, 'layers'):
-             # LLaMA style
-             n_blocks = len(base_model.model.layers)
-         else:
-             # Default fallback
-             n_blocks = 12  # GPT-2 base has 12 layers
+             os.chdir(model_tracing_path)
+             sys.stderr.write(f"πŸ“‚ Changed to directory: {model_tracing_path}\n")
+             sys.stderr.flush()

-         # Check if fine-tuned model has compatible architecture
-         ft_n_blocks = n_blocks
-         if hasattr(ft_model, 'transformer') and hasattr(ft_model.transformer, 'h'):
-             ft_n_blocks = len(ft_model.transformer.h)
-         elif hasattr(ft_model, 'model') and hasattr(ft_model.model, 'layers'):
-             ft_n_blocks = len(ft_model.model.layers)
+             # Run the subprocess
+             result = subprocess.run(
+                 cmd,
+                 capture_output=True,
+                 text=True,
+                 timeout=3600  # 1 hour timeout
+             )

-         # Use minimum number of blocks to avoid index errors
-         n_blocks = min(n_blocks, ft_n_blocks)
+             sys.stderr.write(f"πŸ“Š Subprocess completed with return code: {result.returncode}\n")

-         sys.stderr.write(f"Using {n_blocks} blocks for analysis\n")
-         sys.stderr.flush()
-
-         # Run the match statistic - returns list of p-values per layer
-         try:
-             p_values = match_stat(base_model, ft_model, dataloader, n_blocks=n_blocks)
-         except Exception as e:
-             sys.stderr.write(f"Match statistic computation failed: {e}\n")
+             # Log stdout and stderr from the subprocess
+             if result.stdout:
+                 sys.stderr.write(f"πŸ“ STDOUT from model tracing:\n{result.stdout}\n")
+             if result.stderr:
+                 sys.stderr.write(f"⚠️ STDERR from model tracing:\n{result.stderr}\n")
              sys.stderr.flush()
-             # Return a default high p-value indicating no similarity
-             return True, 1.0
-
-         sys.stderr.write(f"Match statistic computed: {len(p_values)} p-values\n")
-         sys.stderr.flush()
-
-         # Filter out None/NaN values
-         valid_p_values = [p for p in p_values if p is not None and not (isinstance(p, float) and (p != p or p < 0 or p > 1))]
-
-         if not valid_p_values:
-             sys.stderr.write("No valid p-values found, returning default\n")
+
+             if result.returncode != 0:
+                 error_msg = f"Model tracing script failed with return code {result.returncode}"
+                 if result.stderr:
+                     error_msg += f"\nSTDERR: {result.stderr}"
+                 return False, error_msg
+
+         finally:
+             os.chdir(original_cwd)
+             sys.stderr.write(f"πŸ“‚ Changed back to directory: {original_cwd}\n")
              sys.stderr.flush()
-             return True, 1.0

-         # Calculate aggregate p-value using Fisher's method
-         from tracing.utils.utils import fisher
+         # Load and parse the results
          try:
-             aggregate_p_value = fisher(valid_p_values)
+             sys.stderr.write(f"πŸ“– Loading results from: {tmp_results_path}\n")
+             sys.stderr.flush()
+
+             with open(tmp_results_path, 'rb') as f:
+                 results = pickle.load(f)
+
+             sys.stderr.write(f"βœ… Results loaded successfully\n")
+             sys.stderr.write(f"πŸ“‹ Available result keys: {list(results.keys())}\n")
+             sys.stderr.flush()
+
+             # Get the aligned test stat (this is what we want with --align flag)
+             if "aligned test stat" in results:
+                 aligned_stat = results["aligned test stat"]
+                 sys.stderr.write(f"πŸ“Š Aligned test stat: {aligned_stat}\n")
+                 sys.stderr.write(f"πŸ“Š Type: {type(aligned_stat)}\n")
+
+                 # The match statistic returns a list of p-values per layer
+                 if isinstance(aligned_stat, list):
+                     sys.stderr.write(f"πŸ“Š List of {len(aligned_stat)} p-values: {aligned_stat}\n")
+
+                     # Filter valid p-values
+                     valid_p_values = [p for p in aligned_stat if p is not None and isinstance(p, (int, float)) and 0 <= p <= 1]
+                     sys.stderr.write(f"πŸ“Š Valid p-values: {len(valid_p_values)}/{len(aligned_stat)}\n")
+
+                     if valid_p_values:
+                         # Use median as the representative p-value
+                         aggregate_p_value = statistics.median(valid_p_values)
+                         sys.stderr.write(f"πŸ“Š Using median p-value: {aggregate_p_value}\n")
+                     else:
+                         sys.stderr.write("⚠️ No valid p-values found, using default\n")
+                         aggregate_p_value = 1.0
+
+                 elif isinstance(aligned_stat, (int, float)):
+                     aggregate_p_value = float(aligned_stat)
+                     sys.stderr.write(f"πŸ“Š Using single p-value: {aggregate_p_value}\n")
+                 else:
+                     sys.stderr.write(f"⚠️ Unexpected aligned_stat type: {type(aligned_stat)}, using default\n")
+                     aggregate_p_value = 1.0
+
+             else:
+                 sys.stderr.write("⚠️ No 'aligned test stat' found in results, checking non-aligned\n")
+                 if "non-aligned test stat" in results:
+                     non_aligned_stat = results["non-aligned test stat"]
+                     sys.stderr.write(f"πŸ“Š Using non-aligned test stat: {non_aligned_stat}\n")
+
+                     if isinstance(non_aligned_stat, list):
+                         valid_p_values = [p for p in non_aligned_stat if p is not None and isinstance(p, (int, float)) and 0 <= p <= 1]
+                         if valid_p_values:
+                             aggregate_p_value = statistics.median(valid_p_values)
+                         else:
+                             aggregate_p_value = 1.0
+                     else:
+                         aggregate_p_value = float(non_aligned_stat) if isinstance(non_aligned_stat, (int, float)) else 1.0
+                 else:
+                     sys.stderr.write("❌ No test stat found in results\n")
+                     return False, "No test statistic found in results"
+
+             sys.stderr.flush()
+
          except Exception as e:
-             sys.stderr.write(f"Fisher's method failed: {e}\n")
+             sys.stderr.write(f"❌ Failed to load results: {e}\n")
              sys.stderr.flush()
-             # Use the mean of valid p-values as fallback
-             aggregate_p_value = sum(valid_p_values) / len(valid_p_values)
+             return False, f"Failed to load results: {e}"
+
+         finally:
+             # Clean up temporary file
+             try:
+                 os.unlink(tmp_results_path)
+                 sys.stderr.write(f"πŸ—‘οΈ Cleaned up temporary file: {tmp_results_path}\n")
+             except:
+                 pass

-         sys.stderr.write(f"Aggregate p-value: {aggregate_p_value}\n")
+         sys.stderr.write(f"βœ… Final aggregate p-value: {aggregate_p_value}\n")
          sys.stderr.write("=== MODEL TRACE ANALYSIS COMPLETED ===\n")
          sys.stderr.flush()

-         # Clean up memory
-         del base_model
-         del ft_model
-         torch.cuda.empty_cache() if torch.cuda.is_available() else None
-
          return True, aggregate_p_value

+     except subprocess.TimeoutExpired:
+         sys.stderr.write("❌ Model tracing analysis timed out after 1 hour\n")
+         sys.stderr.flush()
+         return False, "Analysis timed out"
+
      except Exception as e:
          error_msg = str(e)
-         sys.stderr.write(f"Error in model trace analysis: {error_msg}\n")
+         sys.stderr.write(f"πŸ’₯ Error in model trace analysis: {error_msg}\n")
          import traceback
          sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
          sys.stderr.flush()
-
-         # Clean up memory even on error
-         try:
-             torch.cuda.empty_cache() if torch.cuda.is_available() else None
-         except:
-             pass
-
          return False, error_msg


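The rewritten module above aggregates the per-layer p-values from the "match" statistic with a median, whereas the deleted in-process version combined them with the repository's `fisher` helper. The sketch below contrasts the two aggregation choices; the Fisher combination shown is the textbook form and may differ in detail from the repository's own helper, and the p-values are made-up example numbers.

```python
# Two ways to aggregate per-layer p-values into a single number (illustrative).
import math
import statistics
from scipy.stats import chi2

def fisher_combined(p_values: list[float]) -> float:
    # Under the null, -2 * sum(log p_i) follows a chi-squared distribution
    # with 2k degrees of freedom; the survival function gives the combined p.
    stat = -2.0 * sum(math.log(p) for p in p_values)  # assumes all p > 0
    return float(chi2.sf(stat, df=2 * len(p_values)))

p_per_layer = [0.02, 0.5, 0.01, 0.8, 0.03]  # made-up example values
print("median:", statistics.median(p_per_layer))
print("fisher:", fisher_combined(p_per_layer))
```

The median is robust to a few extreme layers but is not itself a calibrated combined p-value, while Fisher's method can be driven very small by a single layer; that trade-off is the practical difference between the old and new code paths.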
src/leaderboard/read_evals.py CHANGED
@@ -8,6 +8,7 @@ from src.display.formatting import make_clickable_model
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
  from src.submission.check_validity import is_model_on_hub
  from src.evaluation.model_trace_eval import compute_model_trace_p_value
+ from src.evaluation.initialize_models import is_model_allowed

  @dataclass
  class EvalResult:
@@ -236,6 +237,13 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
          try:
              sys.stderr.write(f"\nConverting result to dict for: {v.full_model}\n")
              sys.stderr.flush()
+
+             # Filter to only allowed models
+             if not is_model_allowed(v.full_model):
+                 sys.stderr.write(f"⏭️ Skipping non-allowed model: {v.full_model}\n")
+                 sys.stderr.flush()
+                 continue
+
              v.to_dict()  # we test if the dict version is complete
              results.append(v)
              sys.stderr.write("Successfully converted and added result\n")
test_model_trace.py DELETED
@@ -1,43 +0,0 @@
- #!/usr/bin/env python3
- """
- Test script for model tracing integration.
- Tests the p-value computation for a simple model comparison.
- """
-
- import sys
- import os
-
- # Add src to path
- sys.path.append('src')
-
- from evaluation.model_trace_eval import compute_model_trace_p_value
-
- def test_model_trace():
-     """Test the model trace p-value computation with a simple example."""
-
-     print("Testing model trace p-value computation...")
-
-     # Test with a simple model (should be fast)
-     test_model = "openai-community/gpt2"
-
-     print(f"Computing p-value for {test_model} vs GPT-2...")
-
-     try:
-         p_value = compute_model_trace_p_value(test_model, "main", "float16")
-
-         if p_value is not None:
-             print(f"βœ… Success! P-value: {p_value}")
-             if 0 <= p_value <= 1:
-                 print("βœ… P-value is in valid range [0, 1]")
-             else:
-                 print(f"⚠️ Warning: P-value {p_value} is outside expected range [0, 1]")
-         else:
-             print("❌ Failed: P-value is None")
-
-     except Exception as e:
-         print(f"❌ Error: {e}")
-         import traceback
-         traceback.print_exc()
-
- if __name__ == "__main__":
-     test_model_trace()