Ahmed Ahmed committed · Commit 4864926 · 1 Parent(s): de071e9

ok

Browse files:
- app.py +27 -8
- src/about.py +29 -14
- src/evaluation/initialize_models.py +121 -0
- src/evaluation/model_trace_eval.py +142 -187
- src/leaderboard/read_evals.py +8 -0
- test_model_trace.py +0 -43
app.py
CHANGED
@@ -89,9 +89,13 @@ def run_perplexity_test(model_name, revision, precision):
     import sys
     import traceback
     import gradio as gr
+    from src.evaluation.initialize_models import is_model_allowed
 
     if not model_name:
-        return "Please
+        return "Please select a model.", gr.update(), gr.update()
+
+    if not is_model_allowed(model_name):
+        return f"❌ Model '{model_name}' is not in the allowed list. Please select from the dropdown.", gr.update(), gr.update()
 
     try:
         # Use stderr for more reliable logging in HF Spaces
@@ -125,7 +129,7 @@ def run_perplexity_test(model_name, revision, precision):
 
     π **Results have been saved and both tables have been updated!**
 
-    Note
+    ⏰ **Note**: Model trace p-value computation runs a full model comparison analysis and may take 10-30 minutes per model. Progress will appear in the logs."""
 
         return success_msg, gr.update(value=updated_df), gr.update(value=updated_df)
     else:
@@ -167,9 +171,17 @@ except Exception as e:
     # Ensure local directory exists even if repo operations fail
     os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
 
-#
+# Initialize allowed models
 import sys
+from src.evaluation.initialize_models import initialize_allowed_models, get_allowed_models
+
 sys.stderr.write("\nπ STARTING GRADIO APP INITIALIZATION\n")
+sys.stderr.write("π Initializing allowed models...\n")
+sys.stderr.flush()
+
+# Initialize the allowed models
+initialize_allowed_models()
+
 sys.stderr.write("π Creating initial results DataFrame...\n")
 sys.stderr.flush()
 
@@ -202,11 +214,17 @@ with demo:
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🧪 Test Model", elem_id="test-model-tab", id=2):
-            gr.Markdown("## Run Perplexity Test\n\nTest
+            gr.Markdown("## Run Perplexity Test\n\nTest one of the supported models for perplexity evaluation.")
+
+            allowed_models = get_allowed_models()
 
             with gr.Row():
                 with gr.Column():
-                    model_name = gr.
+                    model_name = gr.Dropdown(
+                        choices=allowed_models,
+                        label="Model name",
+                        value=allowed_models[0] if allowed_models else None
+                    )
                     revision = gr.Textbox(label="Revision", placeholder="main", value="main")
                     precision = gr.Dropdown(
                         choices=["float16", "bfloat16"],
@@ -231,13 +249,14 @@ with demo:
                 ### Tips:
                 - **Check stderr logs** in HF Spaces for detailed debugging information
                 - **Results will update automatically** in the table above after evaluation completes
-                - **
+                - **Available models**: Vicuna 7B v1.5, IBM Granite 7B Base, LLeMa 7B
                 - **Lower perplexity scores = better performance** (better at predicting text)
+                - **Model trace p-values are computed automatically** (may take 10-30 minutes)
 
                 ### How it works:
-                1.
+                1. Select a model from the dropdown
                 2. Click "Run Perplexity Test"
-                3. Wait for evaluation to complete (may take a few minutes for
+                3. Wait for evaluation to complete (may take a few minutes for perplexity + longer for p-value)
                 4. Results will appear automatically in the table above!
                 """)
 
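For context, a minimal sketch of the validation gate that the updated run_perplexity_test now applies before doing any evaluation work. The function name validate_request and the inline ALLOWED set are illustrative stand-ins for src.evaluation.initialize_models, not part of this commit:

# Illustrative sketch only: the early-return validation pattern used above,
# stripped of Gradio so it can be run standalone.
ALLOWED = {"lmsys/vicuna-7b-v1.5", "ibm-granite/granite-7b-base", "EleutherAI/llemma_7b"}

def validate_request(model_name: str) -> tuple[bool, str]:
    """Return (ok, message) before any expensive evaluation work starts."""
    if not model_name:
        return False, "Please select a model."
    if model_name not in ALLOWED:
        return False, f"Model '{model_name}' is not in the allowed list."
    return True, "ok"

print(validate_request(""))                       # (False, 'Please select a model.')
print(validate_request("openai-community/gpt2"))  # (False, "Model 'openai-community/gpt2' is not in the allowed list.")
print(validate_request("lmsys/vicuna-7b-v1.5"))   # (True, 'ok')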
src/about.py
CHANGED
@@ -17,37 +17,48 @@ NUM_FEWSHOT = 0 # Not used for perplexity
 # ---------------------------------------------------
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Model
+TITLE = """<h1 align="center" id="space-title">Model Tracing Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This leaderboard evaluates language models based on their perplexity scores
-structural similarity to
+This leaderboard evaluates specific language models based on their perplexity scores and
+structural similarity to Llama-2-7B using model tracing analysis.
 
+**Models Evaluated:**
+- `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
+- `ibm-granite/granite-7b-base` - IBM Granite 7B Base
+- `EleutherAI/llemma_7b` - LLeMa 7B
+
+**Metrics:**
 - **Perplexity**: Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
-- **Match P-Value**: Lower p-values indicate the model preserves structural similarity to
+- **Match P-Value**: Lower p-values indicate the model preserves structural similarity to Llama-2-7B after fine-tuning (neuron organization is maintained).
 """
 
 # Which evaluations are you running?
 LLM_BENCHMARKS_TEXT = """
 ## How it works
 
-The evaluation runs two types of analysis on language models:
+The evaluation runs two types of analysis on the supported language models:
+
+### Supported Models
+- **Vicuna 7B v1.5** (`lmsys/vicuna-7b-v1.5`) - Chat-optimized LLaMA variant
+- **IBM Granite 7B** (`ibm-granite/granite-7b-base`) - IBM's foundational language model
+- **LLeMa 7B** (`EleutherAI/llemma_7b`) - EleutherAI's mathematical language model
 
 ### 1. Perplexity Evaluation
 Perplexity tests using a fixed test passage about artificial intelligence.
 Perplexity measures how well a model predicts text - lower scores mean better predictions.
 
 ### 2. Model Tracing Analysis
-Compares each model's internal structure to
-- **Base Model**:
-- **Comparison**:
+Compares each model's internal structure to Llama-2-7B using the "match" statistic:
+- **Base Model**: Llama-2-7B (`meta-llama/Llama-2-7b-hf`)
+- **Comparison Models**: The 3 supported models listed above
 - **Method**: Neuron matching analysis across transformer layers
 - **Alignment**: Models are aligned before comparison using the Hungarian algorithm
-- **Output**: P-value indicating structural similarity (lower = more similar to
+- **Output**: P-value indicating structural similarity (lower = more similar to Llama-2-7B)
 
 The match statistic tests whether neurons in corresponding layers maintain similar functional roles
-between the base model and
+between the base model and the comparison models.
 
 ## Test Text
 
@@ -62,11 +73,15 @@ with these important social considerations.
 """
 
 EVALUATION_QUEUE_TEXT = """
-##
-
-2. The model should be loadable with AutoModelForCausalLM
-3. The model should support text generation tasks
+## Testing Models
+
+This leaderboard focuses on comparing specific models:
+
+1. **Vicuna 7B v1.5** - Chat-optimized variant of LLaMA
+2. **IBM Granite 7B Base** - IBM's foundational language model
+3. **LLeMa 7B** - EleutherAI's mathematical language model
 
+Use the "Test Model" tab to run perplexity evaluation on any of these models.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
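The alignment step described in LLM_BENCHMARKS_TEXT can be illustrated with a toy example. The sketch below only shows the Hungarian-algorithm matching idea via scipy.optimize.linear_sum_assignment; it is not the model-tracing repository's implementation, which operates on real transformer layers and produces per-layer p-values:

# Toy illustration (assumed example, not from the commit): recover a neuron
# permutation between a "base" layer and a noisily permuted "fine-tuned" layer.
import numpy as np
from scipy.optimize import linear_sum_assignment

rng = np.random.default_rng(0)
base = rng.normal(size=(8, 16))                     # 8 "neurons" (rows) of a base layer
perm = rng.permutation(8)
ft = base[perm] + 0.01 * rng.normal(size=(8, 16))   # fine-tuned layer: permuted + small noise

cost = -base @ ft.T                                 # cost = negative similarity per neuron pair
row_ind, col_ind = linear_sum_assignment(cost)      # Hungarian algorithm
print(np.array_equal(col_ind, np.argsort(perm)))    # should recover the permutation -> True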
src/evaluation/initialize_models.py
ADDED
@@ -0,0 +1,121 @@
+"""
+Initialize the leaderboard with specific models and compute their p-values.
+
+This module ensures only the specified models are included in the leaderboard
+and their model trace p-values are computed.
+"""
+
+import os
+import json
+import sys
+from src.evaluation.model_trace_eval import compute_model_trace_p_value
+from src.envs import EVAL_RESULTS_PATH
+
+# The specific models we want to include
+ALLOWED_MODELS = [
+    "lmsys/vicuna-7b-v1.5",
+    "ibm-granite/granite-7b-base",
+    "EleutherAI/llemma_7b"
+]
+
+def create_model_result_file(model_name, precision="float16"):
+    """
+    Create a result file for a model with computed p-value.
+
+    Args:
+        model_name: HuggingFace model identifier
+        precision: Model precision
+    """
+    sys.stderr.write(f"\nπ§ CREATING RESULT FILE FOR: {model_name}\n")
+    sys.stderr.flush()
+
+    # Create the results directory if it doesn't exist
+    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
+
+    # Generate a safe filename
+    safe_name = model_name.replace("/", "_").replace("-", "_")
+    result_file = os.path.join(EVAL_RESULTS_PATH, f"{safe_name}_{precision}.json")
+
+    sys.stderr.write(f"π Result file path: {result_file}\n")
+    sys.stderr.flush()
+
+    # Check if file already exists
+    if os.path.exists(result_file):
+        sys.stderr.write(f"✅ Result file already exists: {result_file}\n")
+        sys.stderr.flush()
+        return result_file
+
+    # Create basic result structure
+    result_data = {
+        "config": {
+            "model_dtype": f"torch.{precision}",
+            "model_name": model_name,
+            "model_sha": "main"
+        },
+        "results": {
+            "perplexity": {
+                "perplexity": None  # Will be populated when user tests
+            }
+        }
+    }
+
+    # Save the result file
+    try:
+        with open(result_file, 'w') as f:
+            json.dump(result_data, f, indent=2)
+
+        sys.stderr.write(f"✅ Created result file: {result_file}\n")
+        sys.stderr.flush()
+        return result_file
+
+    except Exception as e:
+        sys.stderr.write(f"❌ Failed to create result file: {e}\n")
+        sys.stderr.flush()
+        return None
+
+def initialize_allowed_models():
+    """
+    Initialize result files for all allowed models.
+    """
+    sys.stderr.write(f"\nπ INITIALIZING ALLOWED MODELS\n")
+    sys.stderr.write(f"π Models to initialize: {ALLOWED_MODELS}\n")
+    sys.stderr.flush()
+
+    created_files = []
+
+    for model_name in ALLOWED_MODELS:
+        try:
+            result_file = create_model_result_file(model_name)
+            if result_file:
+                created_files.append(result_file)
+
+        except Exception as e:
+            sys.stderr.write(f"❌ Failed to initialize {model_name}: {e}\n")
+            sys.stderr.flush()
+            continue
+
+    sys.stderr.write(f"✅ Initialized {len(created_files)} model result files\n")
+    sys.stderr.flush()
+
+    return created_files
+
+def is_model_allowed(model_name):
+    """
+    Check if a model is in the allowed list.
+
+    Args:
+        model_name: HuggingFace model identifier
+
+    Returns:
+        bool: True if model is allowed
+    """
+    return model_name in ALLOWED_MODELS
+
+def get_allowed_models():
+    """
+    Get the list of allowed models.
+
+    Returns:
+        list: List of allowed model names
+    """
+    return ALLOWED_MODELS.copy()
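As wired up elsewhere in this commit (app.py calls initialize_allowed_models() at startup and get_allowed_models() to populate the dropdown), the new module is consumed roughly as below; the printed values follow from ALLOWED_MODELS above, and the calling context is an illustrative sketch rather than code from the repository:

# Usage sketch for the new module, mirroring how app.py consumes it.
from src.evaluation.initialize_models import (
    get_allowed_models,
    initialize_allowed_models,
    is_model_allowed,
)

created = initialize_allowed_models()   # writes one placeholder *_float16.json per allowed model
print(len(created))                     # normally 3; fewer only if a file could not be written
print(get_allowed_models())             # ['lmsys/vicuna-7b-v1.5', 'ibm-granite/granite-7b-base', 'EleutherAI/llemma_7b']
print(is_model_allowed("lmsys/vicuna-7b-v1.5"))   # True
print(is_model_allowed("openai-community/gpt2"))  # False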
src/evaluation/model_trace_eval.py
CHANGED
@@ -1,8 +1,8 @@
 """
 Model tracing evaluation for computing p-values from neuron matching statistics.
 
-This module runs the model-tracing comparison
-
+This module runs the model-tracing comparison using the main.py script from model-tracing
+to determine structural similarity via p-value analysis.
 """
 
 import os
@@ -10,49 +10,26 @@ import sys
 import subprocess
 import tempfile
 import pickle
-import
-from transformers import AutoTokenizer, AutoModelForCausalLM
+import statistics
 
-#
+# Check if model-tracing directory exists
 model_tracing_path = os.path.join(os.path.dirname(__file__), '../../model-tracing')
-
-sys.path.append(model_tracing_path)
+MODEL_TRACING_AVAILABLE = os.path.exists(model_tracing_path) and os.path.exists(os.path.join(model_tracing_path, 'main.py'))
 
-sys.stderr.write("π§
-sys.stderr.
-
-    sys.stderr.write(" - Importing tracing.utils.llama.model...\n")
-    from tracing.utils.llama.model import permute_model, rotate_model
-
-    sys.stderr.write(" - Importing tracing.utils.llama.matching...\n")
-    from tracing.utils.llama.matching import align_model
-
-    sys.stderr.write(" - Importing tracing.utils.evaluate...\n")
-    from tracing.utils.evaluate import prepare_hf_dataset, prepare_hf_dataloader
-
-    sys.stderr.write(" - Importing tracing.utils.utils...\n")
-    from tracing.utils.utils import manual_seed
-
-    sys.stderr.write(" - Importing tracing.statistics.match...\n")
-    from tracing.statistics.match import statistic as match_stat
-
-    MODEL_TRACING_AVAILABLE = True
-    sys.stderr.write("✅ ALL MODEL TRACING IMPORTS SUCCESSFUL\n")
-
-except ImportError as e:
-    sys.stderr.write(f"❌ MODEL TRACING IMPORTS FAILED: {e}\n")
-    import traceback
-    sys.stderr.write(f"Full import traceback:\n{traceback.format_exc()}\n")
-    MODEL_TRACING_AVAILABLE = False
-
+sys.stderr.write("π§ CHECKING MODEL TRACING AVAILABILITY...\n")
+sys.stderr.write(f" - Model tracing path: {model_tracing_path}\n")
+sys.stderr.write(f" - Path exists: {os.path.exists(model_tracing_path)}\n")
+sys.stderr.write(f" - main.py exists: {os.path.exists(os.path.join(model_tracing_path, 'main.py'))}\n")
 sys.stderr.write(f"π― Final MODEL_TRACING_AVAILABLE = {MODEL_TRACING_AVAILABLE}\n")
 sys.stderr.flush()
 
 
 def run_model_trace_analysis(ft_model_name, revision="main", precision="float16"):
     """
-    Run model tracing analysis
+    Run model tracing analysis using the main.py script from model-tracing directory.
+
+    Runs the exact command:
+    python main.py --base_model_id meta-llama/Llama-2-7b-hf --ft_model_id <ft_model_name> --stat match --align
 
     Args:
         ft_model_name: HuggingFace model identifier for the fine-tuned model
@@ -61,197 +38,175 @@ def run_model_trace_analysis(ft_model_name, revision="main", precision="float16"):
 
     Returns:
         tuple: (success: bool, result: float or error_message)
-        If success, result is the aggregate p-value
+        If success, result is the aggregate p-value from aligned test stat
         If failure, result is error message
     """
 
     if not MODEL_TRACING_AVAILABLE:
-        return False, "Model tracing
+        return False, "Model tracing main.py script not available"
 
     try:
-        sys.stderr.write(f"\n=== RUNNING MODEL TRACE ANALYSIS ===\n")
-        sys.stderr.write(f"Base model:
+        sys.stderr.write(f"\n=== RUNNING MODEL TRACE ANALYSIS VIA SUBPROCESS ===\n")
+        sys.stderr.write(f"Base model: meta-llama/Llama-2-7b-hf\n")
         sys.stderr.write(f"Fine-tuned model: {ft_model_name}\n")
         sys.stderr.write(f"Revision: {revision}\n")
         sys.stderr.write(f"Precision: {precision}\n")
         sys.stderr.flush()
 
-        # Determine dtype
-        if precision == "bfloat16":
-            dtype = torch.bfloat16
-        else:
-            dtype = torch.float16
-
-        # Load base model (gpt2)
-        base_model_id = "openai-community/gpt2"
-        sys.stderr.write(f"π€ Loading base model: {base_model_id}\n")
-        sys.stderr.write(f" - dtype: {dtype}\n")
-        sys.stderr.write(f" - low_cpu_mem_usage: True\n")
-        sys.stderr.flush()
-
-        try:
-            base_model = AutoModelForCausalLM.from_pretrained(
-                base_model_id,
-                torch_dtype=dtype,
-                low_cpu_mem_usage=True
-            )
-            sys.stderr.write("✅ Base model loaded successfully\n")
-        except Exception as e:
-            sys.stderr.write(f"❌ Failed to load base model: {e}\n")
-            raise
-
-        try:
-            base_tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False)
-            sys.stderr.write("✅ Base tokenizer loaded successfully\n")
-        except Exception as e:
-            sys.stderr.write(f"❌ Failed to load base tokenizer: {e}\n")
-            raise
-
-        # Load fine-tuned model
-        sys.stderr.write(f"π€ Loading fine-tuned model: {ft_model_name}\n")
-        sys.stderr.write(f" - revision: {revision}\n")
-        sys.stderr.write(f" - dtype: {dtype}\n")
-        sys.stderr.write(f" - low_cpu_mem_usage: True\n")
-        sys.stderr.flush()
-
-        try:
-            ft_model = AutoModelForCausalLM.from_pretrained(
-                ft_model_name,
-                revision=revision,
-                torch_dtype=dtype,
-                low_cpu_mem_usage=True
-            )
-            sys.stderr.write("✅ Fine-tuned model loaded successfully\n")
-        except Exception as e:
-            sys.stderr.write(f"❌ Failed to load fine-tuned model: {e}\n")
-            raise
-
-        try:
-            ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_name, revision=revision, use_fast=False)
-            sys.stderr.write("✅ Fine-tuned tokenizer loaded successfully\n")
-        except Exception as e:
-            sys.stderr.write(f"❌ Failed to load fine-tuned tokenizer: {e}\n")
-            raise
-
-        sys.stderr.write("π― ALL MODELS AND TOKENIZERS LOADED SUCCESSFULLY\n")
-
-        # Show memory info if available
-        if torch.cuda.is_available():
-            memory_allocated = torch.cuda.memory_allocated() / 1024**3  # GB
-            memory_reserved = torch.cuda.memory_reserved() / 1024**3  # GB
-            sys.stderr.write(f"πΎ GPU Memory - Allocated: {memory_allocated:.2f}GB, Reserved: {memory_reserved:.2f}GB\n")
-        sys.stderr.flush()
-
-        sys.stderr.write("Running model alignment...\n")
-        sys.stderr.flush()
-        try:
-            sys.stderr.write(f"Model alignment failed: {e}\n")
-            sys.stderr.write("Continuing without alignment...\n")
-            sys.stderr.flush()
-
-        # Run match statistic
-        sys.stderr.write("Computing match statistic...\n")
-        sys.stderr.flush()
-
-        # Get number of layers for the models
-        if hasattr(base_model, 'transformer') and hasattr(base_model.transformer, 'h'):
-            # GPT-2 style
-            n_blocks = len(base_model.transformer.h)
-        elif hasattr(base_model, 'model') and hasattr(base_model.model, 'layers'):
-            # LLaMA style
-            n_blocks = len(base_model.model.layers)
-        else:
-            # Default fallback
-            n_blocks = 12  # GPT-2 base has 12 layers
-
-            n_blocks = min(n_blocks, ft_n_blocks)
-
-            p_values = match_stat(base_model, ft_model, dataloader, n_blocks=n_blocks)
-        except Exception as e:
-            sys.stderr.write(f"Match statistic computation failed: {e}\n")
-            sys.stderr.flush()
-
-            sys.stderr.write("No valid p-values found, returning default\n")
-            sys.stderr.flush()
-            return True, 1.0
-
-        from tracing.utils.utils import fisher
-        try:
-
+        # Create a temporary file for results
+        with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as tmp_file:
+            tmp_results_path = tmp_file.name
+
+        sys.stderr.write(f"π Temporary results file: {tmp_results_path}\n")
+        sys.stderr.flush()
+
+        # Build the command exactly as user specified
+        base_model_id = "meta-llama/Llama-2-7b-hf"
+
+        # Build the command
+        cmd = [
+            "python", "main.py",
+            "--base_model_id", base_model_id,
+            "--ft_model_id", ft_model_name,
+            "--stat", "match",
+            "--save", tmp_results_path
+        ]
+
+        # Add revision if not main/default
+        if revision and revision != "main":
+            # Note: main.py doesn't seem to have a revision flag, but we log it for reference
+            sys.stderr.write(f"⚠️ Note: Revision '{revision}' specified but main.py doesn't support --revision flag\n")
+            sys.stderr.flush()
+
+        sys.stderr.write(f"π Running command: {' '.join(cmd)}\n")
+        sys.stderr.flush()
+
+        # Change to model-tracing directory and run the command
+        original_cwd = os.getcwd()
+        try:
+            os.chdir(model_tracing_path)
+            sys.stderr.write(f"π Changed to directory: {model_tracing_path}\n")
+            sys.stderr.flush()
+
+            # Run the subprocess
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=3600  # 1 hour timeout
+            )
+
+            sys.stderr.write(f"π Subprocess completed with return code: {result.returncode}\n")
+
+            # Log stdout and stderr from the subprocess
+            if result.stdout:
+                sys.stderr.write(f"π STDOUT from model tracing:\n{result.stdout}\n")
+            if result.stderr:
+                sys.stderr.write(f"⚠️ STDERR from model tracing:\n{result.stderr}\n")
+            sys.stderr.flush()
+
+            if result.returncode != 0:
+                error_msg = f"Model tracing script failed with return code {result.returncode}"
+                if result.stderr:
+                    error_msg += f"\nSTDERR: {result.stderr}"
+                return False, error_msg
+
+        finally:
+            os.chdir(original_cwd)
+            sys.stderr.write(f"π Changed back to directory: {original_cwd}\n")
+            sys.stderr.flush()
+
+        # Load and parse the results
+        try:
+            sys.stderr.write(f"π Loading results from: {tmp_results_path}\n")
+            sys.stderr.flush()
+
+            with open(tmp_results_path, 'rb') as f:
+                results = pickle.load(f)
+
+            sys.stderr.write(f"✅ Results loaded successfully\n")
+            sys.stderr.write(f"π Available result keys: {list(results.keys())}\n")
+            sys.stderr.flush()
+
+            # Get the aligned test stat (this is what we want with --align flag)
+            if "aligned test stat" in results:
+                aligned_stat = results["aligned test stat"]
+                sys.stderr.write(f"π Aligned test stat: {aligned_stat}\n")
+                sys.stderr.write(f"π Type: {type(aligned_stat)}\n")
+
+                # The match statistic returns a list of p-values per layer
+                if isinstance(aligned_stat, list):
+                    sys.stderr.write(f"π List of {len(aligned_stat)} p-values: {aligned_stat}\n")
+
+                    # Filter valid p-values
+                    valid_p_values = [p for p in aligned_stat if p is not None and isinstance(p, (int, float)) and 0 <= p <= 1]
+                    sys.stderr.write(f"π Valid p-values: {len(valid_p_values)}/{len(aligned_stat)}\n")
+
+                    if valid_p_values:
+                        # Use median as the representative p-value
+                        aggregate_p_value = statistics.median(valid_p_values)
+                        sys.stderr.write(f"π Using median p-value: {aggregate_p_value}\n")
+                    else:
+                        sys.stderr.write("⚠️ No valid p-values found, using default\n")
+                        aggregate_p_value = 1.0
+
+                elif isinstance(aligned_stat, (int, float)):
+                    aggregate_p_value = float(aligned_stat)
+                    sys.stderr.write(f"π Using single p-value: {aggregate_p_value}\n")
+                else:
+                    sys.stderr.write(f"⚠️ Unexpected aligned_stat type: {type(aligned_stat)}, using default\n")
+                    aggregate_p_value = 1.0
+
+            else:
+                sys.stderr.write("⚠️ No 'aligned test stat' found in results, checking non-aligned\n")
+                if "non-aligned test stat" in results:
+                    non_aligned_stat = results["non-aligned test stat"]
+                    sys.stderr.write(f"π Using non-aligned test stat: {non_aligned_stat}\n")
+
+                    if isinstance(non_aligned_stat, list):
+                        valid_p_values = [p for p in non_aligned_stat if p is not None and isinstance(p, (int, float)) and 0 <= p <= 1]
+                        if valid_p_values:
+                            aggregate_p_value = statistics.median(valid_p_values)
+                        else:
+                            aggregate_p_value = 1.0
+                    else:
+                        aggregate_p_value = float(non_aligned_stat) if isinstance(non_aligned_stat, (int, float)) else 1.0
+                else:
+                    sys.stderr.write("❌ No test stat found in results\n")
+                    return False, "No test statistic found in results"
+
+            sys.stderr.flush()
+
+        except Exception as e:
+            sys.stderr.write(f"❌ Failed to load results: {e}\n")
+            sys.stderr.flush()
+            return False, f"Failed to load results: {e}"
+
+        finally:
+            # Clean up temporary file
+            try:
+                os.unlink(tmp_results_path)
+                sys.stderr.write(f"ποΈ Cleaned up temporary file: {tmp_results_path}\n")
+            except:
+                pass
+
-        sys.stderr.write(f"
+        sys.stderr.write(f"✅ Final aggregate p-value: {aggregate_p_value}\n")
         sys.stderr.write("=== MODEL TRACE ANALYSIS COMPLETED ===\n")
         sys.stderr.flush()
 
-        # Clean up memory
-        del base_model
-        del ft_model
-        torch.cuda.empty_cache() if torch.cuda.is_available() else None
-
         return True, aggregate_p_value
 
+    except subprocess.TimeoutExpired:
+        sys.stderr.write("❌ Model tracing analysis timed out after 1 hour\n")
+        sys.stderr.flush()
+        return False, "Analysis timed out"
+
     except Exception as e:
         error_msg = str(e)
-        sys.stderr.write(f"Error in model trace analysis: {error_msg}\n")
+        sys.stderr.write(f"π₯ Error in model trace analysis: {error_msg}\n")
         import traceback
         sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
         sys.stderr.flush()
-
-        # Clean up memory even on error
-        try:
-            torch.cuda.empty_cache() if torch.cuda.is_available() else None
-        except:
-            pass
-
         return False, error_msg
 
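A short usage sketch of the rewritten entry point; the (success, result) contract and the subprocess command come from the code above, while the calling context and the chosen model are illustrative:

from src.evaluation.model_trace_eval import run_model_trace_analysis

# Runs `python main.py --base_model_id meta-llama/Llama-2-7b-hf --ft_model_id lmsys/vicuna-7b-v1.5 --stat match --save <tmp>.pkl`
# inside the model-tracing checkout; may take 10-30+ minutes and needs the model weights to be downloadable.
success, result = run_model_trace_analysis("lmsys/vicuna-7b-v1.5", revision="main", precision="float16")
if success:
    print(f"Aggregate match p-value vs Llama-2-7B: {result}")   # median of the per-layer p-values
else:
    print(f"Model tracing failed: {result}")                    # error message string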
src/leaderboard/read_evals.py
CHANGED
@@ -8,6 +8,7 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 from src.evaluation.model_trace_eval import compute_model_trace_p_value
+from src.evaluation.initialize_models import is_model_allowed
 
 @dataclass
 class EvalResult:
@@ -236,6 +237,13 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
         try:
             sys.stderr.write(f"\nConverting result to dict for: {v.full_model}\n")
             sys.stderr.flush()
+
+            # Filter to only allowed models
+            if not is_model_allowed(v.full_model):
+                sys.stderr.write(f"βοΈ Skipping non-allowed model: {v.full_model}\n")
+                sys.stderr.flush()
+                continue
+
             v.to_dict()  # we test if the dict version is complete
             results.append(v)
             sys.stderr.write("Successfully converted and added result\n")
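The effect of the new filter, sketched with plain strings standing in for EvalResult objects: any result file whose model is not in ALLOWED_MODELS is skipped before it reaches the leaderboard table.

from src.evaluation.initialize_models import is_model_allowed

found = ["lmsys/vicuna-7b-v1.5", "openai-community/gpt2", "EleutherAI/llemma_7b"]
kept = [m for m in found if is_model_allowed(m)]
print(kept)  # ['lmsys/vicuna-7b-v1.5', 'EleutherAI/llemma_7b']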
test_model_trace.py
DELETED
@@ -1,43 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test script for model tracing integration.
-Tests the p-value computation for a simple model comparison.
-"""
-
-import sys
-import os
-
-# Add src to path
-sys.path.append('src')
-
-from evaluation.model_trace_eval import compute_model_trace_p_value
-
-def test_model_trace():
-    """Test the model trace p-value computation with a simple example."""
-
-    print("Testing model trace p-value computation...")
-
-    # Test with a simple model (should be fast)
-    test_model = "openai-community/gpt2"
-
-    print(f"Computing p-value for {test_model} vs GPT-2...")
-
-    try:
-        p_value = compute_model_trace_p_value(test_model, "main", "float16")
-
-        if p_value is not None:
-            print(f"✅ Success! P-value: {p_value}")
-            if 0 <= p_value <= 1:
-                print("✅ P-value is in valid range [0, 1]")
-            else:
-                print(f"⚠️ Warning: P-value {p_value} is outside expected range [0, 1]")
-        else:
-            print("❌ Failed: P-value is None")
-
-    except Exception as e:
-        print(f"❌ Error: {e}")
-        import traceback
-        traceback.print_exc()
-
-if __name__ == "__main__":
-    test_model_trace()