# model_trace/src/about.py
# Author: Ahmed Ahmed
# RETRY
# Revision: 63076cf
from dataclasses import dataclass
from enum import Enum
# NO TASKS - ONLY P-VALUES
# ---------------------------------------------------
class Tasks(Enum):
    """Deliberately empty: this leaderboard reports only p-values, not benchmark tasks."""
# Few-shot example count; kept for leaderboard-template compatibility.
NUM_FEWSHOT: int = 0  # Not used
# ---------------------------------------------------
# Your leaderboard name (rendered as HTML by the UI)
TITLE: str = """<h1 align="center" id="space-title">Model Tracing Leaderboard</h1>"""
# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
This leaderboard evaluates specific language models based on their structural similarity to Llama-2-7B using model tracing analysis.
**Models Evaluated:**
- `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
- `ibm-granite/granite-7b-base` - IBM Granite 7B Base
- `EleutherAI/llemma_7b` - LLeMa 7B
**Metric:**
- **Match P-Value**: Lower p-values indicate the model preserves structural similarity to Llama-2-7B after fine-tuning (neuron organization is maintained).
"""
# Which evaluations are you running? (markdown for the "About" tab)
# NOTE: blank lines before headings/lists keep strict markdown renderers
# from merging them into the previous paragraph; "Llemma" matches the
# official model name for `EleutherAI/llemma_7b`.
LLM_BENCHMARKS_TEXT: str = """
## How it works

The evaluation runs model tracing analysis on the supported language models:

### Supported Models

- **Vicuna 7B v1.5** (`lmsys/vicuna-7b-v1.5`) - Chat-optimized LLaMA variant
- **IBM Granite 7B** (`ibm-granite/granite-7b-base`) - IBM's foundational language model
- **Llemma 7B** (`EleutherAI/llemma_7b`) - EleutherAI's mathematical language model

### Model Tracing Analysis

Compares each model's internal structure to Llama-2-7B using the "match" statistic:

- **Base Model**: Llama-2-7B (`meta-llama/Llama-2-7b-hf`)
- **Comparison Models**: The 3 supported models listed above
- **Method**: Neuron matching analysis across transformer layers
- **Alignment**: Models are aligned before comparison using the Hungarian algorithm
- **Output**: P-value indicating structural similarity (lower = more similar to Llama-2-7B)

The match statistic tests whether neurons in corresponding layers maintain similar functional roles
between the base model and the comparison models.
"""
# Markdown shown on the submission/queue tab; "Llemma" matches the official
# model name for `EleutherAI/llemma_7b`, and blank lines keep the numbered
# list from being folded into the preceding paragraph by strict renderers.
EVALUATION_QUEUE_TEXT: str = """
## Model Analysis

This leaderboard analyzes structural similarity between specific models and Llama-2-7B:

1. **Vicuna 7B v1.5** - Chat-optimized variant of LLaMA
2. **IBM Granite 7B Base** - IBM's foundational language model
3. **Llemma 7B** - EleutherAI's mathematical language model

The p-values are computed automatically using the model tracing analysis.
"""
# Label shown on the UI's citation copy button.
CITATION_BUTTON_LABEL: str = "Copy the following snippet to cite these results"
# Citation snippet copied by the button; empty here (no citation provided yet).
CITATION_BUTTON_TEXT: str = ""