Ahmed Ahmed committed on
Commit
63076cf
·
1 Parent(s): 1191811
Files changed (3) hide show
  1. src/about.py +2 -10
  2. src/leaderboard/read_evals.py +8 -32
  3. src/populate.py +10 -19
src/about.py CHANGED
@@ -1,16 +1,9 @@
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
4
- @dataclass
5
- class Task:
6
- benchmark: str
7
- metric: str
8
- col_name: str
9
-
10
- # Select your tasks here
11
  # ---------------------------------------------------
12
  class Tasks(Enum):
13
- # No tasks - we only care about p-values
14
  pass
15
 
16
  NUM_FEWSHOT = 0 # Not used
@@ -21,8 +14,7 @@ TITLE = """<h1 align="center" id="space-title">Model Tracing Leaderboard</h1>"""
21
 
22
  # What does your leaderboard evaluate?
23
  INTRODUCTION_TEXT = """
24
- This leaderboard evaluates specific language models based on their perplexity scores and
25
- structural similarity to Llama-2-7B using model tracing analysis.
26
 
27
  **Models Evaluated:**
28
  - `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
 
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
4
+ # NO TASKS - ONLY P-VALUES
 
 
 
 
 
 
5
  # ---------------------------------------------------
6
  class Tasks(Enum):
 
7
  pass
8
 
9
  NUM_FEWSHOT = 0 # Not used
 
14
 
15
  # What does your leaderboard evaluate?
16
  INTRODUCTION_TEXT = """
17
+ This leaderboard evaluates specific language models based on their structural similarity to Llama-2-7B using model tracing analysis.
 
18
 
19
  **Models Evaluated:**
20
  - `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
src/leaderboard/read_evals.py CHANGED
@@ -75,23 +75,14 @@ class EvalResult:
75
  )
76
 
77
  def to_dict(self):
78
- """Converts the Eval Result to a dict compatible with our dataframe display"""
79
  import sys
80
 
81
- sys.stderr.write(f"\n=== PROCESSING RESULT TO_DICT ===\n")
82
  sys.stderr.write(f"Processing result for model: {self.full_model}\n")
83
- sys.stderr.write(f"Raw results: {self.results}\n")
84
- sys.stderr.write(f"Model precision: {self.precision}\n")
85
- sys.stderr.write(f"Model type: {self.model_type}\n")
86
- sys.stderr.write(f"Weight type: {self.weight_type}\n")
87
  sys.stderr.flush()
88
 
89
- # No task-based scoring - we only care about p-values
90
- average = 0 # Default average since we don't have tasks
91
- sys.stderr.write(f"No task-based scoring, using default average: {average}\n")
92
- sys.stderr.flush()
93
-
94
- # Create data dictionary with comprehensive debugging
95
  data_dict = {}
96
 
97
  # Add core columns
@@ -103,7 +94,6 @@ class EvalResult:
103
  data_dict[AutoEvalColumn.architecture.name] = self.architecture
104
  data_dict[AutoEvalColumn.model.name] = make_clickable_model(self.full_model)
105
  data_dict[AutoEvalColumn.revision.name] = self.revision
106
- data_dict[AutoEvalColumn.average.name] = average
107
  data_dict[AutoEvalColumn.still_on_hub.name] = self.still_on_hub
108
 
109
  # Add default values for missing model info
@@ -112,9 +102,7 @@ class EvalResult:
112
  data_dict[AutoEvalColumn.likes.name] = 0
113
 
114
  # Compute model trace p-value
115
- sys.stderr.write(f"\n🧬 COMPUTING MODEL TRACE P-VALUE FOR: {self.full_model}\n")
116
- sys.stderr.write(f" - Revision: {self.revision if self.revision else 'main'}\n")
117
- sys.stderr.write(f" - Precision: {self.precision.value.name.lower()}\n")
118
  sys.stderr.flush()
119
 
120
  try:
@@ -125,29 +113,17 @@ class EvalResult:
125
  )
126
 
127
  if model_trace_p_value is not None:
128
- sys.stderr.write(f"✅ Model trace p-value computed successfully: {model_trace_p_value}\n")
129
  else:
130
- sys.stderr.write(f"⚠️ Model trace p-value is None (computation failed or not available)\n")
131
 
132
  except Exception as e:
133
- sys.stderr.write(f"💥 Exception during model trace p-value computation: {e}\n")
134
- import traceback
135
- sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
136
  model_trace_p_value = None
137
 
138
  data_dict[AutoEvalColumn.model_trace_p_value.name] = model_trace_p_value
139
- sys.stderr.write(f"📝 Added to data_dict: {AutoEvalColumn.model_trace_p_value.name} = {model_trace_p_value}\n")
140
- sys.stderr.flush()
141
 
142
- sys.stderr.write(f"Created base data_dict with {len(data_dict)} columns\n")
143
- sys.stderr.flush()
144
-
145
- # No task-specific scores - we only have p-values
146
- sys.stderr.write("No task-specific scores to add\n")
147
- sys.stderr.flush()
148
-
149
- sys.stderr.write(f"Final data dict has {len(data_dict)} columns: {list(data_dict.keys())}\n")
150
- sys.stderr.write(f"=== END PROCESSING RESULT TO_DICT ===\n")
151
  sys.stderr.flush()
152
 
153
  return data_dict
 
75
  )
76
 
77
  def to_dict(self):
78
+ """Converts the Eval Result to a dict compatible with our dataframe display - P-VALUES ONLY"""
79
  import sys
80
 
81
+ sys.stderr.write(f"\n=== PROCESSING RESULT TO_DICT (P-VALUES ONLY) ===\n")
82
  sys.stderr.write(f"Processing result for model: {self.full_model}\n")
 
 
 
 
83
  sys.stderr.flush()
84
 
85
+ # Create data dictionary - NO TASK PROCESSING AT ALL
 
 
 
 
 
86
  data_dict = {}
87
 
88
  # Add core columns
 
94
  data_dict[AutoEvalColumn.architecture.name] = self.architecture
95
  data_dict[AutoEvalColumn.model.name] = make_clickable_model(self.full_model)
96
  data_dict[AutoEvalColumn.revision.name] = self.revision
 
97
  data_dict[AutoEvalColumn.still_on_hub.name] = self.still_on_hub
98
 
99
  # Add default values for missing model info
 
102
  data_dict[AutoEvalColumn.likes.name] = 0
103
 
104
  # Compute model trace p-value
105
+ sys.stderr.write(f"🧬 COMPUTING MODEL TRACE P-VALUE FOR: {self.full_model}\n")
 
 
106
  sys.stderr.flush()
107
 
108
  try:
 
113
  )
114
 
115
  if model_trace_p_value is not None:
116
+ sys.stderr.write(f"✅ P-value: {model_trace_p_value}\n")
117
  else:
118
+ sys.stderr.write(f"⚠️ P-value computation failed\n")
119
 
120
  except Exception as e:
121
+ sys.stderr.write(f"💥 Exception during p-value computation: {e}\n")
 
 
122
  model_trace_p_value = None
123
 
124
  data_dict[AutoEvalColumn.model_trace_p_value.name] = model_trace_p_value
 
 
125
 
126
+ sys.stderr.write(f"=== END PROCESSING - ONLY P-VALUES ===\n")
 
 
 
 
 
 
 
 
127
  sys.stderr.flush()
128
 
129
  return data_dict
src/populate.py CHANGED
@@ -37,13 +37,13 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> p
37
  sys.stderr.write(f"WARNING: Result for {v.full_model} missing columns: {missing_cols}\n")
38
  # Add missing columns with default values
39
  for col in missing_cols:
40
- if col in benchmark_cols or col == AutoEvalColumn.average.name:
41
  data_dict[col] = 0.0
42
  elif col == AutoEvalColumn.model_type_symbol.name:
43
  data_dict[col] = "?"
44
  else:
45
  data_dict[col] = ""
46
- sys.stderr.flush()
47
 
48
  all_data_json.append(data_dict)
49
  sys.stderr.write(f"Successfully processed result {i+1}/{len(raw_data)}: {v.full_model}\n")
@@ -79,14 +79,11 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> p
79
  return create_empty_dataframe(cols, benchmark_cols)
80
 
81
  try:
82
- if AutoEvalColumn.average.name in df.columns:
83
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
84
- sys.stderr.write("\nSorted DataFrame by average\n")
85
- else:
86
- sys.stderr.write(f"\nWARNING: Cannot sort by {AutoEvalColumn.average.name} - column not found\n")
87
  sys.stderr.flush()
88
  except Exception as e:
89
- sys.stderr.write(f"\nError sorting DataFrame: {e}\n")
90
  sys.stderr.write("Available columns: " + str(df.columns.tolist()) + "\n")
91
  sys.stderr.flush()
92
 
@@ -111,16 +108,10 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> p
111
  sys.stderr.flush()
112
  return create_empty_dataframe(cols, benchmark_cols)
113
 
114
- try:
115
- # filter out if perplexity hasn't been evaluated
116
- df = df[has_no_nan_values(df, benchmark_cols)]
117
- sys.stderr.write("\nFinal DataFrame shape after filtering: " + str(df.shape) + "\n")
118
- sys.stderr.write("Final columns: " + str(df.columns.tolist()) + "\n")
119
- sys.stderr.flush()
120
- except Exception as e:
121
- sys.stderr.write(f"Error filtering DataFrame: {e}\n")
122
- sys.stderr.flush()
123
- # Don't return empty, return the unfiltered DataFrame
124
 
125
  # Final validation
126
  if df is None or df.empty:
@@ -150,7 +141,7 @@ def create_empty_dataframe(cols: list, benchmark_cols: list) -> pd.DataFrame:
150
  empty_df = pd.DataFrame(columns=cols)
151
  # Ensure correct column types
152
  for col in cols:
153
- if col in benchmark_cols or col == AutoEvalColumn.average.name:
154
  empty_df[col] = pd.Series(dtype=float)
155
  else:
156
  empty_df[col] = pd.Series(dtype=str)
 
37
  sys.stderr.write(f"WARNING: Result for {v.full_model} missing columns: {missing_cols}\n")
38
  # Add missing columns with default values
39
  for col in missing_cols:
40
+ if col in benchmark_cols:
41
  data_dict[col] = 0.0
42
  elif col == AutoEvalColumn.model_type_symbol.name:
43
  data_dict[col] = "?"
44
  else:
45
  data_dict[col] = ""
46
+ sys.stderr.flush()
47
 
48
  all_data_json.append(data_dict)
49
  sys.stderr.write(f"Successfully processed result {i+1}/{len(raw_data)}: {v.full_model}\n")
 
79
  return create_empty_dataframe(cols, benchmark_cols)
80
 
81
  try:
82
+ # No sorting needed - we only have p-values
83
+ sys.stderr.write("\nNo sorting applied - only p-values\n")
 
 
 
84
  sys.stderr.flush()
85
  except Exception as e:
86
+ sys.stderr.write(f"\nError with DataFrame: {e}\n")
87
  sys.stderr.write("Available columns: " + str(df.columns.tolist()) + "\n")
88
  sys.stderr.flush()
89
 
 
108
  sys.stderr.flush()
109
  return create_empty_dataframe(cols, benchmark_cols)
110
 
111
+ # No filtering needed - we only have p-values
112
+ sys.stderr.write("\nFinal DataFrame shape (no filtering): " + str(df.shape) + "\n")
113
+ sys.stderr.write("Final columns: " + str(df.columns.tolist()) + "\n")
114
+ sys.stderr.flush()
 
 
 
 
 
 
115
 
116
  # Final validation
117
  if df is None or df.empty:
 
141
  empty_df = pd.DataFrame(columns=cols)
142
  # Ensure correct column types
143
  for col in cols:
144
+ if col in benchmark_cols:
145
  empty_df[col] = pd.Series(dtype=float)
146
  else:
147
  empty_df[col] = pd.Series(dtype=str)