Space status: Runtime error
Ahmed Ahmed committed · Commit 63076cf · Parent: 1191811

RETRY

Files changed:
- src/about.py (+2 -10)
- src/leaderboard/read_evals.py (+8 -32)
- src/populate.py (+10 -19)
src/about.py CHANGED

@@ -1,16 +1,9 @@
 from dataclasses import dataclass
 from enum import Enum
 
-
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-# Select your tasks here
+# NO TASKS - ONLY P-VALUES
 # ---------------------------------------------------
 class Tasks(Enum):
-    # No tasks - we only care about p-values
     pass
 
 NUM_FEWSHOT = 0 # Not used
@@ -21,8 +14,7 @@ TITLE = """<h1 align="center" id="space-title">Model Tracing Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This leaderboard evaluates specific language models based on their
-structural similarity to Llama-2-7B using model tracing analysis.
+This leaderboard evaluates specific language models based on their structural similarity to Llama-2-7B using model tracing analysis.
 
 **Models Evaluated:**
 - `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
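Why emptying `Tasks` is safe: Spaces built on HF's demo-leaderboard layout typically derive their benchmark columns by iterating over the `Tasks` enum, and iterating an empty `Enum` simply yields nothing. A minimal sketch of that behavior (the `benchmark_cols` comprehension is an assumption about how the surrounding template consumes `Tasks`, not code from this commit):

from enum import Enum

class Tasks(Enum):
    # No members: iteration yields nothing, so no benchmark
    # columns are ever generated downstream.
    pass

# Typical template-style consumption becomes a harmless no-op:
benchmark_cols = [task.value.col_name for task in Tasks]
print(benchmark_cols)  # []

Because the comprehension body never runs for an empty enum, the commit can delete the `Task` dataclass outright instead of stubbing it.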
src/leaderboard/read_evals.py CHANGED

@@ -75,23 +75,14 @@ class EvalResult:
         )
 
     def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
+        """Converts the Eval Result to a dict compatible with our dataframe display - P-VALUES ONLY"""
         import sys
 
-        sys.stderr.write(f"\n=== PROCESSING RESULT TO_DICT ===\n")
+        sys.stderr.write(f"\n=== PROCESSING RESULT TO_DICT (P-VALUES ONLY) ===\n")
         sys.stderr.write(f"Processing result for model: {self.full_model}\n")
-        sys.stderr.write(f"Raw results: {self.results}\n")
-        sys.stderr.write(f"Model precision: {self.precision}\n")
-        sys.stderr.write(f"Model type: {self.model_type}\n")
-        sys.stderr.write(f"Weight type: {self.weight_type}\n")
         sys.stderr.flush()
 
-        #
-        average = 0 # Default average since we don't have tasks
-        sys.stderr.write(f"No task-based scoring, using default average: {average}\n")
-        sys.stderr.flush()
-
-        # Create data dictionary with comprehensive debugging
+        # Create data dictionary - NO TASK PROCESSING AT ALL
         data_dict = {}
 
         # Add core columns
@@ -103,7 +94,6 @@ class EvalResult:
         data_dict[AutoEvalColumn.architecture.name] = self.architecture
         data_dict[AutoEvalColumn.model.name] = make_clickable_model(self.full_model)
         data_dict[AutoEvalColumn.revision.name] = self.revision
-        data_dict[AutoEvalColumn.average.name] = average
         data_dict[AutoEvalColumn.still_on_hub.name] = self.still_on_hub
 
         # Add default values for missing model info
@@ -112,9 +102,7 @@ class EvalResult:
         data_dict[AutoEvalColumn.likes.name] = 0
 
         # Compute model trace p-value
-        sys.stderr.write(f"
-        sys.stderr.write(f" - Revision: {self.revision if self.revision else 'main'}\n")
-        sys.stderr.write(f" - Precision: {self.precision.value.name.lower()}\n")
+        sys.stderr.write(f"🧬 COMPUTING MODEL TRACE P-VALUE FOR: {self.full_model}\n")
         sys.stderr.flush()
 
         try:
@@ -125,29 +113,17 @@ class EvalResult:
             )
 
             if model_trace_p_value is not None:
-                sys.stderr.write(f"✅
+                sys.stderr.write(f"✅ P-value: {model_trace_p_value}\n")
             else:
-                sys.stderr.write(f"⚠️
+                sys.stderr.write(f"⚠️ P-value computation failed\n")
 
         except Exception as e:
-            sys.stderr.write(f"💥 Exception during
-            import traceback
-            sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
+            sys.stderr.write(f"💥 Exception during p-value computation: {e}\n")
            model_trace_p_value = None
 
         data_dict[AutoEvalColumn.model_trace_p_value.name] = model_trace_p_value
-        sys.stderr.write(f"📝 Added to data_dict: {AutoEvalColumn.model_trace_p_value.name} = {model_trace_p_value}\n")
-        sys.stderr.flush()
 
-        sys.stderr.write(f"
-        sys.stderr.flush()
-
-        # No task-specific scores - we only have p-values
-        sys.stderr.write("No task-specific scores to add\n")
-        sys.stderr.flush()
-
-        sys.stderr.write(f"Final data dict has {len(data_dict)} columns: {list(data_dict.keys())}\n")
-        sys.stderr.write(f"=== END PROCESSING RESULT TO_DICT ===\n")
+        sys.stderr.write(f"=== END PROCESSING - ONLY P-VALUES ===\n")
         sys.stderr.flush()
 
         return data_dict
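The net effect of these hunks is a `to_dict` that emits only identity/metadata columns plus a single p-value, and that degrades gracefully when the computation fails. A condensed sketch of that control flow; `compute_model_trace_p_value` is a hypothetical stand-in, since the real helper's name and signature are not visible in this diff:

import sys
from dataclasses import dataclass

@dataclass
class EvalResultSketch:
    full_model: str
    revision: str = "main"

    def to_dict(self) -> dict:
        """P-values-only row, mirroring the control flow in the diff above."""
        data_dict = {"model": self.full_model, "revision": self.revision}
        try:
            # Hypothetical stand-in for the Space's real p-value helper.
            p_value = compute_model_trace_p_value(self.full_model, self.revision)
        except Exception as e:
            sys.stderr.write(f"p-value computation failed: {e}\n")
            p_value = None  # the row still renders; the cell is just empty
        data_dict["model_trace_p_value"] = p_value
        return data_dict

Because the broad `except Exception` also catches a NameError, the sketch runs even with the helper undefined: `EvalResultSketch("lmsys/vicuna-7b-v1.5").to_dict()` returns a row with `model_trace_p_value=None` rather than crashing.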
src/populate.py CHANGED

@@ -37,13 +37,13 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> p
             sys.stderr.write(f"WARNING: Result for {v.full_model} missing columns: {missing_cols}\n")
             # Add missing columns with default values
             for col in missing_cols:
-                if col in benchmark_cols
+                if col in benchmark_cols:
                     data_dict[col] = 0.0
                 elif col == AutoEvalColumn.model_type_symbol.name:
                     data_dict[col] = "?"
                 else:
                     data_dict[col] = ""
-
+            sys.stderr.flush()
 
         all_data_json.append(data_dict)
         sys.stderr.write(f"Successfully processed result {i+1}/{len(raw_data)}: {v.full_model}\n")
@@ -79,14 +79,11 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> p
         return create_empty_dataframe(cols, benchmark_cols)
 
     try:
-
-
-            sys.stderr.write("\nSorted DataFrame by average\n")
-        else:
-            sys.stderr.write(f"\nWARNING: Cannot sort by {AutoEvalColumn.average.name} - column not found\n")
+        # No sorting needed - we only have p-values
+        sys.stderr.write("\nNo sorting applied - only p-values\n")
         sys.stderr.flush()
     except Exception as e:
-        sys.stderr.write(f"\nError
+        sys.stderr.write(f"\nError with DataFrame: {e}\n")
         sys.stderr.write("Available columns: " + str(df.columns.tolist()) + "\n")
         sys.stderr.flush()
 
@@ -111,16 +108,10 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> p
         sys.stderr.flush()
         return create_empty_dataframe(cols, benchmark_cols)
 
-
-
-
-
-        sys.stderr.write("Final columns: " + str(df.columns.tolist()) + "\n")
-        sys.stderr.flush()
-    except Exception as e:
-        sys.stderr.write(f"Error filtering DataFrame: {e}\n")
-        sys.stderr.flush()
-        # Don't return empty, return the unfiltered DataFrame
+    # No filtering needed - we only have p-values
+    sys.stderr.write("\nFinal DataFrame shape (no filtering): " + str(df.shape) + "\n")
+    sys.stderr.write("Final columns: " + str(df.columns.tolist()) + "\n")
+    sys.stderr.flush()
 
     # Final validation
     if df is None or df.empty:
@@ -150,7 +141,7 @@ def create_empty_dataframe(cols: list, benchmark_cols: list) -> pd.DataFrame:
     empty_df = pd.DataFrame(columns=cols)
     # Ensure correct column types
     for col in cols:
-        if col in benchmark_cols
+        if col in benchmark_cols:
             empty_df[col] = pd.Series(dtype=float)
         else:
             empty_df[col] = pd.Series(dtype=str)
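Worth noting: the two substantive fixes in this file are each a single missing colon after `if col in benchmark_cols`, exactly the kind of SyntaxError that kills a Space at import time and would explain the Runtime error status and the RETRY commit message. For reference, here is `create_empty_dataframe` as it reads after the fix, runnable standalone (the trailing `return empty_df` is not shown in the hunk and is assumed):

import pandas as pd

def create_empty_dataframe(cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Empty leaderboard frame: float dtype for score columns, str otherwise."""
    empty_df = pd.DataFrame(columns=cols)
    # Ensure correct column types
    for col in cols:
        if col in benchmark_cols:  # the restored colon
            empty_df[col] = pd.Series(dtype=float)
        else:
            empty_df[col] = pd.Series(dtype=str)
    return empty_df  # assumed; not shown in the hunk

# Example: one float column for the p-value, strings for the rest.
df = create_empty_dataframe(["model", "model_trace_p_value"], ["model_trace_p_value"])
print(df.dtypes)  # model -> object, model_trace_p_value -> float64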