Ahmed Ahmed committed on
Commit
21bc425
·
1 Parent(s): 536d515

consolidate

Browse files
Files changed (6)
  1. app.py +125 -24
  2. leaderboard.py +402 -0
  3. logs.txt +266 -0
  4. src/display/utils.py +17 -1
  5. src/leaderboard/read_evals.py +49 -27
  6. src/populate.py +98 -35
app.py CHANGED
@@ -43,47 +43,138 @@ def init_leaderboard(dataframe):
43
  def refresh_leaderboard():
44
  import sys
45
  import traceback
 
46
 
47
  try:
 
48
  sys.stderr.write("Refreshing leaderboard data...\n")
49
  sys.stderr.flush()
50
 
51
  # Get fresh leaderboard data
52
  df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
53
- sys.stderr.write(f"Got DataFrame with shape: {df.shape}\n")
54
- sys.stderr.write(f"DataFrame columns: {df.columns.tolist()}\n")
55
  sys.stderr.flush()
56
 
57
  # Check if DataFrame is valid for leaderboard
58
  if df is None:
59
- sys.stderr.write("DataFrame is None, cannot create leaderboard\n")
60
  sys.stderr.flush()
61
- raise ValueError("DataFrame is None")
 
62
 
63
- if df.empty:
64
- sys.stderr.write("DataFrame is empty, creating minimal valid DataFrame\n")
65
  sys.stderr.flush()
66
- # Create a minimal valid DataFrame that won't crash the leaderboard
67
- import pandas as pd
68
- empty_df = pd.DataFrame(columns=COLS)
69
- # Add one dummy row to prevent leaderboard component from crashing
70
- dummy_row = {col: 0 if col in BENCHMARK_COLS or col == AutoEvalColumn.average.name else "" for col in COLS}
71
- dummy_row[AutoEvalColumn.model.name] = "No models evaluated yet"
72
- dummy_row[AutoEvalColumn.model_type_symbol.name] = "?"
73
- empty_df = pd.DataFrame([dummy_row])
74
- return init_leaderboard(empty_df)
75
 
76
- sys.stderr.write("Creating leaderboard with valid DataFrame\n")
77
  sys.stderr.flush()
78
- return init_leaderboard(df)
79
 
80
  except Exception as e:
81
  error_msg = str(e)
82
  traceback_str = traceback.format_exc()
83
- sys.stderr.write(f"Error in refresh_leaderboard: {error_msg}\n")
84
  sys.stderr.write(f"Traceback: {traceback_str}\n")
85
  sys.stderr.flush()
86
- raise
87
 
88
  def run_perplexity_test(model_name, revision, precision):
89
  """Run perplexity evaluation on demand."""
@@ -95,7 +186,7 @@ def run_perplexity_test(model_name, revision, precision):
95
 
96
  try:
97
  # Use stderr for more reliable logging in HF Spaces
98
- sys.stderr.write(f"\n=== Running Perplexity Test ===\n")
99
  sys.stderr.write(f"Model: {model_name}\n")
100
  sys.stderr.write(f"Revision: {revision}\n")
101
  sys.stderr.write(f"Precision: {precision}\n")
@@ -112,10 +203,16 @@ def run_perplexity_test(model_name, revision, precision):
112
  sys.stderr.flush()
113
 
114
  new_leaderboard = refresh_leaderboard()
115
- sys.stderr.write("Leaderboard refresh successful\n")
116
- sys.stderr.flush()
117
 
118
- return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\nResults saved to leaderboard.", new_leaderboard
119
  except Exception as refresh_error:
120
  # If leaderboard refresh fails, still show success but don't update leaderboard
121
  error_msg = str(refresh_error)
@@ -124,7 +221,11 @@ def run_perplexity_test(model_name, revision, precision):
124
  sys.stderr.write(f"Traceback: {traceback_str}\n")
125
  sys.stderr.flush()
126
 
127
- return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\n⚠️ Results saved but leaderboard refresh failed: {error_msg}\n\nPlease refresh the page to see updated results.", None
128
  else:
129
  return f"❌ Evaluation failed: {result}", None
130
 
 
43
  def refresh_leaderboard():
44
  import sys
45
  import traceback
46
+ import pandas as pd
47
 
48
  try:
49
+ sys.stderr.write("=== REFRESH LEADERBOARD DEBUG ===\n")
50
  sys.stderr.write("Refreshing leaderboard data...\n")
51
  sys.stderr.flush()
52
 
53
  # Get fresh leaderboard data
54
  df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
55
+
56
+ sys.stderr.write(f"get_leaderboard_df returned: {type(df)}\n")
57
+ if df is not None:
58
+ sys.stderr.write(f"DataFrame shape: {df.shape}\n")
59
+ sys.stderr.write(f"DataFrame columns: {df.columns.tolist()}\n")
60
+ sys.stderr.write(f"DataFrame empty: {df.empty}\n")
61
+ else:
62
+ sys.stderr.write("DataFrame is None!\n")
63
  sys.stderr.flush()
64
 
65
  # Check if DataFrame is valid for leaderboard
66
  if df is None:
67
+ sys.stderr.write("DataFrame is None, creating fallback DataFrame\n")
68
  sys.stderr.flush()
69
+ # Create a fallback DataFrame
70
+ df = create_fallback_dataframe()
71
 
72
+ elif df.empty:
73
+ sys.stderr.write("DataFrame is empty, creating fallback DataFrame\n")
74
+ sys.stderr.flush()
75
+ # Create a fallback DataFrame for empty case
76
+ df = create_fallback_dataframe()
77
+
78
+ elif not all(col in df.columns for col in COLS):
79
+ sys.stderr.write(f"DataFrame missing required columns. Has: {df.columns.tolist()}, Needs: {COLS}\n")
80
  sys.stderr.flush()
81
+ # Create a fallback DataFrame for missing columns
82
+ df = create_fallback_dataframe()
83
+
84
+ sys.stderr.write(f"Final DataFrame for leaderboard - Shape: {df.shape}, Columns: {df.columns.tolist()}\n")
85
+ sys.stderr.flush()
86
 
87
+ # Ensure DataFrame has the exact columns expected
88
+ for col in COLS:
89
+ if col not in df.columns:
90
+ sys.stderr.write(f"Adding missing column: {col}\n")
91
+ if col in BENCHMARK_COLS or col == AutoEvalColumn.average.name:
92
+ df[col] = 0.0
93
+ elif col == AutoEvalColumn.model.name:
94
+ df[col] = "Unknown Model"
95
+ elif col == AutoEvalColumn.model_type_symbol.name:
96
+ df[col] = "?"
97
+ else:
98
+ df[col] = ""
99
+ sys.stderr.flush()
100
+
101
+ # Reorder columns to match expected order
102
+ df = df[COLS]
103
+
104
+ sys.stderr.write("Creating leaderboard component...\n")
105
  sys.stderr.flush()
106
+
107
+ new_leaderboard = init_leaderboard(df)
108
+ sys.stderr.write("Leaderboard component created successfully\n")
109
+ sys.stderr.flush()
110
+
111
+ return new_leaderboard
112
 
113
  except Exception as e:
114
  error_msg = str(e)
115
  traceback_str = traceback.format_exc()
116
+ sys.stderr.write(f"CRITICAL ERROR in refresh_leaderboard: {error_msg}\n")
117
  sys.stderr.write(f"Traceback: {traceback_str}\n")
118
  sys.stderr.flush()
119
+
120
+ # Create emergency fallback leaderboard
121
+ try:
122
+ sys.stderr.write("Creating emergency fallback leaderboard...\n")
123
+ sys.stderr.flush()
124
+ fallback_df = create_fallback_dataframe()
125
+ return init_leaderboard(fallback_df)
126
+ except Exception as fallback_error:
127
+ sys.stderr.write(f"Even fallback failed: {fallback_error}\n")
128
+ sys.stderr.flush()
129
+ raise Exception(f"Complete leaderboard failure: {error_msg}")
130
+
131
+ def create_fallback_dataframe():
132
+ """Create a minimal valid DataFrame that won't crash the leaderboard"""
133
+ import pandas as pd
134
+ import sys
135
+
136
+ sys.stderr.write("Creating fallback DataFrame...\n")
137
+ sys.stderr.flush()
138
+
139
+ # Create minimal valid data
140
+ fallback_data = {col: [] for col in COLS}
141
+
142
+ # Add one dummy row to prevent leaderboard component from crashing
143
+ dummy_row = {}
144
+ for col in COLS:
145
+ if col in BENCHMARK_COLS or col == AutoEvalColumn.average.name:
146
+ dummy_row[col] = 0.0
147
+ elif col == AutoEvalColumn.model.name:
148
+ dummy_row[col] = "No models evaluated yet"
149
+ elif col == AutoEvalColumn.model_type_symbol.name:
150
+ dummy_row[col] = "?"
151
+ elif col == AutoEvalColumn.precision.name:
152
+ dummy_row[col] = "float16"
153
+ elif col == AutoEvalColumn.model_type.name:
154
+ dummy_row[col] = "pretrained"
155
+ elif col == AutoEvalColumn.weight_type.name:
156
+ dummy_row[col] = "Original"
157
+ elif col == AutoEvalColumn.architecture.name:
158
+ dummy_row[col] = "Unknown"
159
+ elif col == AutoEvalColumn.still_on_hub.name:
160
+ dummy_row[col] = True
161
+ elif col == AutoEvalColumn.license.name:
162
+ dummy_row[col] = "Unknown"
163
+ elif col == AutoEvalColumn.params.name:
164
+ dummy_row[col] = 0.0
165
+ elif col == AutoEvalColumn.likes.name:
166
+ dummy_row[col] = 0.0
167
+ elif col == AutoEvalColumn.revision.name:
168
+ dummy_row[col] = ""
169
+ else:
170
+ dummy_row[col] = ""
171
+
172
+ df = pd.DataFrame([dummy_row])
173
+ sys.stderr.write(f"Fallback DataFrame created with shape: {df.shape}\n")
174
+ sys.stderr.write(f"Fallback DataFrame columns: {df.columns.tolist()}\n")
175
+ sys.stderr.flush()
176
+
177
+ return df
178
 
179
  def run_perplexity_test(model_name, revision, precision):
180
  """Run perplexity evaluation on demand."""
 
186
 
187
  try:
188
  # Use stderr for more reliable logging in HF Spaces
189
+ sys.stderr.write(f"\n=== RUNNING PERPLEXITY TEST ===\n")
190
  sys.stderr.write(f"Model: {model_name}\n")
191
  sys.stderr.write(f"Revision: {revision}\n")
192
  sys.stderr.write(f"Precision: {precision}\n")
 
203
  sys.stderr.flush()
204
 
205
  new_leaderboard = refresh_leaderboard()
 
 
206
 
207
+ if new_leaderboard is not None:
208
+ sys.stderr.write("Leaderboard refresh successful\n")
209
+ sys.stderr.flush()
210
+ return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\nResults saved and leaderboard updated.", new_leaderboard
211
+ else:
212
+ sys.stderr.write("Leaderboard refresh returned None\n")
213
+ sys.stderr.flush()
214
+ return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\n⚠️ Results saved but leaderboard update returned None.\n\nPlease refresh the page to see updated results.", None
215
+
216
  except Exception as refresh_error:
217
  # If leaderboard refresh fails, still show success but don't update leaderboard
218
  error_msg = str(refresh_error)
 
221
  sys.stderr.write(f"Traceback: {traceback_str}\n")
222
  sys.stderr.flush()
223
 
224
+ # Check if it's the specific "must have a value set" error
225
+ if "must have a value set" in error_msg.lower():
226
+ return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\n⚠️ Results saved but leaderboard component failed to update due to data structure issue.\n\n**Please refresh the page** to see your results in the main leaderboard.", None
227
+ else:
228
+ return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\n⚠️ Results saved but leaderboard refresh failed: {error_msg}\n\nPlease refresh the page to see updated results.", None
229
  else:
230
  return f"❌ Evaluation failed: {result}", None
231
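Note: a minimal sketch of the guard the new refresh_leaderboard() applies before handing data to init_leaderboard — the vendored component below (leaderboard.py) rejects a missing value, so the frame must never be None, never empty, and must contain every expected column. COLS stands in for the list built in src/display/utils.py; the helper name is illustrative, not code from this commit.

import pandas as pd

def ensure_leaderboard_df(df: pd.DataFrame | None, cols: list[str]) -> pd.DataFrame:
    """Return a DataFrame the Leaderboard component can always accept."""
    if df is None or df.empty:
        # a single placeholder row avoids
        # ValueError("Leaderboard component must have a value set.")
        return pd.DataFrame([{col: "" for col in cols}])
    for col in cols:
        if col not in df.columns:
            df[col] = ""            # backfill any missing column with a default
    return df[cols]                 # enforce the expected column order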
 
leaderboard.py ADDED
@@ -0,0 +1,402 @@
1
+ """gr.Leaderboard() component"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import warnings
6
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, Literal
7
+
8
+ from pandas.api.types import (
9
+ is_numeric_dtype,
10
+ is_object_dtype,
11
+ is_string_dtype,
12
+ is_bool_dtype,
13
+ )
14
+ import semantic_version
15
+ from dataclasses import dataclass, field
16
+
17
+ from gradio.components import Component
18
+ from gradio.data_classes import GradioModel
19
+ from gradio.events import Events
20
+
21
+ if TYPE_CHECKING:
22
+ import pandas as pd
23
+ from pandas.io.formats.style import Styler
24
+
25
+
26
+ @dataclass
27
+ class SearchColumns:
28
+ primary_column: str
29
+ secondary_columns: Optional[List[str]]
30
+ label: Optional[str] = None
31
+ placeholder: Optional[str] = None
32
+
33
+
34
+ @dataclass
35
+ class SelectColumns:
36
+ default_selection: Optional[list[str]] = field(default_factory=list)
37
+ cant_deselect: Optional[list[str]] = field(default_factory=list)
38
+ allow: bool = True
39
+ label: Optional[str] = None
40
+ show_label: bool = True
41
+ info: Optional[str] = None
42
+
43
+
44
+ @dataclass
45
+ class ColumnFilter:
46
+ column: str
47
+ type: Literal["slider", "dropdown", "checkboxgroup", "boolean"] = None
48
+ default: Optional[Union[int, float, List[Tuple[str, str]]]] = None
49
+ choices: Optional[Union[int, float, List[Tuple[str, str]]]] = None
50
+ label: Optional[str] = None
51
+ info: Optional[str] = None
52
+ show_label: bool = True
53
+ min: Optional[Union[int, float]] = None
54
+ max: Optional[Union[int, float]] = None
55
+
56
+
57
+ class DataframeData(GradioModel):
58
+ headers: List[str]
59
+ data: Union[List[List[Any]], List[Tuple[Any, ...]]]
60
+ metadata: Optional[Dict[str, Optional[List[Any]]]] = None
61
+
62
+
63
+ class Leaderboard(Component):
64
+ """
65
+ Displays a spreadsheet-like table of values. Can be used to display data as an output component, or as an input to collect data from the user.
66
+ Demos: filter_records, matrix_transpose, tax_calculator, sort_records
67
+ """
68
+
69
+ EVENTS = [Events.change, Events.input, Events.select]
70
+
71
+ data_model = DataframeData
72
+
73
+ def __init__(
74
+ self,
75
+ value: pd.DataFrame | None = None,
76
+ *,
77
+ datatype: str | list[str] = "str",
78
+ search_columns: list[str] | SearchColumns | None = None,
79
+ select_columns: list[str] | SelectColumns | None = None,
80
+ filter_columns: list[str | ColumnFilter] | None = None,
81
+ bool_checkboxgroup_label: str | None = None,
82
+ hide_columns: list[str] | None = None,
83
+ latex_delimiters: list[dict[str, str | bool]] | None = None,
84
+ label: str | None = None,
85
+ show_label: bool | None = None,
86
+ every: float | None = None,
87
+ height: int = 500,
88
+ scale: int | None = None,
89
+ min_width: int = 160,
90
+ interactive: bool | None = None,
91
+ visible: bool = True,
92
+ elem_id: str | None = None,
93
+ elem_classes: list[str] | str | None = None,
94
+ render: bool = True,
95
+ wrap: bool = False,
96
+ line_breaks: bool = True,
97
+ column_widths: list[str | int] | None = None,
98
+ ):
99
+ """
100
+ Parameters:
101
+ value: Default value to display in the DataFrame. Must be a pandas DataFrame.
102
+ datatype: Datatype of values in sheet. Can be provided per column as a list of strings, or for the entire sheet as a single string. Valid datatypes are "str", "number", "bool", "date", and "markdown".
103
+ search_columns: See Configuration section of docs for details.
104
+ select_columns: See Configuration section of docs for details.
105
+ filter_columns: See Configuration section of docs for details.
106
+ bool_checkboxgroup_label: Label for the checkboxgroup filter for boolean columns.
107
+ hide_columns: List of columns to hide by default. They will not be displayed in the table but they can still be used for searching, filtering.
108
+ label: The label for this component. Appears above the component and is also used as the header if there are a table of examples for this component. If None and used in a `gr.Interface`, the label will be the name of the parameter this component is assigned to.
109
+ latex_delimiters: A list of dicts of the form {"left": open delimiter (str), "right": close delimiter (str), "display": whether to display in newline (bool)} that will be used to render LaTeX expressions. If not provided, `latex_delimiters` is set to `[{ "left": "$$", "right": "$$", "display": True }]`, so only expressions enclosed in $$ delimiters will be rendered as LaTeX, and in a new line. Pass in an empty list to disable LaTeX rendering. For more information, see the [KaTeX documentation](https://katex.org/docs/autorender.html). Only applies to columns whose datatype is "markdown".
110
+ label: The label for this component. Appears above the component and is also used as the header if there are a table of examples for this component. If None and used in a `gr.Interface`, the label will be the name of the parameter this component is assigned to.
111
+ show_label: if True, will display label.
112
+ every: If `value` is a callable, run the function 'every' number of seconds while the client connection is open. Has no effect otherwise. The event can be accessed (e.g. to cancel it) via this component's .load_event attribute.
113
+ height: The maximum height of the dataframe, specified in pixels if a number is passed, or in CSS units if a string is passed. If more rows are created than can fit in the height, a scrollbar will appear.
114
+ scale: relative size compared to adjacent Components. For example if Components A and B are in a Row, and A has scale=2, and B has scale=1, A will be twice as wide as B. Should be an integer. scale applies in Rows, and to top-level Components in Blocks where fill_height=True.
115
+ min_width: minimum pixel width, will wrap if not sufficient screen space to satisfy this value. If a certain scale value results in this Component being narrower than min_width, the min_width parameter will be respected first.
116
+ interactive: if True, will allow users to edit the dataframe; if False, can only be used to display data. If not provided, this is inferred based on whether the component is used as an input or output.
117
+ visible: If False, component will be hidden.
118
+ elem_id: An optional string that is assigned as the id of this component in the HTML DOM. Can be used for targeting CSS styles.
119
+ elem_classes: An optional list of strings that are assigned as the classes of this component in the HTML DOM. Can be used for targeting CSS styles.
120
+ render: If False, component will not be rendered in the Blocks context. Should be used if the intention is to assign event listeners now but render the component later.
121
+ wrap: If True, the text in table cells will wrap when appropriate. If False and the `column_width` parameter is not set, the column widths will expand based on the cell contents and the table may need to be horizontally scrolled. If `column_width` is set, then any overflow text will be hidden.
122
+ line_breaks: If True (default), will enable Github-flavored Markdown line breaks in chatbot messages. If False, single new lines will be ignored. Only applies for columns of type "markdown."
123
+ column_widths: An optional list representing the width of each column. The elements of the list should be in the format "100px" (ints are also accepted and converted to pixel values) or "10%". If not provided, the column widths will be automatically determined based on the content of the cells. Setting this parameter will cause the browser to try to fit the table within the page width.
124
+ """
125
+ if value is None:
126
+ raise ValueError("Leaderboard component must have a value set.")
127
+ self.wrap = wrap
128
+ self.headers = [str(s) for s in value.columns]
129
+ self.datatype = datatype
130
+ self.search_columns = self._get_search_columns(search_columns)
131
+ self.bool_checkboxgroup_label = bool_checkboxgroup_label
132
+ self.select_columns_config = self._get_select_columns(select_columns, value)
133
+ self.filter_columns = self._get_column_filter_configs(filter_columns, value)
134
+ self.raise_error_if_incorrect_config()
135
+
136
+ self.hide_columns = hide_columns or []
137
+ self.col_count = (len(self.headers), "fixed")
138
+ self.row_count = (value.shape[0], "fixed")
139
+
140
+ if latex_delimiters is None:
141
+ latex_delimiters = [{"left": "$$", "right": "$$", "display": True}]
142
+ self.latex_delimiters = latex_delimiters
143
+ self.height = height
144
+ self.line_breaks = line_breaks
145
+ self.column_widths = [
146
+ w if isinstance(w, str) else f"{w}px" for w in (column_widths or [])
147
+ ]
148
+ super().__init__(
149
+ label=label,
150
+ every=every,
151
+ show_label=show_label,
152
+ scale=scale,
153
+ min_width=min_width,
154
+ interactive=interactive,
155
+ visible=visible,
156
+ elem_id=elem_id,
157
+ elem_classes=elem_classes,
158
+ render=render,
159
+ value=value,
160
+ )
161
+
162
+ def raise_error_if_incorrect_config(self):
163
+ for col in [self.search_columns.primary_column, *self.search_columns.secondary_columns]:
164
+ if col not in self.headers:
165
+ raise ValueError(f"Column '{col}' not found in the DataFrame headers.")
166
+ for col in self.select_columns_config.default_selection + self.select_columns_config.cant_deselect:
167
+ if col not in self.headers:
168
+ raise ValueError(f"Column '{col}' not found in the DataFrame headers.")
169
+ for col in [col.column for col in self.filter_columns]:
170
+ if col not in self.headers:
171
+ raise ValueError(f"Column '{col}' not found in the DataFrame headers.")
172
+
173
+ @staticmethod
174
+ def _get_best_filter_type(
175
+ column: str, value: pd.DataFrame
176
+ ) -> Literal["slider", "checkboxgroup", "dropdown", "checkbox"]:
177
+ if is_bool_dtype(value[column]):
178
+ return "checkbox"
179
+ if is_numeric_dtype(value[column]):
180
+ return "slider"
181
+ if is_string_dtype(value[column]) or is_object_dtype(value[column]):
182
+ return "checkboxgroup"
183
+ warnings.warn(
184
+ f"{column}'s type is not numeric or string, defaulting to checkboxgroup filter type.",
185
+ UserWarning,
186
+ )
187
+ return "checkboxgroup"
188
+
189
+ @staticmethod
190
+ def _get_column_filter_configs(
191
+ columns: list[str | ColumnFilter] | None, value: pd.DataFrame
192
+ ) -> list[ColumnFilter]:
193
+ if columns is None:
194
+ return []
195
+ if not isinstance(columns, list):
196
+ raise ValueError(
197
+ "Columns must be a list of strings or ColumnFilter objects"
198
+ )
199
+ return [
200
+ Leaderboard._get_column_filter_config(column, value) for column in columns
201
+ ]
202
+
203
+ @staticmethod
204
+ def _get_column_filter_config(column: str | ColumnFilter, value: pd.DataFrame):
205
+ column_name = column if isinstance(column, str) else column.column
206
+ best_filter_type = Leaderboard._get_best_filter_type(column_name, value)
207
+ min_val = None
208
+ max_val = None
209
+ if best_filter_type == "slider":
210
+ default = [
211
+ value[column_name].quantile(0.25),
212
+ value[column_name].quantile(0.70),
213
+ ]
214
+ min_val = value[column_name].min()
215
+ max_val = value[column_name].max()
216
+ choices = None
217
+ elif best_filter_type == "checkbox":
218
+ default = False
219
+ choices = None
220
+ else:
221
+ default = value[column_name].unique().tolist()
222
+ default = [(s, s) for s in default]
223
+ choices = default
224
+ if isinstance(column, ColumnFilter):
225
+ if column.type == "boolean":
226
+ column.type = "checkbox"
227
+ if not column.type:
228
+ column.type = best_filter_type
229
+ if column.default is None:
230
+ column.default = default
231
+ if not column.choices:
232
+ column.choices = choices
233
+ if min_val is not None and max_val is not None:
234
+ column.min = min_val
235
+ column.max = max_val
236
+ return column
237
+ if isinstance(column, str):
238
+ return ColumnFilter(
239
+ column=column,
240
+ type=best_filter_type,
241
+ default=default,
242
+ choices=choices,
243
+ min=min_val,
244
+ max=max_val,
245
+ )
246
+ raise ValueError(f"Columns {column} must be a string or a ColumnFilter object")
247
+
248
+ @staticmethod
249
+ def _get_search_columns(
250
+ search_columns: list[str] | SearchColumns | None,
251
+ ) -> SearchColumns:
252
+ if search_columns is None:
253
+ return SearchColumns(primary_column=None, secondary_columns=[])
254
+ if isinstance(search_columns, SearchColumns):
255
+ return search_columns
256
+ if isinstance(search_columns, list):
257
+ return SearchColumns(
258
+ primary_column=search_columns[0], secondary_columns=search_columns[1:]
259
+ )
260
+ raise ValueError(
261
+ "search_columns must be a list of strings or a SearchColumns object"
262
+ )
263
+
264
+ @staticmethod
265
+ def _get_select_columns(
266
+ select_columns: list[str] | SelectColumns | None,
267
+ value: pd.DataFrame,
268
+ ) -> SelectColumns:
269
+ if select_columns is None:
270
+ return SelectColumns(allow=False)
271
+ if isinstance(select_columns, SelectColumns):
272
+ if not select_columns.default_selection:
273
+ select_columns.default_selection = value.columns.tolist()
274
+ return select_columns
275
+ if isinstance(select_columns, list):
276
+ return SelectColumns(default_selection=select_columns, allow=True)
277
+ raise ValueError(
278
+ "select_columns must be a list of strings or a SelectColumns object"
279
+ )
280
+
281
+ def get_config(self):
282
+ return {
283
+ "row_count": self.row_count,
284
+ "col_count": self.col_count,
285
+ "headers": self.headers,
286
+ "select_columns_config": self.select_columns_config,
287
+ **super().get_config(),
288
+ }
289
+
290
+ def preprocess(self, payload: DataframeData) -> pd.DataFrame:
291
+ """
292
+ Parameters:
293
+ payload: the uploaded spreadsheet data as an object with `headers` and `data` attributes
294
+ Returns:
295
+ Passes the uploaded spreadsheet data as a `pandas.DataFrame`, `numpy.array`, `polars.DataFrame`, or native 2D Python `list[list]` depending on `type`
296
+ """
297
+ import pandas as pd
298
+
299
+ if payload.headers is not None:
300
+ return pd.DataFrame(
301
+ [] if payload.data == [[]] else payload.data,
302
+ columns=payload.headers,
303
+ )
304
+ else:
305
+ return pd.DataFrame(payload.data)
306
+
307
+ def postprocess(self, value: pd.DataFrame) -> DataframeData:
308
+ """
309
+ Parameters:
310
+ value: Expects data any of these formats: `pandas.DataFrame`, `pandas.Styler`, `numpy.array`, `polars.DataFrame`, `list[list]`, `list`, or a `dict` with keys 'data' (and optionally 'headers'), or `str` path to a csv, which is rendered as the spreadsheet.
311
+ Returns:
312
+ the uploaded spreadsheet data as an object with `headers` and `data` attributes
313
+ """
314
+ import pandas as pd
315
+ from pandas.io.formats.style import Styler
316
+
317
+ if value is None:
318
+ return self.postprocess(pd.DataFrame({"column 1": []}))
319
+ if isinstance(value, (str, pd.DataFrame)):
320
+ if isinstance(value, str):
321
+ value = pd.read_csv(value) # type: ignore
322
+ if len(value) == 0:
323
+ return DataframeData(
324
+ headers=list(value.columns), # type: ignore
325
+ data=[[]], # type: ignore
326
+ )
327
+ return DataframeData(
328
+ headers=list(value.columns), # type: ignore
329
+ data=value.to_dict(orient="split")["data"], # type: ignore
330
+ )
331
+ elif isinstance(value, Styler):
332
+ if semantic_version.Version(pd.__version__) < semantic_version.Version(
333
+ "1.5.0"
334
+ ):
335
+ raise ValueError(
336
+ "Styler objects are only supported in pandas version 1.5.0 or higher. Please try: `pip install --upgrade pandas` to use this feature."
337
+ )
338
+ if self.interactive:
339
+ warnings.warn(
340
+ "Cannot display Styler object in interactive mode. Will display as a regular pandas dataframe instead."
341
+ )
342
+ df: pd.DataFrame = value.data # type: ignore
343
+ if len(df) == 0:
344
+ return DataframeData(
345
+ headers=list(df.columns),
346
+ data=[[]],
347
+ metadata=self.__extract_metadata(value), # type: ignore
348
+ )
349
+ return DataframeData(
350
+ headers=list(df.columns),
351
+ data=df.to_dict(orient="split")["data"], # type: ignore
352
+ metadata=self.__extract_metadata(value), # type: ignore
353
+ )
354
+
355
+ @staticmethod
356
+ def __get_cell_style(cell_id: str, cell_styles: list[dict]) -> str:
357
+ styles_for_cell = []
358
+ for style in cell_styles:
359
+ if cell_id in style.get("selectors", []):
360
+ styles_for_cell.extend(style.get("props", []))
361
+ styles_str = "; ".join([f"{prop}: {value}" for prop, value in styles_for_cell])
362
+ return styles_str
363
+
364
+ @staticmethod
365
+ def __extract_metadata(df: Styler) -> dict[str, list[list]]:
366
+ metadata = {"display_value": [], "styling": []}
367
+ style_data = df._compute()._translate(None, None) # type: ignore
368
+ cell_styles = style_data.get("cellstyle", [])
369
+ for i in range(len(style_data["body"])):
370
+ metadata["display_value"].append([])
371
+ metadata["styling"].append([])
372
+ for j in range(len(style_data["body"][i])):
373
+ cell_type = style_data["body"][i][j]["type"]
374
+ if cell_type != "td":
375
+ continue
376
+ display_value = style_data["body"][i][j]["display_value"]
377
+ cell_id = style_data["body"][i][j]["id"]
378
+ styles_str = Leaderboard.__get_cell_style(cell_id, cell_styles)
379
+ metadata["display_value"][i].append(display_value)
380
+ metadata["styling"][i].append(styles_str)
381
+ return metadata
382
+
383
+ def process_example(
384
+ self,
385
+ value: pd.DataFrame | Styler | str | None,
386
+ ):
387
+ import pandas as pd
388
+
389
+ if value is None:
390
+ return ""
391
+ value_df_data = self.postprocess(value)
392
+ value_df = pd.DataFrame(value_df_data.data, columns=value_df_data.headers)
393
+ return value_df.head(n=5).to_dict(orient="split")["data"]
394
+
395
+ def example_payload(self) -> Any:
396
+ return {"headers": ["a", "b"], "data": [["foo", "bar"]]}
397
+
398
+ def example_inputs(self) -> Any:
399
+ return self.example_value()
400
+
401
+ def example_value(self) -> Any:
402
+ return {"headers": ["a", "b"], "data": [["foo", "bar"]]}
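Note: a hedged usage sketch of the vendored component above. The column names mirror the COLS printed in logs.txt, and raise_error_if_incorrect_config requires every name passed to search_columns / filter_columns to exist in the DataFrame headers. This is illustrative, not the app's actual init_leaderboard.

import pandas as pd
from leaderboard import Leaderboard, SearchColumns, ColumnFilter  # import path assumed

df = pd.DataFrame([
    {"Model": "openai-community/gpt2", "Average ⬆️": 69.72, "Perplexity": 20.66, "Precision": "float16"},
    {"Model": "EleutherAI/gpt-neo-1.3B", "Average ⬆️": 82.15, "Perplexity": 5.96, "Precision": "float16"},
])

board = Leaderboard(
    value=df,                       # value is mandatory; None raises immediately
    search_columns=SearchColumns(primary_column="Model", secondary_columns=[],
                                 placeholder="Search models..."),
    filter_columns=["Precision",                      # plain string: filter type is inferred (checkboxgroup)
                    ColumnFilter("Average ⬆️", type="slider")],
    render=False,                   # construct without rendering, e.g. outside gr.Blocks
)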
logs.txt ADDED
@@ -0,0 +1,266 @@
1
+ ==== Application Startup at 2025-07-25 22:55:49 =====
2
+
3
+
4
+ .gitattributes: 0%| | 0.00/2.46k [00:00<?, ?B/s]
5
+ .gitattributes: 100%|██████████| 2.46k/2.46k [00:00<00:00, 10.5MB/s]
6
+
7
+ (…)enai-community_gpt2_20250725_231201.json: 0%| | 0.00/209 [00:00<?, ?B/s]
8
+ (…)enai-community_gpt2_20250725_231201.json: 100%|██████████| 209/209 [00:00<00:00, 1.71MB/s]
9
+
10
+ (…)enai-community_gpt2_20250725_233155.json: 0%| | 0.00/209 [00:00<?, ?B/s]
11
+ (…)enai-community_gpt2_20250725_233155.json: 100%|██████████| 209/209 [00:00<00:00, 1.26MB/s]
12
+
13
+ (…)enai-community_gpt2_20250725_235115.json: 0%| | 0.00/209 [00:00<?, ?B/s]
14
+ (…)enai-community_gpt2_20250725_235115.json: 100%|██████████| 209/209 [00:00<00:00, 2.02MB/s]
15
+
16
+ (…)enai-community_gpt2_20250725_235748.json: 0%| | 0.00/209 [00:00<?, ?B/s]
17
+ (…)enai-community_gpt2_20250725_235748.json: 100%|██████████| 209/209 [00:00<00:00, 2.08MB/s]
18
+
19
+ (…)enai-community_gpt2_20250726_000358.json: 0%| | 0.00/209 [00:00<?, ?B/s]
20
+ (…)enai-community_gpt2_20250726_000358.json: 100%|██████████| 209/209 [00:00<00:00, 1.54MB/s]
21
+
22
+ (…)enai-community_gpt2_20250726_000650.json: 0%| | 0.00/209 [00:00<?, ?B/s]
23
+ (…)enai-community_gpt2_20250726_000650.json: 100%|██████████| 209/209 [00:00<00:00, 2.35MB/s]
24
+
25
+ === Starting leaderboard creation ===
26
+ Looking for results in: ./eval-results
27
+ Expected columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
28
+ Benchmark columns: ['Perplexity']
29
+
30
+ Searching for result files in: ./eval-results
31
+ Found 6 result files
32
+
33
+ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_231201.json
34
+
35
+ config.json: 0%| | 0.00/665 [00:00<?, ?B/s]
36
+ config.json: 100%|██████████| 665/665 [00:00<00:00, 6.14MB/s]
37
+ Created result object for: openai-community/gpt2
38
+ Added new result for openai-community_gpt2_float16
39
+
40
+ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_233155.json
41
+ Created result object for: openai-community/gpt2
42
+ Updated existing result for openai-community_gpt2_float16
43
+
44
+ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_235115.json
45
+ Created result object for: openai-community/gpt2
46
+ Updated existing result for openai-community_gpt2_float16
47
+
48
+ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_235748.json
49
+ Created result object for: openai-community/gpt2
50
+ Updated existing result for openai-community_gpt2_float16
51
+
52
+ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250726_000358.json
53
+ Created result object for: openai-community/gpt2
54
+ Updated existing result for openai-community_gpt2_float16
55
+
56
+ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250726_000650.json
57
+ Created result object for: openai-community/gpt2
58
+ Updated existing result for openai-community_gpt2_float16
59
+
60
+ Processing 1 evaluation results
61
+
62
+ Converting result to dict for: openai-community/gpt2
63
+
64
+ Processing result for model: openai-community/gpt2
65
+ Raw results: {'perplexity': 20.663532257080078}
66
+ Calculated average score: 69.7162958010531
67
+ Added perplexity score 20.663532257080078 under column Perplexity
68
+ Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
69
+ Successfully converted and added result
70
+
71
+ Returning 1 processed results
72
+
73
+ Found 1 raw results
74
+
75
+ Processing result for model: openai-community/gpt2
76
+ Raw results: {'perplexity': 20.663532257080078}
77
+ Calculated average score: 69.7162958010531
78
+ Added perplexity score 20.663532257080078 under column Perplexity
79
+ Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
80
+ Successfully processed result 1/1: openai-community/gpt2
81
+
82
+ Converted to 1 JSON records
83
+ Sample record keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
84
+
85
+ Created DataFrame with columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
86
+ DataFrame shape: (1, 14)
87
+
88
+ Sorted DataFrame by average
89
+
90
+ Selected and rounded columns
91
+
92
+ Final DataFrame shape after filtering: (1, 12)
93
+ Final columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
94
+
95
+ === Initializing Leaderboard ===
96
+ DataFrame shape: (1, 12)
97
+ DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
98
+ * Running on local URL: http://0.0.0.0:7860, with SSR ⚡ (experimental, to disable set `ssr=False` in `launch()`)
99
+
100
+ To create a public link, set `share=True` in `launch()`.
101
+
102
+ === Running Perplexity Test ===
103
+ Model: EleutherAI/gpt-neo-1.3B
104
+ Revision: main
105
+ Precision: float16
106
+ Starting dynamic evaluation for EleutherAI/gpt-neo-1.3B
107
+ Running perplexity evaluation...
108
+ Loading model: EleutherAI/gpt-neo-1.3B (revision: main)
109
+ Loading tokenizer...
110
+
111
+ tokenizer_config.json: 0%| | 0.00/200 [00:00<?, ?B/s]
112
+ tokenizer_config.json: 100%|██████████| 200/200 [00:00<00:00, 1.64MB/s]
113
+
114
+ config.json: 0%| | 0.00/1.35k [00:00<?, ?B/s]
115
+ config.json: 100%|██████████| 1.35k/1.35k [00:00<00:00, 9.77MB/s]
116
+
117
+ vocab.json: 0%| | 0.00/798k [00:00<?, ?B/s]
118
+ vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 27.9MB/s]
119
+
120
+ merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]
121
+ merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 3.54MB/s]
122
+
123
+ special_tokens_map.json: 0%| | 0.00/90.0 [00:00<?, ?B/s]
124
+ special_tokens_map.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 1.05MB/s]
125
+ Tokenizer loaded successfully
126
+ Loading model...
127
+
128
+ model.safetensors: 0%| | 0.00/5.31G [00:00<?, ?B/s]
129
+ model.safetensors: 0%| | 778k/5.31G [00:01<2:15:00, 656kB/s]
130
+ model.safetensors: 0%| | 7.69M/5.31G [00:02<23:51, 3.70MB/s]
131
+ model.safetensors: 1%|▏ | 74.7M/5.31G [00:03<03:29, 25.0MB/s]
132
+ model.safetensors: 9%|▉ | 496M/5.31G [00:04<00:31, 153MB/s]
133
+ model.safetensors: 19%|█▉ | 1.03G/5.31G [00:06<00:16, 263MB/s]
134
+ model.safetensors: 25%|██▍ | 1.32G/5.31G [00:07<00:16, 235MB/s]
135
+ model.safetensors: 38%|███▊ | 1.99G/5.31G [00:08<00:09, 346MB/s]
136
+ model.safetensors: 47%|████▋ | 2.51G/5.31G [00:09<00:07, 379MB/s]
137
+ model.safetensors: 59%|█████▊ | 3.11G/5.31G [00:10<00:05, 429MB/s]
138
+ model.safetensors: 69%|██████▊ | 3.65G/5.31G [00:11<00:03, 451MB/s]
139
+ model.safetensors: 80%|███████▉ | 4.24G/5.31G [00:13<00:02, 477MB/s]
140
+ model.safetensors: 91%|█████████ | 4.84G/5.31G [00:14<00:00, 494MB/s]
141
+ model.safetensors: 100%|██████████| 5.31G/5.31G [00:14<00:00, 355MB/s]
142
+ Model loaded successfully
143
+ Tokenizing input text...
144
+ Tokenized input shape: torch.Size([1, 141])
145
+ Moved inputs to device: cpu
146
+ Running forward pass...
147
+ Calculated loss: 1.78515625
148
+ Final perplexity: 5.9609375
149
+ Perplexity evaluation completed: 5.9609375
150
+ Created result structure: {'config': {'model_dtype': 'torch.float16', 'model_name': 'EleutherAI/gpt-neo-1.3B', 'model_sha': 'main'}, 'results': {'perplexity': {'perplexity': 5.9609375}}}
151
+ Saving result to: ./eval-results/EleutherAI/results_EleutherAI_gpt-neo-1.3B_20250726_010247.json
152
+ Result file saved locally
153
+ Uploading to HF dataset: ahmedsqrd/results
154
+ Upload completed successfully
155
+ Evaluation result - Success: True, Result: 5.9609375
156
+ Attempting to refresh leaderboard...
157
+ Refreshing leaderboard data...
158
+
159
+ === Starting leaderboard creation ===
160
+ Looking for results in: ./eval-results
161
+ Expected columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
162
+ Benchmark columns: ['Perplexity']
163
+
164
+ Searching for result files in: ./eval-results
165
+ Found 7 result files
166
+
167
+ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_231201.json
168
+ Created result object for: openai-community/gpt2
169
+ Added new result for openai-community_gpt2_float16
170
+
171
+ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_233155.json
172
+ Created result object for: openai-community/gpt2
173
+ Updated existing result for openai-community_gpt2_float16
174
+
175
+ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_235115.json
176
+ Created result object for: openai-community/gpt2
177
+ Updated existing result for openai-community_gpt2_float16
178
+
179
+ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_235748.json
180
+ Created result object for: openai-community/gpt2
181
+ Updated existing result for openai-community_gpt2_float16
182
+
183
+ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250726_000358.json
184
+ Created result object for: openai-community/gpt2
185
+ Updated existing result for openai-community_gpt2_float16
186
+
187
+ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250726_000650.json
188
+ Created result object for: openai-community/gpt2
189
+ Updated existing result for openai-community_gpt2_float16
190
+
191
+ Processing file: ./eval-results/EleutherAI/results_EleutherAI_gpt-neo-1.3B_20250726_010247.json
192
+ Created result object for: EleutherAI/gpt-neo-1.3B
193
+ Added new result for EleutherAI_gpt-neo-1.3B_float16
194
+
195
+ Processing 2 evaluation results
196
+
197
+ Converting result to dict for: openai-community/gpt2
198
+
199
+ Processing result for model: openai-community/gpt2
200
+ Raw results: {'perplexity': 20.663532257080078}
201
+ Calculated average score: 69.7162958010531
202
+ Added perplexity score 20.663532257080078 under column Perplexity
203
+ Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
204
+ Successfully converted and added result
205
+
206
+ Converting result to dict for: EleutherAI/gpt-neo-1.3B
207
+
208
+ Processing result for model: EleutherAI/gpt-neo-1.3B
209
+ Raw results: {'perplexity': 5.9609375}
210
+ Calculated average score: 82.1477223263516
211
+ Added perplexity score 5.9609375 under column Perplexity
212
+ Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
213
+ Successfully converted and added result
214
+
215
+ Returning 2 processed results
216
+
217
+ Found 2 raw results
218
+
219
+ Processing result for model: openai-community/gpt2
220
+ Raw results: {'perplexity': 20.663532257080078}
221
+ Calculated average score: 69.7162958010531
222
+ Added perplexity score 20.663532257080078 under column Perplexity
223
+ Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
224
+ Successfully processed result 1/2: openai-community/gpt2
225
+
226
+ Processing result for model: EleutherAI/gpt-neo-1.3B
227
+ Raw results: {'perplexity': 5.9609375}
228
+ Calculated average score: 82.1477223263516
229
+ Added perplexity score 5.9609375 under column Perplexity
230
+ Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
231
+ Successfully processed result 2/2: EleutherAI/gpt-neo-1.3B
232
+
233
+ Converted to 2 JSON records
234
+ Sample record keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
235
+
236
+ Created DataFrame with columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
237
+ DataFrame shape: (2, 14)
238
+
239
+ Sorted DataFrame by average
240
+
241
+ Selected and rounded columns
242
+
243
+ Final DataFrame shape after filtering: (2, 12)
244
+ Final columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
245
+ Got DataFrame with shape: (2, 12)
246
+ DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
247
+ Creating leaderboard with valid DataFrame
248
+
249
+ === Initializing Leaderboard ===
250
+ DataFrame shape: (2, 12)
251
+ DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
252
+ Leaderboard refresh successful
253
+ Traceback (most recent call last):
254
+ File "/usr/local/lib/python3.10/site-packages/gradio/queueing.py", line 625, in process_events
255
+ response = await route_utils.call_process_api(
256
+ File "/usr/local/lib/python3.10/site-packages/gradio/route_utils.py", line 322, in call_process_api
257
+ output = await app.get_blocks().process_api(
258
+ File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 2106, in process_api
259
+ data = await self.postprocess_data(block_fn, result["prediction"], state)
260
+ File "/usr/local/lib/python3.10/site-packages/gradio/blocks.py", line 1899, in postprocess_data
261
+ state[block._id] = block.__class__(**kwargs)
262
+ File "/usr/local/lib/python3.10/site-packages/gradio/component_meta.py", line 181, in wrapper
263
+ return fn(self, **kwargs)
264
+ File "/usr/local/lib/python3.10/site-packages/gradio_leaderboard/leaderboard.py", line 126, in __init__
265
+ raise ValueError("Leaderboard component must have a value set.")
266
+ ValueError: Leaderboard component must have a value set.
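Note: the ValueError that ends the log comes from the vendored class above (the `if value is None` check at the top of Leaderboard.__init__), hit when gradio re-creates the returned component in postprocess_data via block.__class__(**kwargs), as the traceback shows. A hedged two-call reproduction, not the app's actual code path:

import pandas as pd
from leaderboard import Leaderboard  # vendored module above; import path assumed

df = pd.DataFrame({"Model": ["openai-community/gpt2"], "Perplexity": [20.66]})
Leaderboard(value=df, search_columns=["Model"], render=False)    # constructs fine
Leaderboard(value=None, search_columns=["Model"], render=False)  # ValueError: ... must have a value set.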
src/display/utils.py CHANGED
@@ -1,5 +1,6 @@
1
  from dataclasses import dataclass, make_dataclass
2
  from enum import Enum
 
3
 
4
  import pandas as pd
5
 
@@ -29,7 +30,10 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
29
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
  # Use exact column name from Tasks
32
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
33
  # Model information
34
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
35
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -44,6 +48,13 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
44
  # We use make dataclass to dynamically fill the scores from Tasks
45
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
46
 
 
 
 
 
 
 
 
47
  ## For the queue columns in the submission tab
48
  @dataclass(frozen=True)
49
  class EvalQueueColumn: # Queue column
@@ -103,9 +114,14 @@ class Precision(Enum):
103
 
104
  # Column selection
105
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
 
106
 
107
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
108
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
109
 
110
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
111
 
 
1
  from dataclasses import dataclass, make_dataclass
2
  from enum import Enum
3
+ import sys
4
 
5
  import pandas as pd
6
 
 
30
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
31
  for task in Tasks:
32
  # Use exact column name from Tasks
33
+ task_col_name = task.value.col_name
34
+ sys.stderr.write(f"Adding task column: {task.name} -> column name: {task_col_name}\n")
35
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task_col_name, "number", True)])
36
+ sys.stderr.flush()
37
  # Model information
38
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
39
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 
48
  # We use make dataclass to dynamically fill the scores from Tasks
49
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
50
 
51
+ # Debug: Print the created columns
52
+ sys.stderr.write("\n=== CREATED AUTOEVALCOLUMN ===\n")
53
+ for field_obj in fields(AutoEvalColumn):
54
+ sys.stderr.write(f"Field: {field_obj.name} -> Display: {field_obj.name}\n")
55
+ sys.stderr.write("=== END AUTOEVALCOLUMN ===\n")
56
+ sys.stderr.flush()
57
+
58
  ## For the queue columns in the submission tab
59
  @dataclass(frozen=True)
60
  class EvalQueueColumn: # Queue column
 
114
 
115
  # Column selection
116
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
117
+ sys.stderr.write(f"\n=== FINAL COLUMN SETUP ===\n")
118
+ sys.stderr.write(f"COLS: {COLS}\n")
119
 
120
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
121
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
122
 
123
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
124
+ sys.stderr.write(f"BENCHMARK_COLS: {BENCHMARK_COLS}\n")
125
+ sys.stderr.write(f"=== END COLUMN SETUP ===\n")
126
+ sys.stderr.flush()
127
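Note: a small self-contained sketch of what the make_dataclass call above produces and why COLS / BENCHMARK_COLS end up holding display names. ColumnContent and the fields() helper are reconstructed here as assumptions (modeled on the standard leaderboard template); only the single Perplexity task from logs.txt is shown.

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:                 # assumed shape of the repo's helper
    name: str                        # display name, e.g. "Perplexity"
    type: str
    displayed_by_default: bool
    hidden: bool = False

def fields(raw_class):               # assumed helper: walks the generated class attributes
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

auto_eval_column_dict = [
    ["model_type_symbol", ColumnContent, ColumnContent("T", "str", True)],
    ["model", ColumnContent, ColumnContent("Model", "markdown", True)],
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
    ["task0", ColumnContent, ColumnContent("Perplexity", "number", True)],
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
print(COLS)   # ['T', 'Model', 'Average ⬆️', 'Perplexity'] -- the first entries of the COLS in the debug log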
 
src/leaderboard/read_evals.py CHANGED
@@ -78,56 +78,78 @@ class EvalResult:
78
  """Converts the Eval Result to a dict compatible with our dataframe display"""
79
  import sys
80
 
81
- sys.stderr.write(f"\nProcessing result for model: {self.full_model}\n")
 
82
  sys.stderr.write(f"Raw results: {self.results}\n")
83
  sys.stderr.flush()
84
 
85
  # Calculate average, handling perplexity (lower is better)
86
  scores = []
87
  perplexity_score = None
 
 
88
  for task in Tasks:
 
89
  if task.value.benchmark in self.results:
90
  score = self.results[task.value.benchmark]
91
  perplexity_score = score # Save the raw score
 
92
  # Convert perplexity to a 0-100 scale where lower perplexity = higher score
93
  # Using a log scale since perplexity can vary widely
94
  # Cap at 100 for very low perplexity and 0 for very high perplexity
95
  score = max(0, min(100, 100 * (1 - math.log(score) / 10)))
96
  scores.append(score)
97
 
98
  average = sum(scores) / len(scores) if scores else 0
99
  sys.stderr.write(f"Calculated average score: {average}\n")
100
  sys.stderr.flush()
101
 
102
- data_dict = {
103
- "eval_name": self.eval_name, # not a column, just a save name,
104
- AutoEvalColumn.precision.name: self.precision.value.name,
105
- AutoEvalColumn.model_type.name: self.model_type.value.name,
106
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
107
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
108
- AutoEvalColumn.architecture.name: self.architecture,
109
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
110
- AutoEvalColumn.revision.name: self.revision,
111
- AutoEvalColumn.average.name: average,
112
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
113
- # Add missing columns with default values
114
- AutoEvalColumn.license.name: "Unknown", # Default license
115
- AutoEvalColumn.params.name: 0, # Default params
116
- AutoEvalColumn.likes.name: 0, # Default likes
117
- }
118
-
119
- # Add perplexity score with the exact column name from Tasks
120
- if perplexity_score is not None:
121
- data_dict[Tasks.task0.value.col_name] = perplexity_score
122
- sys.stderr.write(f"Added perplexity score {perplexity_score} under column {Tasks.task0.value.col_name}\n")
123
- sys.stderr.flush()
124
- else:
125
- data_dict[Tasks.task0.value.col_name] = None
126
- sys.stderr.write(f"No perplexity score found for column {Tasks.task0.value.col_name}\n")
127
  sys.stderr.flush()
128
 
129
- sys.stderr.write(f"Final data dict keys: {list(data_dict.keys())}\n")
 
130
  sys.stderr.flush()
 
131
  return data_dict
132
 
133
  def get_raw_eval_results(results_path: str) -> list[EvalResult]:
 
78
  """Converts the Eval Result to a dict compatible with our dataframe display"""
79
  import sys
80
 
81
+ sys.stderr.write(f"\n=== PROCESSING RESULT TO_DICT ===\n")
82
+ sys.stderr.write(f"Processing result for model: {self.full_model}\n")
83
  sys.stderr.write(f"Raw results: {self.results}\n")
84
+ sys.stderr.write(f"Model precision: {self.precision}\n")
85
+ sys.stderr.write(f"Model type: {self.model_type}\n")
86
+ sys.stderr.write(f"Weight type: {self.weight_type}\n")
87
  sys.stderr.flush()
88
 
89
  # Calculate average, handling perplexity (lower is better)
90
  scores = []
91
  perplexity_score = None
92
+ sys.stderr.write(f"Available tasks: {[task.name for task in Tasks]}\n")
93
+
94
  for task in Tasks:
95
+ sys.stderr.write(f"Looking for task: {task.value.benchmark} in results\n")
96
  if task.value.benchmark in self.results:
97
  score = self.results[task.value.benchmark]
98
  perplexity_score = score # Save the raw score
99
+ sys.stderr.write(f"Found score for {task.value.benchmark}: {score}\n")
100
  # Convert perplexity to a 0-100 scale where lower perplexity = higher score
101
  # Using a log scale since perplexity can vary widely
102
  # Cap at 100 for very low perplexity and 0 for very high perplexity
103
  score = max(0, min(100, 100 * (1 - math.log(score) / 10)))
104
  scores.append(score)
105
+ sys.stderr.write(f"Converted score: {score}\n")
106
+ else:
107
+ sys.stderr.write(f"Task {task.value.benchmark} not found in results\n")
108
+ sys.stderr.flush()
109
 
110
  average = sum(scores) / len(scores) if scores else 0
111
  sys.stderr.write(f"Calculated average score: {average}\n")
112
  sys.stderr.flush()
113
 
114
+ # Create data dictionary with comprehensive debugging
115
+ data_dict = {}
116
+
117
+ # Add core columns
118
+ data_dict["eval_name"] = self.eval_name
119
+ data_dict[AutoEvalColumn.precision.name] = self.precision.value.name
120
+ data_dict[AutoEvalColumn.model_type.name] = self.model_type.value.name
121
+ data_dict[AutoEvalColumn.model_type_symbol.name] = self.model_type.value.symbol
122
+ data_dict[AutoEvalColumn.weight_type.name] = self.weight_type.value.name
123
+ data_dict[AutoEvalColumn.architecture.name] = self.architecture
124
+ data_dict[AutoEvalColumn.model.name] = make_clickable_model(self.full_model)
125
+ data_dict[AutoEvalColumn.revision.name] = self.revision
126
+ data_dict[AutoEvalColumn.average.name] = average
127
+ data_dict[AutoEvalColumn.still_on_hub.name] = self.still_on_hub
128
+
129
+ # Add default values for missing model info
130
+ data_dict[AutoEvalColumn.license.name] = "Unknown"
131
+ data_dict[AutoEvalColumn.params.name] = 0
132
+ data_dict[AutoEvalColumn.likes.name] = 0
133
+
134
+ sys.stderr.write(f"Created base data_dict with {len(data_dict)} columns\n")
135
+ sys.stderr.flush()
136
+
137
+ # Add task-specific scores
138
+ for task in Tasks:
139
+ task_col_name = task.value.col_name
140
+ if task.value.benchmark in self.results:
141
+ task_score = self.results[task.value.benchmark]
142
+ data_dict[task_col_name] = task_score
143
+ sys.stderr.write(f"Added task score: {task_col_name} = {task_score}\n")
144
+ else:
145
+ data_dict[task_col_name] = None
146
+ sys.stderr.write(f"Added None for missing task: {task_col_name}\n")
147
  sys.stderr.flush()
148
 
149
+ sys.stderr.write(f"Final data dict has {len(data_dict)} columns: {list(data_dict.keys())}\n")
150
+ sys.stderr.write(f"=== END PROCESSING RESULT TO_DICT ===\n")
151
  sys.stderr.flush()
152
+
153
  return data_dict
154
 
155
  def get_raw_eval_results(results_path: str) -> list[EvalResult]:
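Note: a quick numerical check of the perplexity-to-score mapping used in to_dict() above. With a single task, the reported average equals the converted score, so the outputs below should match the "Calculated average score" lines in logs.txt.

import math

def perplexity_to_score(ppl: float) -> float:
    # lower perplexity -> higher score, clamped to the 0-100 range
    return max(0, min(100, 100 * (1 - math.log(ppl) / 10)))

print(perplexity_to_score(20.663532257080078))  # ~69.7162958  (openai-community/gpt2)
print(perplexity_to_score(5.9609375))           # ~82.1477223  (EleutherAI/gpt-neo-1.3B)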
src/populate.py CHANGED
@@ -7,7 +7,8 @@ from src.leaderboard.read_evals import get_raw_eval_results
7
  def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
8
  """Creates a dataframe from all the individual experiment results"""
9
  try:
10
- sys.stderr.write("\n=== Starting leaderboard creation ===\n")
 
11
  sys.stderr.write(f"Looking for results in: {results_path}\n")
12
  sys.stderr.write(f"Expected columns: {cols}\n")
13
  sys.stderr.write(f"Benchmark columns: {benchmark_cols}\n")
@@ -17,81 +18,143 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> p
17
  sys.stderr.write(f"\nFound {len(raw_data)} raw results\n")
18
  sys.stderr.flush()
19
20
  all_data_json = []
21
  for i, v in enumerate(raw_data):
22
  try:
23
  data_dict = v.to_dict()
24
  all_data_json.append(data_dict)
25
  sys.stderr.write(f"Successfully processed result {i+1}/{len(raw_data)}: {v.full_model}\n")
26
  sys.stderr.flush()
 
27
  except Exception as e:
28
  sys.stderr.write(f"Error processing result {i+1}/{len(raw_data)} ({v.full_model}): {e}\n")
 
 
29
  sys.stderr.flush()
30
  continue
31
 
32
  sys.stderr.write(f"\nConverted to {len(all_data_json)} JSON records\n")
33
  sys.stderr.flush()
34
 
 
 
 
 
 
35
  if all_data_json:
36
  sys.stderr.write("Sample record keys: " + str(list(all_data_json[0].keys())) + "\n")
37
  sys.stderr.flush()
38
 
39
- if not all_data_json:
40
- sys.stderr.write("\nNo data found, creating empty DataFrame\n")
 
 
41
  sys.stderr.flush()
42
- empty_df = pd.DataFrame(columns=cols)
43
- # Ensure correct column types
44
- empty_df[AutoEvalColumn.average.name] = pd.Series(dtype=float)
45
- for col in benchmark_cols:
46
- empty_df[col] = pd.Series(dtype=float)
47
- return empty_df
48
-
49
- df = pd.DataFrame.from_records(all_data_json)
50
- sys.stderr.write("\nCreated DataFrame with columns: " + str(df.columns.tolist()) + "\n")
51
- sys.stderr.write("DataFrame shape: " + str(df.shape) + "\n")
52
- sys.stderr.flush()
53
 
54
  try:
55
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
56
- sys.stderr.write("\nSorted DataFrame by average\n")
 
 
 
57
  sys.stderr.flush()
58
- except KeyError as e:
59
  sys.stderr.write(f"\nError sorting DataFrame: {e}\n")
60
  sys.stderr.write("Available columns: " + str(df.columns.tolist()) + "\n")
61
  sys.stderr.flush()
62
 
63
  try:
 
 
 
 
 
 
 
 
 
 
64
  df = df[cols].round(decimals=2)
65
  sys.stderr.write("\nSelected and rounded columns\n")
66
  sys.stderr.flush()
67
- except KeyError as e:
68
  sys.stderr.write(f"\nError selecting columns: {e}\n")
69
  sys.stderr.write("Requested columns: " + str(cols) + "\n")
70
  sys.stderr.write("Available columns: " + str(df.columns.tolist()) + "\n")
71
  sys.stderr.flush()
72
- # Create empty DataFrame with correct structure
73
- empty_df = pd.DataFrame(columns=cols)
74
- empty_df[AutoEvalColumn.average.name] = pd.Series(dtype=float)
75
- for col in benchmark_cols:
76
- empty_df[col] = pd.Series(dtype=float)
77
- return empty_df
78
 
79
- # filter out if perplexity hasn't been evaluated
80
- df = df[has_no_nan_values(df, benchmark_cols)]
81
- sys.stderr.write("\nFinal DataFrame shape after filtering: " + str(df.shape) + "\n")
82
- sys.stderr.write("Final columns: " + str(df.columns.tolist()) + "\n")
83
- sys.stderr.flush()
 
 
 
 
 
84
 
 
 
 
 
 
 
 
 
85
  return df
86
 
87
  except Exception as e:
88
- sys.stderr.write(f"\nCritical error in get_leaderboard_df: {e}\n")
89
  import traceback
90
  sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
91
  sys.stderr.flush()
92
- # Return empty DataFrame as fallback
93
- empty_df = pd.DataFrame(columns=cols)
94
- empty_df[AutoEvalColumn.average.name] = pd.Series(dtype=float)
95
- for col in benchmark_cols:
 
 
 
 
 
 
 
 
 
 
96
  empty_df[col] = pd.Series(dtype=float)
97
- return empty_df
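Both the old code above and the new code below finish by masking the frame with has_no_nan_values(df, benchmark_cols), a helper defined elsewhere in the repo and not shown in this diff. A stand-in with the same intent (keep only rows whose benchmark columns are all populated) could look like the sketch below; this is an assumption about its behaviour, not the project's actual implementation:

import pandas as pd

def has_no_nan_values_sketch(df: pd.DataFrame, columns: list) -> pd.Series:
    # Boolean mask: True for rows where every listed column is non-NaN.
    return df[columns].notna().all(axis=1)

df = pd.DataFrame({"model": ["a", "b"], "perplexity": [10.2, None]})
print(df[has_no_nan_values_sketch(df, ["perplexity"])])  # only row "a" survives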
 
 
 
 
 
 
 def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     try:
+        sys.stderr.write("\n=== GET_LEADERBOARD_DF DEBUG ===\n")
+        sys.stderr.write("Starting leaderboard creation...\n")
         sys.stderr.write(f"Looking for results in: {results_path}\n")
         sys.stderr.write(f"Expected columns: {cols}\n")
         sys.stderr.write(f"Benchmark columns: {benchmark_cols}\n")

         sys.stderr.write(f"\nFound {len(raw_data)} raw results\n")
         sys.stderr.flush()

+        if not raw_data:
+            sys.stderr.write("No raw data found, creating empty DataFrame\n")
+            sys.stderr.flush()
+            return create_empty_dataframe(cols, benchmark_cols)
+
         all_data_json = []
         for i, v in enumerate(raw_data):
             try:
+                sys.stderr.write(f"Processing result {i+1}/{len(raw_data)}: {v.full_model}\n")
+                sys.stderr.flush()
+
                 data_dict = v.to_dict()
+
+                # Validate the data_dict has required columns
+                missing_cols = [col for col in cols if col not in data_dict]
+                if missing_cols:
+                    sys.stderr.write(f"WARNING: Result for {v.full_model} missing columns: {missing_cols}\n")
+                    # Add missing columns with default values
+                    for col in missing_cols:
+                        if col in benchmark_cols or col == AutoEvalColumn.average.name:
+                            data_dict[col] = 0.0
+                        elif col == AutoEvalColumn.model_type_symbol.name:
+                            data_dict[col] = "?"
+                        else:
+                            data_dict[col] = ""
+                    sys.stderr.flush()
+
                 all_data_json.append(data_dict)
                 sys.stderr.write(f"Successfully processed result {i+1}/{len(raw_data)}: {v.full_model}\n")
                 sys.stderr.flush()
+
             except Exception as e:
                 sys.stderr.write(f"Error processing result {i+1}/{len(raw_data)} ({v.full_model}): {e}\n")
+                import traceback
+                sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
                 sys.stderr.flush()
                 continue

         sys.stderr.write(f"\nConverted to {len(all_data_json)} JSON records\n")
         sys.stderr.flush()

+        if not all_data_json:
+            sys.stderr.write("No valid JSON records, creating empty DataFrame\n")
+            sys.stderr.flush()
+            return create_empty_dataframe(cols, benchmark_cols)
+
         if all_data_json:
             sys.stderr.write("Sample record keys: " + str(list(all_data_json[0].keys())) + "\n")
             sys.stderr.flush()

+        try:
+            df = pd.DataFrame.from_records(all_data_json)
+            sys.stderr.write("\nCreated DataFrame with columns: " + str(df.columns.tolist()) + "\n")
+            sys.stderr.write("DataFrame shape: " + str(df.shape) + "\n")
             sys.stderr.flush()
+        except Exception as e:
+            sys.stderr.write(f"Error creating DataFrame from records: {e}\n")
+            sys.stderr.flush()
+            return create_empty_dataframe(cols, benchmark_cols)

         try:
+            if AutoEvalColumn.average.name in df.columns:
+                df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+                sys.stderr.write("\nSorted DataFrame by average\n")
+            else:
+                sys.stderr.write(f"\nWARNING: Cannot sort by {AutoEvalColumn.average.name} - column not found\n")
             sys.stderr.flush()
+        except Exception as e:
             sys.stderr.write(f"\nError sorting DataFrame: {e}\n")
             sys.stderr.write("Available columns: " + str(df.columns.tolist()) + "\n")
             sys.stderr.flush()

         try:
+            # Ensure all required columns exist before selecting
+            for col in cols:
+                if col not in df.columns:
+                    sys.stderr.write(f"Adding missing column during selection: {col}\n")
+                    if col in benchmark_cols or col == AutoEvalColumn.average.name:
+                        df[col] = 0.0
+                    else:
+                        df[col] = ""
+            sys.stderr.flush()
+
             df = df[cols].round(decimals=2)
             sys.stderr.write("\nSelected and rounded columns\n")
             sys.stderr.flush()
+        except Exception as e:
             sys.stderr.write(f"\nError selecting columns: {e}\n")
             sys.stderr.write("Requested columns: " + str(cols) + "\n")
             sys.stderr.write("Available columns: " + str(df.columns.tolist()) + "\n")
             sys.stderr.flush()
+            return create_empty_dataframe(cols, benchmark_cols)

+        try:
+            # filter out if perplexity hasn't been evaluated
+            df = df[has_no_nan_values(df, benchmark_cols)]
+            sys.stderr.write("\nFinal DataFrame shape after filtering: " + str(df.shape) + "\n")
+            sys.stderr.write("Final columns: " + str(df.columns.tolist()) + "\n")
+            sys.stderr.flush()
+        except Exception as e:
+            sys.stderr.write(f"Error filtering DataFrame: {e}\n")
+            sys.stderr.flush()
+            # Don't return empty, return the unfiltered DataFrame

+        # Final validation
+        if df is None or df.empty:
+            sys.stderr.write("Final DataFrame is None or empty, returning fallback\n")
+            sys.stderr.flush()
+            return create_empty_dataframe(cols, benchmark_cols)
+
+        sys.stderr.write(f"=== FINAL RESULT: DataFrame with {len(df)} rows and {len(df.columns)} columns ===\n")
+        sys.stderr.flush()
         return df

     except Exception as e:
+        sys.stderr.write(f"\nCRITICAL ERROR in get_leaderboard_df: {e}\n")
         import traceback
         sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
         sys.stderr.flush()
+        # Always return a valid DataFrame, never None
+        return create_empty_dataframe(cols, benchmark_cols)
+
+def create_empty_dataframe(cols: list, benchmark_cols: list) -> pd.DataFrame:
+    """Create a valid empty DataFrame with all required columns"""
+    import sys
+
+    sys.stderr.write("Creating empty fallback DataFrame...\n")
+    sys.stderr.flush()
+
+    empty_df = pd.DataFrame(columns=cols)
+    # Ensure correct column types
+    for col in cols:
+        if col in benchmark_cols or col == AutoEvalColumn.average.name:
             empty_df[col] = pd.Series(dtype=float)
+        else:
+            empty_df[col] = pd.Series(dtype=str)
+
+    sys.stderr.write(f"Empty DataFrame created with columns: {empty_df.columns.tolist()}\n")
+    sys.stderr.flush()
+    return empty_df
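To see what the fallback path hands back to callers, the same pattern can be exercised on its own. The sketch below uses placeholder column names instead of the real AutoEvalColumn definitions and is not part of the commit:

import pandas as pd

def create_empty_dataframe_sketch(cols, benchmark_cols, average_col="average"):
    # Same idea as create_empty_dataframe above: zero rows, float dtype for the
    # score columns, string/object dtype for everything else.
    empty_df = pd.DataFrame(columns=cols)
    for col in cols:
        if col in benchmark_cols or col == average_col:
            empty_df[col] = pd.Series(dtype=float)
        else:
            empty_df[col] = pd.Series(dtype=str)
    return empty_df

df = create_empty_dataframe_sketch(["model", "average", "perplexity"], ["perplexity"])
print(df.dtypes)  # model: object, average: float64, perplexity: float64
print(len(df))    # 0 rows, but a well-typed frame

Because every error branch in get_leaderboard_df now returns a frame of this shape instead of None, callers get a valid, correctly typed DataFrame even when no results are present.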