Ahmed Ahmed committed on
Commit 3a2ac99 · 1 Parent(s): 21bc425

no more dynamic updates

Files changed (2)
  1. app.py +31 -172
  2. logs.txt +234 -107
app.py CHANGED
@@ -40,149 +40,14 @@ def init_leaderboard(dataframe):
40
  ],
41
  )
42
 
43
- def refresh_leaderboard():
44
- import sys
45
- import traceback
46
- import pandas as pd
47
-
48
- try:
49
- sys.stderr.write("=== REFRESH LEADERBOARD DEBUG ===\n")
50
- sys.stderr.write("Refreshing leaderboard data...\n")
51
- sys.stderr.flush()
52
-
53
- # Get fresh leaderboard data
54
- df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
55
-
56
- sys.stderr.write(f"get_leaderboard_df returned: {type(df)}\n")
57
- if df is not None:
58
- sys.stderr.write(f"DataFrame shape: {df.shape}\n")
59
- sys.stderr.write(f"DataFrame columns: {df.columns.tolist()}\n")
60
- sys.stderr.write(f"DataFrame empty: {df.empty}\n")
61
- else:
62
- sys.stderr.write("DataFrame is None!\n")
63
- sys.stderr.flush()
64
-
65
- # Check if DataFrame is valid for leaderboard
66
- if df is None:
67
- sys.stderr.write("DataFrame is None, creating fallback DataFrame\n")
68
- sys.stderr.flush()
69
- # Create a fallback DataFrame
70
- df = create_fallback_dataframe()
71
-
72
- elif df.empty:
73
- sys.stderr.write("DataFrame is empty, creating fallback DataFrame\n")
74
- sys.stderr.flush()
75
- # Create a fallback DataFrame for empty case
76
- df = create_fallback_dataframe()
77
-
78
- elif not all(col in df.columns for col in COLS):
79
- sys.stderr.write(f"DataFrame missing required columns. Has: {df.columns.tolist()}, Needs: {COLS}\n")
80
- sys.stderr.flush()
81
- # Create a fallback DataFrame for missing columns
82
- df = create_fallback_dataframe()
83
-
84
- sys.stderr.write(f"Final DataFrame for leaderboard - Shape: {df.shape}, Columns: {df.columns.tolist()}\n")
85
- sys.stderr.flush()
86
-
87
- # Ensure DataFrame has the exact columns expected
88
- for col in COLS:
89
- if col not in df.columns:
90
- sys.stderr.write(f"Adding missing column: {col}\n")
91
- if col in BENCHMARK_COLS or col == AutoEvalColumn.average.name:
92
- df[col] = 0.0
93
- elif col == AutoEvalColumn.model.name:
94
- df[col] = "Unknown Model"
95
- elif col == AutoEvalColumn.model_type_symbol.name:
96
- df[col] = "?"
97
- else:
98
- df[col] = ""
99
- sys.stderr.flush()
100
-
101
- # Reorder columns to match expected order
102
- df = df[COLS]
103
-
104
- sys.stderr.write("Creating leaderboard component...\n")
105
- sys.stderr.flush()
106
-
107
- new_leaderboard = init_leaderboard(df)
108
- sys.stderr.write("Leaderboard component created successfully\n")
109
- sys.stderr.flush()
110
-
111
- return new_leaderboard
112
-
113
- except Exception as e:
114
- error_msg = str(e)
115
- traceback_str = traceback.format_exc()
116
- sys.stderr.write(f"CRITICAL ERROR in refresh_leaderboard: {error_msg}\n")
117
- sys.stderr.write(f"Traceback: {traceback_str}\n")
118
- sys.stderr.flush()
119
-
120
- # Create emergency fallback leaderboard
121
- try:
122
- sys.stderr.write("Creating emergency fallback leaderboard...\n")
123
- sys.stderr.flush()
124
- fallback_df = create_fallback_dataframe()
125
- return init_leaderboard(fallback_df)
126
- except Exception as fallback_error:
127
- sys.stderr.write(f"Even fallback failed: {fallback_error}\n")
128
- sys.stderr.flush()
129
- raise Exception(f"Complete leaderboard failure: {error_msg}")
130
-
131
- def create_fallback_dataframe():
132
- """Create a minimal valid DataFrame that won't crash the leaderboard"""
133
- import pandas as pd
134
- import sys
135
-
136
- sys.stderr.write("Creating fallback DataFrame...\n")
137
- sys.stderr.flush()
138
-
139
- # Create minimal valid data
140
- fallback_data = {col: [] for col in COLS}
141
-
142
- # Add one dummy row to prevent leaderboard component from crashing
143
- dummy_row = {}
144
- for col in COLS:
145
- if col in BENCHMARK_COLS or col == AutoEvalColumn.average.name:
146
- dummy_row[col] = 0.0
147
- elif col == AutoEvalColumn.model.name:
148
- dummy_row[col] = "No models evaluated yet"
149
- elif col == AutoEvalColumn.model_type_symbol.name:
150
- dummy_row[col] = "?"
151
- elif col == AutoEvalColumn.precision.name:
152
- dummy_row[col] = "float16"
153
- elif col == AutoEvalColumn.model_type.name:
154
- dummy_row[col] = "pretrained"
155
- elif col == AutoEvalColumn.weight_type.name:
156
- dummy_row[col] = "Original"
157
- elif col == AutoEvalColumn.architecture.name:
158
- dummy_row[col] = "Unknown"
159
- elif col == AutoEvalColumn.still_on_hub.name:
160
- dummy_row[col] = True
161
- elif col == AutoEvalColumn.license.name:
162
- dummy_row[col] = "Unknown"
163
- elif col == AutoEvalColumn.params.name:
164
- dummy_row[col] = 0.0
165
- elif col == AutoEvalColumn.likes.name:
166
- dummy_row[col] = 0.0
167
- elif col == AutoEvalColumn.revision.name:
168
- dummy_row[col] = ""
169
- else:
170
- dummy_row[col] = ""
171
-
172
- df = pd.DataFrame([dummy_row])
173
- sys.stderr.write(f"Fallback DataFrame created with shape: {df.shape}\n")
174
- sys.stderr.write(f"Fallback DataFrame columns: {df.columns.tolist()}\n")
175
- sys.stderr.flush()
176
-
177
- return df
178
-
179
  def run_perplexity_test(model_name, revision, precision):
180
  """Run perplexity evaluation on demand."""
181
  import sys
182
  import traceback
 
183
 
184
  if not model_name:
185
- return "Please enter a model name.", None
186
 
187
  try:
188
  # Use stderr for more reliable logging in HF Spaces
@@ -197,37 +62,24 @@ def run_perplexity_test(model_name, revision, precision):
197
  sys.stderr.flush()
198
 
199
  if success:
200
- try:
201
- # Try to refresh leaderboard
202
- sys.stderr.write("Attempting to refresh leaderboard...\n")
203
- sys.stderr.flush()
204
-
205
- new_leaderboard = refresh_leaderboard()
206
-
207
- if new_leaderboard is not None:
208
- sys.stderr.write("Leaderboard refresh successful\n")
209
- sys.stderr.flush()
210
- return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\nResults saved and leaderboard updated.", new_leaderboard
211
- else:
212
- sys.stderr.write("Leaderboard refresh returned None\n")
213
- sys.stderr.flush()
214
- return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\n⚠️ Results saved but leaderboard update returned None.\n\nPlease refresh the page to see updated results.", None
215
-
216
- except Exception as refresh_error:
217
- # If leaderboard refresh fails, still show success but don't update leaderboard
218
- error_msg = str(refresh_error)
219
- traceback_str = traceback.format_exc()
220
- sys.stderr.write(f"Leaderboard refresh failed: {error_msg}\n")
221
- sys.stderr.write(f"Traceback: {traceback_str}\n")
222
- sys.stderr.flush()
223
-
224
- # Check if it's the specific "must have a value set" error
225
- if "must have a value set" in error_msg.lower():
226
- return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\n⚠️ Results saved but leaderboard component failed to update due to data structure issue.\n\n**Please refresh the page** to see your results in the main leaderboard.", None
227
- else:
228
- return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\n⚠️ Results saved but leaderboard refresh failed: {error_msg}\n\nPlease refresh the page to see updated results.", None
229
  else:
230
- return f"❌ Evaluation failed: {result}", None
231
 
232
  except Exception as e:
233
  error_msg = str(e)
@@ -235,7 +87,7 @@ def run_perplexity_test(model_name, revision, precision):
235
  sys.stderr.write(f"Critical error in run_perplexity_test: {error_msg}\n")
236
  sys.stderr.write(f"Traceback: {traceback_str}\n")
237
  sys.stderr.flush()
238
- return f"❌ Critical error: {error_msg}", None
239
 
240
  # Initialize results repository and directory
241
  try:
@@ -301,15 +153,22 @@ with demo:
301
 
302
  gr.Markdown("""
303
  ### Tips:
304
- - Check stderr logs in HF Spaces for detailed debugging information
305
- - If evaluation succeeds but leaderboard doesn't update, try refreshing the page
306
- - Example models to test: `openai-community/gpt2`, `EleutherAI/gpt-neo-1.3B`
307
  """)
308
 
309
  test_button.click(
310
  run_perplexity_test,
311
  [model_name, revision, precision],
312
- [result, leaderboard]
313
  )
314
 
315
  demo.queue(default_concurrency_limit=5).launch()
43
  def run_perplexity_test(model_name, revision, precision):
44
  """Run perplexity evaluation on demand."""
45
  import sys
46
  import traceback
47
+ import gradio as gr
48
 
49
  if not model_name:
50
+ return "Please enter a model name."
51
 
52
  try:
53
  # Use stderr for more reliable logging in HF Spaces
 
62
  sys.stderr.flush()
63
 
64
  if success:
65
+ sys.stderr.write("Evaluation succeeded - results saved to dataset\n")
66
+ sys.stderr.flush()
67
+
68
+ return f"""✅ **Perplexity evaluation completed successfully!**
69
+
70
+ **Model**: {model_name}
71
+ **Perplexity Score**: {result:.4f}
72
+
73
+ 🎉 **Results have been saved to the dataset.**
74
+
75
+ 📋 **To see your results in the leaderboard:**
76
+ 1. Click on the **🏅 Leaderboard** tab above
77
+ 2. Refresh the page (Ctrl+R or Cmd+R)
78
+ 3. Your model should now appear in the rankings!
79
+
80
+ 💡 **Note**: Due to technical limitations with the leaderboard component, results cannot be updated dynamically. The refresh is necessary to see the latest rankings."""
81
  else:
82
+ return f"❌ **Evaluation failed**: {result}"
83
 
84
  except Exception as e:
85
  error_msg = str(e)
 
87
  sys.stderr.write(f"Critical error in run_perplexity_test: {error_msg}\n")
88
  sys.stderr.write(f"Traceback: {traceback_str}\n")
89
  sys.stderr.flush()
90
+ return f"❌ **Critical error**: {error_msg}"
91
 
92
  # Initialize results repository and directory
93
  try:
 
153
 
154
  gr.Markdown("""
155
  ### Tips:
156
+ - **Check stderr logs** in HF Spaces for detailed debugging information
157
+ - **After evaluation completes**, click the 🏅 Leaderboard tab and refresh the page to see results
158
+ - **Example models to test**: `openai-community/gpt2`, `EleutherAI/gpt-neo-1.3B`, `openai-community/gpt2-large`
159
+ - **Lower perplexity scores = better performance** (better at predicting text)
160
+
161
+ ### How it works:
162
+ 1. Enter a model name from Hugging Face Hub
163
+ 2. Click "Run Perplexity Test"
164
+ 3. Wait for evaluation to complete (may take a few minutes for large models)
165
+ 4. Go to 🏅 Leaderboard tab and refresh the page to see your results!
166
  """)
167
 
168
  test_button.click(
169
  run_perplexity_test,
170
  [model_name, revision, precision],
171
+ [result]
172
  )
173
 
174
  demo.queue(default_concurrency_limit=5).launch()
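
The diff above removes the dynamic leaderboard rebuild entirely: `run_perplexity_test` now returns a single Markdown string, and `test_button.click` is wired to the one `result` output instead of `[result, leaderboard]`. The sketch below shows that simplified wiring in isolation. It is a minimal illustration, not the Space's actual code: the component names mirror the diff, but the callback body is a stub, and details such as the precision choices and the use of `gr.Markdown` for the output are assumptions.

```python
import gradio as gr


def run_perplexity_test(model_name: str, revision: str, precision: str) -> str:
    # Stub: the real Space loads the model and computes perplexity here.
    if not model_name:
        return "Please enter a model name."
    return (
        f"✅ Evaluation finished for **{model_name}** "
        f"(revision `{revision}`, precision `{precision}`).\n\n"
        "Open the 🏅 Leaderboard tab and refresh the page to see updated rankings."
    )


with gr.Blocks() as demo:
    model_name = gr.Textbox(label="Model name")  # e.g. openai-community/gpt2
    revision = gr.Textbox(label="Revision", value="main")
    precision = gr.Dropdown(["float16", "float32"], value="float16", label="Precision")  # assumed choices
    test_button = gr.Button("Run Perplexity Test")
    result = gr.Markdown()  # single status output

    # One output component; no Leaderboard object is rebuilt inside the callback.
    test_button.click(run_perplexity_test, [model_name, revision, precision], [result])

if __name__ == "__main__":
    demo.queue(default_concurrency_limit=5).launch()
```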
logs.txt CHANGED
@@ -1,39 +1,18 @@
1
- ==== Application Startup at 2025-07-25 22:55:49 =====
2
 
 
 
3
 
4
- .gitattributes: 0%| | 0.00/2.46k [00:00<?, ?B/s]
5
- .gitattributes: 100%|██████████| 2.46k/2.46k [00:00<00:00, 10.5MB/s]
6
-
7
- (…)enai-community_gpt2_20250725_231201.json: 0%| | 0.00/209 [00:00<?, ?B/s]
8
- (…)enai-community_gpt2_20250725_231201.json: 100%|██████████| 209/209 [00:00<00:00, 1.71MB/s]
9
-
10
- (…)enai-community_gpt2_20250725_233155.json: 0%| | 0.00/209 [00:00<?, ?B/s]
11
- (…)enai-community_gpt2_20250725_233155.json: 100%|██████████| 209/209 [00:00<00:00, 1.26MB/s]
12
-
13
- (…)enai-community_gpt2_20250725_235115.json: 0%| | 0.00/209 [00:00<?, ?B/s]
14
- (…)enai-community_gpt2_20250725_235115.json: 100%|██████████| 209/209 [00:00<00:00, 2.02MB/s]
15
-
16
- (…)enai-community_gpt2_20250725_235748.json: 0%| | 0.00/209 [00:00<?, ?B/s]
17
- (…)enai-community_gpt2_20250725_235748.json: 100%|██████████| 209/209 [00:00<00:00, 2.08MB/s]
18
-
19
- (…)enai-community_gpt2_20250726_000358.json: 0%| | 0.00/209 [00:00<?, ?B/s]
20
- (…)enai-community_gpt2_20250726_000358.json: 100%|██████████| 209/209 [00:00<00:00, 1.54MB/s]
21
-
22
- (…)enai-community_gpt2_20250726_000650.json: 0%| | 0.00/209 [00:00<?, ?B/s]
23
- (…)enai-community_gpt2_20250726_000650.json: 100%|██████████| 209/209 [00:00<00:00, 2.35MB/s]
24
-
25
- === Starting leaderboard creation ===
26
- Looking for results in: ./eval-results
27
- Expected columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
28
- Benchmark columns: ['Perplexity']
29
 
30
- Searching for result files in: ./eval-results
31
- Found 6 result files
 
 
32
 
33
  Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_231201.json
34
 
35
  config.json: 0%| | 0.00/665 [00:00<?, ?B/s]
36
- config.json: 100%|██████████| 665/665 [00:00<00:00, 6.14MB/s]
37
  Created result object for: openai-community/gpt2
38
  Added new result for openai-community_gpt2_float16
39
 
@@ -57,112 +36,176 @@ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_2
57
  Created result object for: openai-community/gpt2
58
  Updated existing result for openai-community_gpt2_float16
59
 
60
- Processing 1 evaluation results
61
 
62
  Converting result to dict for: openai-community/gpt2
63
 
 
64
  Processing result for model: openai-community/gpt2
65
  Raw results: {'perplexity': 20.663532257080078}
66
  Calculated average score: 69.7162958010531
67
- Added perplexity score 20.663532257080078 under column Perplexity
68
- Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
 
 
69
  Successfully converted and added result
70
 
71
- Returning 1 processed results
72
 
73
- Found 1 raw results
 
74
75
  Processing result for model: openai-community/gpt2
76
  Raw results: {'perplexity': 20.663532257080078}
77
  Calculated average score: 69.7162958010531
78
- Added perplexity score 20.663532257080078 under column Perplexity
79
- Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
80
- Successfully processed result 1/1: openai-community/gpt2
 
 
81
 
82
- Converted to 1 JSON records
83
  Sample record keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
84
 
85
  Created DataFrame with columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
86
- DataFrame shape: (1, 14)
87
 
88
  Sorted DataFrame by average
89
 
90
  Selected and rounded columns
91
 
92
- Final DataFrame shape after filtering: (1, 12)
93
  Final columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
 
94
 
95
  === Initializing Leaderboard ===
96
- DataFrame shape: (1, 12)
97
  DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
98
  * Running on local URL: http://0.0.0.0:7860, with SSR ⚡ (experimental, to disable set `ssr=False` in `launch()`)
99
 
100
  To create a public link, set `share=True` in `launch()`.
101
 
102
- === Running Perplexity Test ===
103
- Model: EleutherAI/gpt-neo-1.3B
104
  Revision: main
105
  Precision: float16
106
- Starting dynamic evaluation for EleutherAI/gpt-neo-1.3B
107
  Running perplexity evaluation...
108
- Loading model: EleutherAI/gpt-neo-1.3B (revision: main)
109
  Loading tokenizer...
110
 
111
- tokenizer_config.json: 0%| | 0.00/200 [00:00<?, ?B/s]
112
- tokenizer_config.json: 100%|██████████| 200/200 [00:00<00:00, 1.64MB/s]
113
 
114
- config.json: 0%| | 0.00/1.35k [00:00<?, ?B/s]
115
- config.json: 100%|██████████| 1.35k/1.35k [00:00<00:00, 9.77MB/s]
116
 
117
- vocab.json: 0%| | 0.00/798k [00:00<?, ?B/s]
118
- vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 27.9MB/s]
119
 
120
  merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]
121
- merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 3.54MB/s]
122
 
123
- special_tokens_map.json: 0%| | 0.00/90.0 [00:00<?, ?B/s]
124
- special_tokens_map.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 1.05MB/s]
125
  Tokenizer loaded successfully
126
  Loading model...
127
 
128
- model.safetensors: 0%| | 0.00/5.31G [00:00<?, ?B/s]
129
- model.safetensors: 0%| | 778k/5.31G [00:01<2:15:00, 656kB/s]
130
- model.safetensors: 0%| | 7.69M/5.31G [00:02<23:51, 3.70MB/s]
131
- model.safetensors: 1%|▏ | 74.7M/5.31G [00:03<03:29, 25.0MB/s]
132
- model.safetensors: 9%|▉ | 496M/5.31G [00:04<00:31, 153MB/s]
133
- model.safetensors: 19%|█▉ | 1.03G/5.31G [00:06<00:16, 263MB/s]
134
- model.safetensors: 25%|██▍ | 1.32G/5.31G [00:07<00:16, 235MB/s]
135
- model.safetensors: 38%|███▊ | 1.99G/5.31G [00:08<00:09, 346MB/s]
136
- model.safetensors: 47%|████▋ | 2.51G/5.31G [00:09<00:07, 379MB/s]
137
- model.safetensors: 59%|█████▊ | 3.11G/5.31G [00:10<00:05, 429MB/s]
138
- model.safetensors: 69%|██████▊ | 3.65G/5.31G [00:11<00:03, 451MB/s]
139
- model.safetensors: 80%|███████▉ | 4.24G/5.31G [00:13<00:02, 477MB/s]
140
- model.safetensors: 91%|█████████ | 4.84G/5.31G [00:14<00:00, 494MB/s]
141
- model.safetensors: 100%|██████████| 5.31G/5.31G [00:14<00:00, 355MB/s]
142
  Model loaded successfully
143
  Tokenizing input text...
144
  Tokenized input shape: torch.Size([1, 141])
145
  Moved inputs to device: cpu
146
  Running forward pass...
147
- Calculated loss: 1.78515625
148
- Final perplexity: 5.9609375
149
- Perplexity evaluation completed: 5.9609375
150
- Created result structure: {'config': {'model_dtype': 'torch.float16', 'model_name': 'EleutherAI/gpt-neo-1.3B', 'model_sha': 'main'}, 'results': {'perplexity': {'perplexity': 5.9609375}}}
151
- Saving result to: ./eval-results/EleutherAI/results_EleutherAI_gpt-neo-1.3B_20250726_010247.json
 
152
  Result file saved locally
153
  Uploading to HF dataset: ahmedsqrd/results
154
  Upload completed successfully
155
- Evaluation result - Success: True, Result: 5.9609375
156
  Attempting to refresh leaderboard...
 
157
  Refreshing leaderboard data...
158
 
159
- === Starting leaderboard creation ===
 
160
  Looking for results in: ./eval-results
161
  Expected columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
162
  Benchmark columns: ['Perplexity']
163
 
164
  Searching for result files in: ./eval-results
165
- Found 7 result files
 
 
 
 
166
 
167
  Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_231201.json
168
  Created result object for: openai-community/gpt2
@@ -188,67 +231,151 @@ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_2
188
  Created result object for: openai-community/gpt2
189
  Updated existing result for openai-community_gpt2_float16
190
 
191
- Processing file: ./eval-results/EleutherAI/results_EleutherAI_gpt-neo-1.3B_20250726_010247.json
192
- Created result object for: EleutherAI/gpt-neo-1.3B
193
- Added new result for EleutherAI_gpt-neo-1.3B_float16
194
 
195
- Processing 2 evaluation results
196
-
197
- Converting result to dict for: openai-community/gpt2
198
-
199
- Processing result for model: openai-community/gpt2
200
- Raw results: {'perplexity': 20.663532257080078}
201
- Calculated average score: 69.7162958010531
202
- Added perplexity score 20.663532257080078 under column Perplexity
203
- Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
204
- Successfully converted and added result
205
 
206
  Converting result to dict for: EleutherAI/gpt-neo-1.3B
207
 
 
208
  Processing result for model: EleutherAI/gpt-neo-1.3B
209
  Raw results: {'perplexity': 5.9609375}
210
  Calculated average score: 82.1477223263516
211
- Added perplexity score 5.9609375 under column Perplexity
212
- Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
 
 
213
  Successfully converted and added result
214
 
215
- Returning 2 processed results
216
-
217
- Found 2 raw results
218
 
 
219
  Processing result for model: openai-community/gpt2
220
  Raw results: {'perplexity': 20.663532257080078}
221
  Calculated average score: 69.7162958010531
222
- Added perplexity score 20.663532257080078 under column Perplexity
223
- Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
224
- Successfully processed result 1/2: openai-community/gpt2
225
 
 
226
  Processing result for model: EleutherAI/gpt-neo-1.3B
227
  Raw results: {'perplexity': 5.9609375}
228
  Calculated average score: 82.1477223263516
229
- Added perplexity score 5.9609375 under column Perplexity
230
- Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
231
- Successfully processed result 2/2: EleutherAI/gpt-neo-1.3B
232
-
233
- Converted to 2 JSON records
234
  Sample record keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
235
 
236
  Created DataFrame with columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
237
- DataFrame shape: (2, 14)
238
 
239
  Sorted DataFrame by average
240
 
241
  Selected and rounded columns
242
 
243
- Final DataFrame shape after filtering: (2, 12)
244
  Final columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
245
- Got DataFrame with shape: (2, 12)
 
 
246
  DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
247
- Creating leaderboard with valid DataFrame
 
 
248
 
249
  === Initializing Leaderboard ===
250
- DataFrame shape: (2, 12)
251
  DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
 
252
  Leaderboard refresh successful
253
  Traceback (most recent call last):
254
  File "/usr/local/lib/python3.10/site-packages/gradio/queueing.py", line 625, in process_events
 
 
1
 
2
+ Searching for result files in: ./eval-results
3
+ Found 7 result files
4
 
5
+ Processing file: ./eval-results/EleutherAI/results_EleutherAI_gpt-neo-1.3B_20250726_010247.json
6
 
7
+ config.json: 0%| | 0.00/1.35k [00:00<?, ?B/s]
8
+ config.json: 100%|██████████| 1.35k/1.35k [00:00<00:00, 17.2MB/s]
9
+ Created result object for: EleutherAI/gpt-neo-1.3B
10
+ Added new result for EleutherAI_gpt-neo-1.3B_float16
11
 
12
  Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_231201.json
13
 
14
  config.json: 0%| | 0.00/665 [00:00<?, ?B/s]
15
+ config.json: 100%|██████████| 665/665 [00:00<00:00, 8.83MB/s]
16
  Created result object for: openai-community/gpt2
17
  Added new result for openai-community_gpt2_float16
18
 
 
36
  Created result object for: openai-community/gpt2
37
  Updated existing result for openai-community_gpt2_float16
38
 
39
+ Processing 2 evaluation results
40
+
41
+ Converting result to dict for: EleutherAI/gpt-neo-1.3B
42
+
43
+ === PROCESSING RESULT TO_DICT ===
44
+ Processing result for model: EleutherAI/gpt-neo-1.3B
45
+ Raw results: {'perplexity': 5.9609375}
46
+ Model precision: Precision.float16
47
+ Model type: ModelType.PT
48
+ Weight type: WeightType.Original
49
+ Available tasks: ['task0']
50
+ Looking for task: perplexity in results
51
+ Found score for perplexity: 5.9609375
52
+ Converted score: 82.1477223263516
53
+ Calculated average score: 82.1477223263516
54
+ Created base data_dict with 13 columns
55
+ Added task score: Perplexity = 5.9609375
56
+ Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
57
+ === END PROCESSING RESULT TO_DICT ===
58
+ Successfully converted and added result
59
 
60
  Converting result to dict for: openai-community/gpt2
61
 
62
+ === PROCESSING RESULT TO_DICT ===
63
  Processing result for model: openai-community/gpt2
64
  Raw results: {'perplexity': 20.663532257080078}
65
+ Model precision: Precision.float16
66
+ Model type: ModelType.PT
67
+ Weight type: WeightType.Original
68
+ Available tasks: ['task0']
69
+ Looking for task: perplexity in results
70
+ Found score for perplexity: 20.663532257080078
71
+ Converted score: 69.7162958010531
72
  Calculated average score: 69.7162958010531
73
+ Created base data_dict with 13 columns
74
+ Added task score: Perplexity = 20.663532257080078
75
+ Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
76
+ === END PROCESSING RESULT TO_DICT ===
77
  Successfully converted and added result
78
 
79
+ Returning 2 processed results
80
 
81
+ Found 2 raw results
82
+ Processing result 1/2: EleutherAI/gpt-neo-1.3B
83
 
84
+ === PROCESSING RESULT TO_DICT ===
85
+ Processing result for model: EleutherAI/gpt-neo-1.3B
86
+ Raw results: {'perplexity': 5.9609375}
87
+ Model precision: Precision.float16
88
+ Model type: ModelType.PT
89
+ Weight type: WeightType.Original
90
+ Available tasks: ['task0']
91
+ Looking for task: perplexity in results
92
+ Found score for perplexity: 5.9609375
93
+ Converted score: 82.1477223263516
94
+ Calculated average score: 82.1477223263516
95
+ Created base data_dict with 13 columns
96
+ Added task score: Perplexity = 5.9609375
97
+ Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
98
+ === END PROCESSING RESULT TO_DICT ===
99
+ Successfully processed result 1/2: EleutherAI/gpt-neo-1.3B
100
+ Processing result 2/2: openai-community/gpt2
101
+
102
+ === PROCESSING RESULT TO_DICT ===
103
  Processing result for model: openai-community/gpt2
104
  Raw results: {'perplexity': 20.663532257080078}
105
+ Model precision: Precision.float16
106
+ Model type: ModelType.PT
107
+ Weight type: WeightType.Original
108
+ Available tasks: ['task0']
109
+ Looking for task: perplexity in results
110
+ Found score for perplexity: 20.663532257080078
111
+ Converted score: 69.7162958010531
112
  Calculated average score: 69.7162958010531
113
+ Created base data_dict with 13 columns
114
+ Added task score: Perplexity = 20.663532257080078
115
+ Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
116
+ === END PROCESSING RESULT TO_DICT ===
117
+ Successfully processed result 2/2: openai-community/gpt2
118
 
119
+ Converted to 2 JSON records
120
  Sample record keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
121
 
122
  Created DataFrame with columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
123
+ DataFrame shape: (2, 14)
124
 
125
  Sorted DataFrame by average
126
 
127
  Selected and rounded columns
128
 
129
+ Final DataFrame shape after filtering: (2, 12)
130
  Final columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
131
+ === FINAL RESULT: DataFrame with 2 rows and 12 columns ===
132
 
133
  === Initializing Leaderboard ===
134
+ DataFrame shape: (2, 12)
135
  DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
136
  * Running on local URL: http://0.0.0.0:7860, with SSR ⚡ (experimental, to disable set `ssr=False` in `launch()`)
137
 
138
  To create a public link, set `share=True` in `launch()`.
139
 
140
+ === RUNNING PERPLEXITY TEST ===
141
+ Model: openai-community/gpt2-large
142
  Revision: main
143
  Precision: float16
144
+ Starting dynamic evaluation for openai-community/gpt2-large
145
  Running perplexity evaluation...
146
+ Loading model: openai-community/gpt2-large (revision: main)
147
  Loading tokenizer...
148
 
149
+ tokenizer_config.json: 0%| | 0.00/26.0 [00:00<?, ?B/s]
150
+ tokenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 183kB/s]
151
 
152
+ config.json: 0%| | 0.00/666 [00:00<?, ?B/s]
153
+ config.json: 100%|██████████| 666/666 [00:00<00:00, 7.11MB/s]
154
 
155
+ vocab.json: 0%| | 0.00/1.04M [00:00<?, ?B/s]
156
+ vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 45.7MB/s]
157
 
158
  merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]
159
+ merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 44.9MB/s]
160
 
161
+ tokenizer.json: 0%| | 0.00/1.36M [00:00<?, ?B/s]
162
+ tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 25.3MB/s]
163
  Tokenizer loaded successfully
164
  Loading model...
165
 
166
+ model.safetensors: 0%| | 0.00/3.25G [00:00<?, ?B/s]
167
+ model.safetensors: 0%| | 3.99M/3.25G [00:01<18:26, 2.93MB/s]
168
+ model.safetensors: 4%|▍ | 138M/3.25G [00:02<00:47, 65.1MB/s]
169
+ model.safetensors: 7%|▋ | 235M/3.25G [00:03<00:46, 65.4MB/s]
170
+ model.safetensors: 28%|██▊ | 905M/3.25G [00:05<00:09, 258MB/s]
171
+ model.safetensors: 46%|████▋ | 1.51G/3.25G [00:06<00:04, 360MB/s]
172
+ model.safetensors: 71%|███████ | 2.31G/3.25G [00:07<00:01, 484MB/s]
173
+ model.safetensors: 98%|█████████▊| 3.18G/3.25G [00:08<00:00, 593MB/s]
174
+ model.safetensors: 100%|██████████| 3.25G/3.25G [00:08<00:00, 390MB/s]
175
+
176
+ generation_config.json: 0%| | 0.00/124 [00:00<?, ?B/s]
177
+ generation_config.json: 100%|██████████| 124/124 [00:00<00:00, 1.04MB/s]
 
 
178
  Model loaded successfully
179
  Tokenizing input text...
180
  Tokenized input shape: torch.Size([1, 141])
181
  Moved inputs to device: cpu
182
  Running forward pass...
183
+ `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
184
+ Calculated loss: 2.1944427490234375
185
+ Final perplexity: 8.974998474121094
186
+ Perplexity evaluation completed: 8.974998474121094
187
+ Created result structure: {'config': {'model_dtype': 'torch.float16', 'model_name': 'openai-community/gpt2-large', 'model_sha': 'main'}, 'results': {'perplexity': {'perplexity': 8.974998474121094}}}
188
+ Saving result to: ./eval-results/openai-community/results_openai-community_gpt2-large_20250726_013038.json
189
  Result file saved locally
190
  Uploading to HF dataset: ahmedsqrd/results
191
  Upload completed successfully
192
+ Evaluation result - Success: True, Result: 8.974998474121094
193
  Attempting to refresh leaderboard...
194
+ === REFRESH LEADERBOARD DEBUG ===
195
  Refreshing leaderboard data...
196
 
197
+ === GET_LEADERBOARD_DF DEBUG ===
198
+ Starting leaderboard creation...
199
  Looking for results in: ./eval-results
200
  Expected columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
201
  Benchmark columns: ['Perplexity']
202
 
203
  Searching for result files in: ./eval-results
204
+ Found 8 result files
205
+
206
+ Processing file: ./eval-results/EleutherAI/results_EleutherAI_gpt-neo-1.3B_20250726_010247.json
207
+ Created result object for: EleutherAI/gpt-neo-1.3B
208
+ Added new result for EleutherAI_gpt-neo-1.3B_float16
209
 
210
  Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_231201.json
211
  Created result object for: openai-community/gpt2
 
231
  Created result object for: openai-community/gpt2
232
  Updated existing result for openai-community_gpt2_float16
233
 
234
+ Processing file: ./eval-results/openai-community/results_openai-community_gpt2-large_20250726_013038.json
235
+ Created result object for: openai-community/gpt2-large
236
+ Added new result for openai-community_gpt2-large_float16
237
 
238
+ Processing 3 evaluation results
239
 
240
  Converting result to dict for: EleutherAI/gpt-neo-1.3B
241
 
242
+ === PROCESSING RESULT TO_DICT ===
243
  Processing result for model: EleutherAI/gpt-neo-1.3B
244
  Raw results: {'perplexity': 5.9609375}
245
+ Model precision: Precision.float16
246
+ Model type: ModelType.PT
247
+ Weight type: WeightType.Original
248
+ Available tasks: ['task0']
249
+ Looking for task: perplexity in results
250
+ Found score for perplexity: 5.9609375
251
+ Converted score: 82.1477223263516
252
  Calculated average score: 82.1477223263516
253
+ Created base data_dict with 13 columns
254
+ Added task score: Perplexity = 5.9609375
255
+ Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
256
+ === END PROCESSING RESULT TO_DICT ===
257
  Successfully converted and added result
258
 
259
+ Converting result to dict for: openai-community/gpt2
 
 
260
 
261
+ === PROCESSING RESULT TO_DICT ===
262
  Processing result for model: openai-community/gpt2
263
  Raw results: {'perplexity': 20.663532257080078}
264
+ Model precision: Precision.float16
265
+ Model type: ModelType.PT
266
+ Weight type: WeightType.Original
267
+ Available tasks: ['task0']
268
+ Looking for task: perplexity in results
269
+ Found score for perplexity: 20.663532257080078
270
+ Converted score: 69.7162958010531
271
  Calculated average score: 69.7162958010531
272
+ Created base data_dict with 13 columns
273
+ Added task score: Perplexity = 20.663532257080078
274
+ Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
275
+ === END PROCESSING RESULT TO_DICT ===
276
+ Successfully converted and added result
277
+
278
+ Converting result to dict for: openai-community/gpt2-large
279
+
280
+ === PROCESSING RESULT TO_DICT ===
281
+ Processing result for model: openai-community/gpt2-large
282
+ Raw results: {'perplexity': 8.974998474121094}
283
+ Model precision: Precision.float16
284
+ Model type: ModelType.PT
285
+ Weight type: WeightType.Original
286
+ Available tasks: ['task0']
287
+ Looking for task: perplexity in results
288
+ Found score for perplexity: 8.974998474121094
289
+ Converted score: 78.05557235640035
290
+ Calculated average score: 78.05557235640035
291
+ Created base data_dict with 13 columns
292
+ Added task score: Perplexity = 8.974998474121094
293
+ Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
294
+ === END PROCESSING RESULT TO_DICT ===
295
+ Successfully converted and added result
296
+
297
+ Returning 3 processed results
298
+
299
+ Found 3 raw results
300
+ Processing result 1/3: EleutherAI/gpt-neo-1.3B
301
 
302
+ === PROCESSING RESULT TO_DICT ===
303
  Processing result for model: EleutherAI/gpt-neo-1.3B
304
  Raw results: {'perplexity': 5.9609375}
305
+ Model precision: Precision.float16
306
+ Model type: ModelType.PT
307
+ Weight type: WeightType.Original
308
+ Available tasks: ['task0']
309
+ Looking for task: perplexity in results
310
+ Found score for perplexity: 5.9609375
311
+ Converted score: 82.1477223263516
312
  Calculated average score: 82.1477223263516
313
+ Created base data_dict with 13 columns
314
+ Added task score: Perplexity = 5.9609375
315
+ Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
316
+ === END PROCESSING RESULT TO_DICT ===
317
+ Successfully processed result 1/3: EleutherAI/gpt-neo-1.3B
318
+ Processing result 2/3: openai-community/gpt2
319
+
320
+ === PROCESSING RESULT TO_DICT ===
321
+ Processing result for model: openai-community/gpt2
322
+ Raw results: {'perplexity': 20.663532257080078}
323
+ Model precision: Precision.float16
324
+ Model type: ModelType.PT
325
+ Weight type: WeightType.Original
326
+ Available tasks: ['task0']
327
+ Looking for task: perplexity in results
328
+ Found score for perplexity: 20.663532257080078
329
+ Converted score: 69.7162958010531
330
+ Calculated average score: 69.7162958010531
331
+ Created base data_dict with 13 columns
332
+ Added task score: Perplexity = 20.663532257080078
333
+ Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
334
+ === END PROCESSING RESULT TO_DICT ===
335
+ Successfully processed result 2/3: openai-community/gpt2
336
+ Processing result 3/3: openai-community/gpt2-large
337
+
338
+ === PROCESSING RESULT TO_DICT ===
339
+ Processing result for model: openai-community/gpt2-large
340
+ Raw results: {'perplexity': 8.974998474121094}
341
+ Model precision: Precision.float16
342
+ Model type: ModelType.PT
343
+ Weight type: WeightType.Original
344
+ Available tasks: ['task0']
345
+ Looking for task: perplexity in results
346
+ Found score for perplexity: 8.974998474121094
347
+ Converted score: 78.05557235640035
348
+ Calculated average score: 78.05557235640035
349
+ Created base data_dict with 13 columns
350
+ Added task score: Perplexity = 8.974998474121094
351
+ Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
352
+ === END PROCESSING RESULT TO_DICT ===
353
+ Successfully processed result 3/3: openai-community/gpt2-large
354
+
355
+ Converted to 3 JSON records
356
  Sample record keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
357
 
358
  Created DataFrame with columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
359
+ DataFrame shape: (3, 14)
360
 
361
  Sorted DataFrame by average
362
 
363
  Selected and rounded columns
364
 
365
+ Final DataFrame shape after filtering: (3, 12)
366
  Final columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
367
+ === FINAL RESULT: DataFrame with 3 rows and 12 columns ===
368
+ get_leaderboard_df returned: <class 'pandas.core.frame.DataFrame'>
369
+ DataFrame shape: (3, 12)
370
  DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
371
+ DataFrame empty: False
372
+ Final DataFrame for leaderboard - Shape: (3, 12), Columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
373
+ Creating leaderboard component...
374
 
375
  === Initializing Leaderboard ===
376
+ DataFrame shape: (3, 12)
377
  DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
378
+ Leaderboard component created successfully
379
  Leaderboard refresh successful
380
  Traceback (most recent call last):
381
  File "/usr/local/lib/python3.10/site-packages/gradio/queueing.py", line 625, in process_events