Spaces: Runtime error

Ahmed Ahmed committed · Commit 3a2ac99 · 1 Parent(s): 21bc425

no more dynamic updates

Browse files
app.py
CHANGED
@@ -40,149 +40,14 @@ def init_leaderboard(dataframe):
         ],
     )
 
-def refresh_leaderboard():
-    import sys
-    import traceback
-    import pandas as pd
-
-    try:
-        sys.stderr.write("=== REFRESH LEADERBOARD DEBUG ===\n")
-        sys.stderr.write("Refreshing leaderboard data...\n")
-        sys.stderr.flush()
-
-        # Get fresh leaderboard data
-        df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
-
-        sys.stderr.write(f"get_leaderboard_df returned: {type(df)}\n")
-        if df is not None:
-            sys.stderr.write(f"DataFrame shape: {df.shape}\n")
-            sys.stderr.write(f"DataFrame columns: {df.columns.tolist()}\n")
-            sys.stderr.write(f"DataFrame empty: {df.empty}\n")
-        else:
-            sys.stderr.write("DataFrame is None!\n")
-        sys.stderr.flush()
-
-        # Check if DataFrame is valid for leaderboard
-        if df is None:
-            sys.stderr.write("DataFrame is None, creating fallback DataFrame\n")
-            sys.stderr.flush()
-            # Create a fallback DataFrame
-            df = create_fallback_dataframe()
-
-        elif df.empty:
-            sys.stderr.write("DataFrame is empty, creating fallback DataFrame\n")
-            sys.stderr.flush()
-            # Create a fallback DataFrame for empty case
-            df = create_fallback_dataframe()
-
-        elif not all(col in df.columns for col in COLS):
-            sys.stderr.write(f"DataFrame missing required columns. Has: {df.columns.tolist()}, Needs: {COLS}\n")
-            sys.stderr.flush()
-            # Create a fallback DataFrame for missing columns
-            df = create_fallback_dataframe()
-
-        sys.stderr.write(f"Final DataFrame for leaderboard - Shape: {df.shape}, Columns: {df.columns.tolist()}\n")
-        sys.stderr.flush()
-
-        # Ensure DataFrame has the exact columns expected
-        for col in COLS:
-            if col not in df.columns:
-                sys.stderr.write(f"Adding missing column: {col}\n")
-                if col in BENCHMARK_COLS or col == AutoEvalColumn.average.name:
-                    df[col] = 0.0
-                elif col == AutoEvalColumn.model.name:
-                    df[col] = "Unknown Model"
-                elif col == AutoEvalColumn.model_type_symbol.name:
-                    df[col] = "?"
-                else:
-                    df[col] = ""
-        sys.stderr.flush()
-
-        # Reorder columns to match expected order
-        df = df[COLS]
-
-        sys.stderr.write("Creating leaderboard component...\n")
-        sys.stderr.flush()
-
-        new_leaderboard = init_leaderboard(df)
-        sys.stderr.write("Leaderboard component created successfully\n")
-        sys.stderr.flush()
-
-        return new_leaderboard
-
-    except Exception as e:
-        error_msg = str(e)
-        traceback_str = traceback.format_exc()
-        sys.stderr.write(f"CRITICAL ERROR in refresh_leaderboard: {error_msg}\n")
-        sys.stderr.write(f"Traceback: {traceback_str}\n")
-        sys.stderr.flush()
-
-        # Create emergency fallback leaderboard
-        try:
-            sys.stderr.write("Creating emergency fallback leaderboard...\n")
-            sys.stderr.flush()
-            fallback_df = create_fallback_dataframe()
-            return init_leaderboard(fallback_df)
-        except Exception as fallback_error:
-            sys.stderr.write(f"Even fallback failed: {fallback_error}\n")
-            sys.stderr.flush()
-            raise Exception(f"Complete leaderboard failure: {error_msg}")
-
-def create_fallback_dataframe():
-    """Create a minimal valid DataFrame that won't crash the leaderboard"""
-    import pandas as pd
-    import sys
-
-    sys.stderr.write("Creating fallback DataFrame...\n")
-    sys.stderr.flush()
-
-    # Create minimal valid data
-    fallback_data = {col: [] for col in COLS}
-
-    # Add one dummy row to prevent leaderboard component from crashing
-    dummy_row = {}
-    for col in COLS:
-        if col in BENCHMARK_COLS or col == AutoEvalColumn.average.name:
-            dummy_row[col] = 0.0
-        elif col == AutoEvalColumn.model.name:
-            dummy_row[col] = "No models evaluated yet"
-        elif col == AutoEvalColumn.model_type_symbol.name:
-            dummy_row[col] = "?"
-        elif col == AutoEvalColumn.precision.name:
-            dummy_row[col] = "float16"
-        elif col == AutoEvalColumn.model_type.name:
-            dummy_row[col] = "pretrained"
-        elif col == AutoEvalColumn.weight_type.name:
-            dummy_row[col] = "Original"
-        elif col == AutoEvalColumn.architecture.name:
-            dummy_row[col] = "Unknown"
-        elif col == AutoEvalColumn.still_on_hub.name:
-            dummy_row[col] = True
-        elif col == AutoEvalColumn.license.name:
-            dummy_row[col] = "Unknown"
-        elif col == AutoEvalColumn.params.name:
-            dummy_row[col] = 0.0
-        elif col == AutoEvalColumn.likes.name:
-            dummy_row[col] = 0.0
-        elif col == AutoEvalColumn.revision.name:
-            dummy_row[col] = ""
-        else:
-            dummy_row[col] = ""
-
-    df = pd.DataFrame([dummy_row])
-    sys.stderr.write(f"Fallback DataFrame created with shape: {df.shape}\n")
-    sys.stderr.write(f"Fallback DataFrame columns: {df.columns.tolist()}\n")
-    sys.stderr.flush()
-
-    return df
-
 def run_perplexity_test(model_name, revision, precision):
     """Run perplexity evaluation on demand."""
     import sys
     import traceback
+    import gradio as gr
 
     if not model_name:
-        return "Please enter a model name."
+        return "Please enter a model name."
 
     try:
         # Use stderr for more reliable logging in HF Spaces

@@ -197,37 +62,24 @@ def run_perplexity_test(model_name, revision, precision):
         sys.stderr.flush()
 
         if success:
-            […]
-            except Exception as refresh_error:
-                # If leaderboard refresh fails, still show success but don't update leaderboard
-                error_msg = str(refresh_error)
-                traceback_str = traceback.format_exc()
-                sys.stderr.write(f"Leaderboard refresh failed: {error_msg}\n")
-                sys.stderr.write(f"Traceback: {traceback_str}\n")
-                sys.stderr.flush()
-
-                # Check if it's the specific "must have a value set" error
-                if "must have a value set" in error_msg.lower():
-                    return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\n⚠️ Results saved but leaderboard component failed to update due to data structure issue.\n\n**Please refresh the page** to see your results in the main leaderboard.", None
-                else:
-                    return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\n⚠️ Results saved but leaderboard refresh failed: {error_msg}\n\nPlease refresh the page to see updated results.", None
+            sys.stderr.write("Evaluation succeeded - results saved to dataset\n")
+            sys.stderr.flush()
+
+            return f"""✅ **Perplexity evaluation completed successfully!**
+
+**Model**: {model_name}
+**Perplexity Score**: {result:.4f}
+
+🎉 **Results have been saved to the dataset.**
+
+📋 **To see your results in the leaderboard:**
+1. Click on the **🏅 Leaderboard** tab above
+2. Refresh the page (Ctrl+R or Cmd+R)
+3. Your model should now appear in the rankings!
+
+💡 **Note**: Due to technical limitations with the leaderboard component, results cannot be updated dynamically. The refresh is necessary to see the latest rankings."""
         else:
-            return f"❌ Evaluation failed
+            return f"❌ **Evaluation failed**: {result}"
 
     except Exception as e:
         error_msg = str(e)

@@ -235,7 +87,7 @@ def run_perplexity_test(model_name, revision, precision):
         sys.stderr.write(f"Critical error in run_perplexity_test: {error_msg}\n")
         sys.stderr.write(f"Traceback: {traceback_str}\n")
         sys.stderr.flush()
-        return f"❌ Critical error
+        return f"❌ **Critical error**: {error_msg}"
 
 # Initialize results repository and directory
 try:

@@ -301,15 +153,22 @@ with demo:
 
     gr.Markdown("""
     ### Tips:
-    - Check stderr logs in HF Spaces for detailed debugging information
-    -
-    - Example models to test
+    - **Check stderr logs** in HF Spaces for detailed debugging information
+    - **After evaluation completes**, click the 🏅 Leaderboard tab and refresh the page to see results
+    - **Example models to test**: `openai-community/gpt2`, `EleutherAI/gpt-neo-1.3B`, `openai-community/gpt2-large`
+    - **Lower perplexity scores = better performance** (better at predicting text)
+
+    ### How it works:
+    1. Enter a model name from Hugging Face Hub
+    2. Click "Run Perplexity Test"
+    3. Wait for evaluation to complete (may take a few minutes for large models)
+    4. Go to 🏅 Leaderboard tab and refresh the page to see your results!
     """)
 
     test_button.click(
         run_perplexity_test,
         [model_name, revision, precision],
-        [result
+        [result]
     )
 
     demo.queue(default_concurrency_limit=5).launch()
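The net effect of this change is that run_perplexity_test now returns a single Markdown string and the click handler writes only to the result component; nothing tries to rebuild the Leaderboard object from inside a callback anymore. Below is a minimal, self-contained sketch of that wiring, not the Space's actual code: component names follow the diff, but the evaluation body is stubbed out and the stub's return message is illustrative only.

# Minimal sketch of the post-commit wiring: one event handler, one Markdown output,
# no dynamic leaderboard refresh. The real Space builds its leaderboard separately
# via init_leaderboard(); that part is omitted here.
import gradio as gr

def run_perplexity_test(model_name, revision, precision):
    """Run perplexity evaluation on demand (evaluation itself stubbed out here)."""
    if not model_name:
        return "Please enter a model name."
    # ... run the evaluation and save the result file here ...
    return f"✅ Perplexity evaluation completed for {model_name} ({precision}, revision {revision})."

with gr.Blocks() as demo:
    model_name = gr.Textbox(label="Model name")
    revision = gr.Textbox(label="Revision", value="main")
    precision = gr.Dropdown(choices=["float16", "bfloat16"], value="float16", label="Precision")
    test_button = gr.Button("Run Perplexity Test")
    result = gr.Markdown()

    # The outputs list contains only `result`; the leaderboard tab is refreshed
    # manually by reloading the page, as the Tips section in the diff explains.
    test_button.click(run_perplexity_test, [model_name, revision, precision], [result])

demo.queue(default_concurrency_limit=5).launch()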
logs.txt
CHANGED
@@ -1,39 +1,18 @@
-==== Application Startup at 2025-07-25 22:55:49 =====
 
+Searching for result files in: ./eval-results
+Found 7 result files
 
-.gitattributes: 100%|██████████| 2.46k/2.46k [00:00<00:00, 10.5MB/s]
-
-(…)enai-community_gpt2_20250725_231201.json: 0%| | 0.00/209 [00:00<?, ?B/s]
-(…)enai-community_gpt2_20250725_231201.json: 100%|██████████| 209/209 [00:00<00:00, 1.71MB/s]
-
-(…)enai-community_gpt2_20250725_233155.json: 0%| | 0.00/209 [00:00<?, ?B/s]
-(…)enai-community_gpt2_20250725_233155.json: 100%|██████████| 209/209 [00:00<00:00, 1.26MB/s]
-
-(…)enai-community_gpt2_20250725_235115.json: 0%| | 0.00/209 [00:00<?, ?B/s]
-(…)enai-community_gpt2_20250725_235115.json: 100%|██████████| 209/209 [00:00<00:00, 2.02MB/s]
-
-(…)enai-community_gpt2_20250725_235748.json: 0%| | 0.00/209 [00:00<?, ?B/s]
-(…)enai-community_gpt2_20250725_235748.json: 100%|██████████| 209/209 [00:00<00:00, 2.08MB/s]
-
-(…)enai-community_gpt2_20250726_000358.json: 0%| | 0.00/209 [00:00<?, ?B/s]
-(…)enai-community_gpt2_20250726_000358.json: 100%|██████████| 209/209 [00:00<00:00, 1.54MB/s]
-
-(…)enai-community_gpt2_20250726_000650.json: 0%| | 0.00/209 [00:00<?, ?B/s]
-(…)enai-community_gpt2_20250726_000650.json: 100%|██████████| 209/209 [00:00<00:00, 2.35MB/s]
-
-=== Starting leaderboard creation ===
-Looking for results in: ./eval-results
-Expected columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
-Benchmark columns: ['Perplexity']
+Processing file: ./eval-results/EleutherAI/results_EleutherAI_gpt-neo-1.3B_20250726_010247.json
 
+config.json: 0%| | 0.00/1.35k [00:00<?, ?B/s]
+config.json: 100%|██████████| 1.35k/1.35k [00:00<00:00, 17.2MB/s]
+Created result object for: EleutherAI/gpt-neo-1.3B
+Added new result for EleutherAI_gpt-neo-1.3B_float16
 
 Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_231201.json
 
 config.json: 0%| | 0.00/665 [00:00<?, ?B/s]
-config.json: 100%|██████████| 665/665 [00:00<00:00,
+config.json: 100%|██████████| 665/665 [00:00<00:00, 8.83MB/s]
 Created result object for: openai-community/gpt2
 Added new result for openai-community_gpt2_float16
 

@@ -57,112 +36,176 @@ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_2
 Created result object for: openai-community/gpt2
 Updated existing result for openai-community_gpt2_float16
 
-Processing
+Processing 2 evaluation results
+
+Converting result to dict for: EleutherAI/gpt-neo-1.3B
+
+=== PROCESSING RESULT TO_DICT ===
+Processing result for model: EleutherAI/gpt-neo-1.3B
+Raw results: {'perplexity': 5.9609375}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 5.9609375
+Converted score: 82.1477223263516
+Calculated average score: 82.1477223263516
+Created base data_dict with 13 columns
+Added task score: Perplexity = 5.9609375
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully converted and added result
 
 Converting result to dict for: openai-community/gpt2
 
+=== PROCESSING RESULT TO_DICT ===
 Processing result for model: openai-community/gpt2
 Raw results: {'perplexity': 20.663532257080078}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 20.663532257080078
+Converted score: 69.7162958010531
 Calculated average score: 69.7162958010531
+Created base data_dict with 13 columns
+Added task score: Perplexity = 20.663532257080078
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
 Successfully converted and added result
 
-Returning
+Returning 2 processed results
 
-Found
+Found 2 raw results
+Processing result 1/2: EleutherAI/gpt-neo-1.3B
 
+=== PROCESSING RESULT TO_DICT ===
+Processing result for model: EleutherAI/gpt-neo-1.3B
+Raw results: {'perplexity': 5.9609375}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 5.9609375
+Converted score: 82.1477223263516
+Calculated average score: 82.1477223263516
+Created base data_dict with 13 columns
+Added task score: Perplexity = 5.9609375
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully processed result 1/2: EleutherAI/gpt-neo-1.3B
+Processing result 2/2: openai-community/gpt2
+
+=== PROCESSING RESULT TO_DICT ===
 Processing result for model: openai-community/gpt2
 Raw results: {'perplexity': 20.663532257080078}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 20.663532257080078
+Converted score: 69.7162958010531
 Calculated average score: 69.7162958010531
+Created base data_dict with 13 columns
+Added task score: Perplexity = 20.663532257080078
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully processed result 2/2: openai-community/gpt2
 
-Converted to
+Converted to 2 JSON records
 Sample record keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
 
 Created DataFrame with columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
-DataFrame shape: (
+DataFrame shape: (2, 14)
 
 Sorted DataFrame by average
 
 Selected and rounded columns
 
-Final DataFrame shape after filtering: (
+Final DataFrame shape after filtering: (2, 12)
 Final columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
+=== FINAL RESULT: DataFrame with 2 rows and 12 columns ===
 
 === Initializing Leaderboard ===
-DataFrame shape: (
+DataFrame shape: (2, 12)
 DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
 * Running on local URL: http://0.0.0.0:7860, with SSR ⚡ (experimental, to disable set `ssr=False` in `launch()`)
 
 To create a public link, set `share=True` in `launch()`.
 
-===
-Model:
+=== RUNNING PERPLEXITY TEST ===
+Model: openai-community/gpt2-large
 Revision: main
 Precision: float16
-Starting dynamic evaluation for
+Starting dynamic evaluation for openai-community/gpt2-large
 Running perplexity evaluation...
-Loading model:
+Loading model: openai-community/gpt2-large (revision: main)
 Loading tokenizer...
 
-tokenizer_config.json: 0%| | 0.00/
-tokenizer_config.json: 100%|██████████|
+tokenizer_config.json: 0%| | 0.00/26.0 [00:00<?, ?B/s]
+tokenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 183kB/s]
 
-config.json: 0%| | 0.00/
-config.json: 100%|██████████|
+config.json: 0%| | 0.00/666 [00:00<?, ?B/s]
+config.json: 100%|██████████| 666/666 [00:00<00:00, 7.11MB/s]
 
-vocab.json: 0%| | 0.00/
-vocab.json: 100%|██████████|
+vocab.json: 0%| | 0.00/1.04M [00:00<?, ?B/s]
+vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 45.7MB/s]
 
 merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]
-merges.txt: 100%|██████████| 456k/456k [00:00<00:00,
+merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 44.9MB/s]
 
+tokenizer.json: 0%| | 0.00/1.36M [00:00<?, ?B/s]
+tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 25.3MB/s]
 Tokenizer loaded successfully
 Loading model...
 
-model.safetensors: 0%| | 0.00/
-model.safetensors: 0%| |
-model.safetensors:
-model.safetensors:
-model.safetensors:
-model.safetensors:
-model.safetensors:
-model.safetensors:
-model.safetensors:
-model.safetensors: 91%|█████████ | 4.84G/5.31G [00:14<00:00, 494MB/s]
-model.safetensors: 100%|██████████| 5.31G/5.31G [00:14<00:00, 355MB/s]
+model.safetensors: 0%| | 0.00/3.25G [00:00<?, ?B/s]
+model.safetensors: 0%| | 3.99M/3.25G [00:01<18:26, 2.93MB/s]
+model.safetensors: 4%|▍ | 138M/3.25G [00:02<00:47, 65.1MB/s]
+model.safetensors: 7%|▋ | 235M/3.25G [00:03<00:46, 65.4MB/s]
+model.safetensors: 28%|██▊ | 905M/3.25G [00:05<00:09, 258MB/s]
+model.safetensors: 46%|████▋ | 1.51G/3.25G [00:06<00:04, 360MB/s]
+model.safetensors: 71%|███████ | 2.31G/3.25G [00:07<00:01, 484MB/s]
+model.safetensors: 98%|█████████▊| 3.18G/3.25G [00:08<00:00, 593MB/s]
+model.safetensors: 100%|██████████| 3.25G/3.25G [00:08<00:00, 390MB/s]
+
+generation_config.json: 0%| | 0.00/124 [00:00<?, ?B/s]
+generation_config.json: 100%|██████████| 124/124 [00:00<00:00, 1.04MB/s]
 Model loaded successfully
 Tokenizing input text...
 Tokenized input shape: torch.Size([1, 141])
 Moved inputs to device: cpu
 Running forward pass...
+`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
+Calculated loss: 2.1944427490234375
+Final perplexity: 8.974998474121094
+Perplexity evaluation completed: 8.974998474121094
+Created result structure: {'config': {'model_dtype': 'torch.float16', 'model_name': 'openai-community/gpt2-large', 'model_sha': 'main'}, 'results': {'perplexity': {'perplexity': 8.974998474121094}}}
+Saving result to: ./eval-results/openai-community/results_openai-community_gpt2-large_20250726_013038.json
 Result file saved locally
 Uploading to HF dataset: ahmedsqrd/results
 Upload completed successfully
-Evaluation result - Success: True, Result:
+Evaluation result - Success: True, Result: 8.974998474121094
 Attempting to refresh leaderboard...
+=== REFRESH LEADERBOARD DEBUG ===
 Refreshing leaderboard data...
 
-===
+=== GET_LEADERBOARD_DF DEBUG ===
+Starting leaderboard creation...
 Looking for results in: ./eval-results
 Expected columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
 Benchmark columns: ['Perplexity']
 
 Searching for result files in: ./eval-results
-Found
+Found 8 result files
+
+Processing file: ./eval-results/EleutherAI/results_EleutherAI_gpt-neo-1.3B_20250726_010247.json
+Created result object for: EleutherAI/gpt-neo-1.3B
+Added new result for EleutherAI_gpt-neo-1.3B_float16
 
 Processing file: ./eval-results/openai-community/results_openai-community_gpt2_20250725_231201.json
 Created result object for: openai-community/gpt2

@@ -188,67 +231,151 @@ Processing file: ./eval-results/openai-community/results_openai-community_gpt2_2
 Created result object for: openai-community/gpt2
 Updated existing result for openai-community_gpt2_float16
 
-Processing file: ./eval-results/
-Created result object for:
-Added new result for
+Processing file: ./eval-results/openai-community/results_openai-community_gpt2-large_20250726_013038.json
+Created result object for: openai-community/gpt2-large
+Added new result for openai-community_gpt2-large_float16
 
-Processing
+Processing 3 evaluation results
 
-Converting result to dict for: openai-community/gpt2
-
-Processing result for model: openai-community/gpt2
-Raw results: {'perplexity': 20.663532257080078}
-Calculated average score: 69.7162958010531
-Added perplexity score 20.663532257080078 under column Perplexity
-Final data dict keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
-Successfully converted and added result
 Converting result to dict for: EleutherAI/gpt-neo-1.3B
 
+=== PROCESSING RESULT TO_DICT ===
 Processing result for model: EleutherAI/gpt-neo-1.3B
 Raw results: {'perplexity': 5.9609375}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 5.9609375
+Converted score: 82.1477223263516
 Calculated average score: 82.1477223263516
+Created base data_dict with 13 columns
+Added task score: Perplexity = 5.9609375
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
 Successfully converted and added result
 
-Found 2 raw results
+Converting result to dict for: openai-community/gpt2
+
+=== PROCESSING RESULT TO_DICT ===
 Processing result for model: openai-community/gpt2
 Raw results: {'perplexity': 20.663532257080078}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 20.663532257080078
+Converted score: 69.7162958010531
 Calculated average score: 69.7162958010531
+Created base data_dict with 13 columns
+Added task score: Perplexity = 20.663532257080078
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully converted and added result
+
+Converting result to dict for: openai-community/gpt2-large
+
+=== PROCESSING RESULT TO_DICT ===
+Processing result for model: openai-community/gpt2-large
+Raw results: {'perplexity': 8.974998474121094}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 8.974998474121094
+Converted score: 78.05557235640035
+Calculated average score: 78.05557235640035
+Created base data_dict with 13 columns
+Added task score: Perplexity = 8.974998474121094
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully converted and added result
+
+Returning 3 processed results
+
+Found 3 raw results
+Processing result 1/3: EleutherAI/gpt-neo-1.3B
 
+=== PROCESSING RESULT TO_DICT ===
 Processing result for model: EleutherAI/gpt-neo-1.3B
 Raw results: {'perplexity': 5.9609375}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 5.9609375
+Converted score: 82.1477223263516
 Calculated average score: 82.1477223263516
+Created base data_dict with 13 columns
+Added task score: Perplexity = 5.9609375
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully processed result 1/3: EleutherAI/gpt-neo-1.3B
+Processing result 2/3: openai-community/gpt2
+
+=== PROCESSING RESULT TO_DICT ===
+Processing result for model: openai-community/gpt2
+Raw results: {'perplexity': 20.663532257080078}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 20.663532257080078
+Converted score: 69.7162958010531
+Calculated average score: 69.7162958010531
+Created base data_dict with 13 columns
+Added task score: Perplexity = 20.663532257080078
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully processed result 2/3: openai-community/gpt2
+Processing result 3/3: openai-community/gpt2-large
+
+=== PROCESSING RESULT TO_DICT ===
+Processing result for model: openai-community/gpt2-large
+Raw results: {'perplexity': 8.974998474121094}
+Model precision: Precision.float16
+Model type: ModelType.PT
+Weight type: WeightType.Original
+Available tasks: ['task0']
+Looking for task: perplexity in results
+Found score for perplexity: 8.974998474121094
+Converted score: 78.05557235640035
+Calculated average score: 78.05557235640035
+Created base data_dict with 13 columns
+Added task score: Perplexity = 8.974998474121094
+Final data dict has 14 columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
+=== END PROCESSING RESULT TO_DICT ===
+Successfully processed result 3/3: openai-community/gpt2-large
+
+Converted to 3 JSON records
 Sample record keys: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
 
 Created DataFrame with columns: ['eval_name', 'Precision', 'Type', 'T', 'Weight type', 'Architecture', 'Model', 'Model sha', 'Average ⬆️', 'Available on the hub', 'Hub License', '#Params (B)', 'Hub ❤️', 'Perplexity']
-DataFrame shape: (
+DataFrame shape: (3, 14)
 
 Sorted DataFrame by average
 
 Selected and rounded columns
 
-Final DataFrame shape after filtering: (
+Final DataFrame shape after filtering: (3, 12)
 Final columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
+=== FINAL RESULT: DataFrame with 3 rows and 12 columns ===
+get_leaderboard_df returned: <class 'pandas.core.frame.DataFrame'>
+DataFrame shape: (3, 12)
 DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
+DataFrame empty: False
+Final DataFrame for leaderboard - Shape: (3, 12), Columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
+Creating leaderboard component...
 
 === Initializing Leaderboard ===
-DataFrame shape: (
+DataFrame shape: (3, 12)
 DataFrame columns: ['T', 'Model', 'Average ⬆️', 'Perplexity', 'Type', 'Architecture', 'Precision', 'Hub License', '#Params (B)', 'Hub ❤️', 'Available on the hub', 'Model sha']
+Leaderboard component created successfully
 Leaderboard refresh successful
 Traceback (most recent call last):
   File "/usr/local/lib/python3.10/site-packages/gradio/queueing.py", line 625, in process_events
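The log trail above ("Calculated loss: 2.1944427490234375" followed by "Final perplexity: 8.974998474121094") is consistent with perplexity being computed as the exponential of the causal-LM loss, and the "Converted score" pairs (20.66 → 69.72, 5.96 → 82.15, 8.97 → 78.06) are consistent with a 100 − 10·ln(perplexity) rescaling. The sketch below reproduces that pipeline under those two inferred formulas; they are read off the logged numbers, not taken from the Space's source, and the evaluation text is a stand-in since the actual 141-token text is not shown on this page.

# Sketch of the measurement the logs trace: forward pass -> mean token loss ->
# perplexity = exp(loss) -> leaderboard score. The score formula is an inference
# from the logged (perplexity, converted score) pairs, not the Space's own code.
import math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "openai-community/gpt2-large"  # model and revision taken from the log
tokenizer = AutoTokenizer.from_pretrained(model_id, revision="main")
model = AutoModelForCausalLM.from_pretrained(model_id, revision="main")  # log reports float16; default dtype kept here for CPU safety
model.eval()

sample_text = "The quick brown fox jumps over the lazy dog."  # stand-in for the Space's evaluation text
inputs = tokenizer(sample_text, return_tensors="pt")

with torch.no_grad():
    # For causal LMs, passing labels=input_ids returns the mean next-token
    # cross-entropy loss over the sequence.
    loss = model(**inputs, labels=inputs["input_ids"]).loss.item()

perplexity = math.exp(loss)
score = 100 - 10 * math.log(perplexity)  # inferred rescaling; algebraically equal to 100 - 10 * loss

# Check against the logged values: exp(2.1944427490234375) ≈ 8.975, and
# 100 - 10 * ln(8.974998474121094) ≈ 78.056, matching "Converted score: 78.05557235640035".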