Spaces: Runtime error

Ahmed Ahmed committed
Commit · f02d36b
1 Parent(s): 3a2ac99

no more dynamic updates

Files changed:
- app.py (+50, -37)
- src/display/formatting.py (+2, -2)
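The change below drops the gradio_leaderboard `Leaderboard` component (whose old help text told users to refresh the page because it could not be updated dynamically) and instead renders results in plain `gr.DataFrame` tables that the click handler refreshes by returning an updated value. A minimal, self-contained sketch of that Gradio pattern follows; the toy `score_model` handler and its fake score are illustrative stand-ins, not the Space's actual evaluation code.

import gradio as gr
import pandas as pd

# Toy stand-in for the Space's results table.
RESULTS = pd.DataFrame(columns=["Model", "Perplexity"])


def score_model(model_name):
    """Hypothetical handler: record a fake score, then return both the status
    message and the refreshed DataFrame so Gradio redraws the table in place."""
    global RESULTS
    if not model_name:
        # gr.update() with no arguments leaves the table output unchanged.
        return "Please enter a model name.", gr.update()
    new_row = pd.DataFrame([{"Model": model_name, "Perplexity": 12.34}])
    RESULTS = pd.concat([RESULTS, new_row], ignore_index=True)
    return f"Scored {model_name}", gr.update(value=RESULTS)


with gr.Blocks() as demo:
    name = gr.Textbox(label="Model name")
    button = gr.Button("Run")
    status = gr.Markdown()
    table = gr.DataFrame(value=RESULTS, interactive=False)
    # The handler's two return values map onto these two outputs, in order.
    button.click(score_model, [name], [status, table])

demo.launch()

In the commit itself, `run_perplexity_test` follows the same shape: it returns `(message, gr.update(value=updated_df))` on success and `(message, gr.update())` otherwise, with the second output wired to `live_results_table`.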
app.py
CHANGED
@@ -1,5 +1,4 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard
 import pandas as pd
 from huggingface_hub import snapshot_download, create_repo
 from huggingface_hub.utils import RepositoryNotFoundError
@@ -21,24 +20,26 @@ from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN, OWNER
 from src.populate import get_leaderboard_df
 from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval

-def …
-…
-…
+def create_results_dataframe():
+    """Create and return the results DataFrame for display"""
+    df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+    if df is None or df.empty:
+        # Return empty DataFrame with proper columns
+        return pd.DataFrame(columns=["Model", "Perplexity", "Average Score", "Type", "Precision"])

-…
-…
-…
+    # Select and rename columns for display
+    display_df = df[[
+        AutoEvalColumn.model.name,
+        "Perplexity",  # This matches the task column name from Tasks.task0.value.col_name
+        AutoEvalColumn.average.name,
+        AutoEvalColumn.model_type.name,
+        AutoEvalColumn.precision.name,
+    ]].copy()

-…
-…
-…
-…
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            AutoEvalColumn.model_type.name,
-            AutoEvalColumn.precision.name,
-        ],
-    )
+    # Rename columns for better display
+    display_df.columns = ["Model", "Perplexity", "Average Score", "Type", "Precision"]
+
+    return display_df

 def run_perplexity_test(model_name, revision, precision):
     """Run perplexity evaluation on demand."""
@@ -47,7 +48,7 @@ def run_perplexity_test(model_name, revision, precision):
     import gradio as gr

     if not model_name:
-        return "Please enter a model name."
+        return "Please enter a model name.", gr.update()

     try:
         # Use stderr for more reliable logging in HF Spaces
@@ -62,24 +63,22 @@ def run_perplexity_test(model_name, revision, precision):
         sys.stderr.flush()

         if success:
-            sys.stderr.write("Evaluation succeeded - results …
+            sys.stderr.write("Evaluation succeeded - updating results table\n")
             sys.stderr.flush()

-            …
+            # Get updated results
+            updated_df = create_results_dataframe()
+
+            success_msg = f"""✅ **Perplexity evaluation completed successfully!**

 **Model**: {model_name}
 **Perplexity Score**: {result:.4f}

-📊 **Results have been saved …
-…
-…
-1. Click on the **🏅 Leaderboard** tab above
-2. Refresh the page (Ctrl+R or Cmd+R)
-3. Your model should now appear in the rankings!
-…
-💡 **Note**: Due to technical limitations with the leaderboard component, results cannot be updated dynamically. The refresh is necessary to see the latest rankings."""
+📊 **Results have been saved and the table below has been updated!**"""
+
+            return success_msg, gr.update(value=updated_df)
         else:
-            return f"❌ **Evaluation failed**: {result}"
+            return f"❌ **Evaluation failed**: {result}", gr.update()

     except Exception as e:
         error_msg = str(e)
@@ -87,7 +86,7 @@ def run_perplexity_test(model_name, revision, precision):
         sys.stderr.write(f"Critical error in run_perplexity_test: {error_msg}\n")
         sys.stderr.write(f"Traceback: {traceback_str}\n")
         sys.stderr.flush()
-        return f"❌ **Critical error**: {error_msg}"
+        return f"❌ **Critical error**: {error_msg}", gr.update()

 # Initialize results repository and directory
 try:
@@ -117,8 +116,8 @@ except Exception as e:
     # Ensure local directory exists even if repo operations fail
     os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)

-# Get initial …
-…
+# Get initial results data
+RESULTS_DF = create_results_dataframe()

 # Create the Gradio interface
 demo = gr.Blocks(css=custom_css)
@@ -127,8 +126,14 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 …
-…
+        with gr.TabItem("📊 Results", elem_id="results-tab", id=0):
+            gr.Markdown("## Model Evaluation Results")
+            results_table = gr.DataFrame(
+                value=RESULTS_DF,
+                headers=["Model", "Perplexity", "Average Score", "Type", "Precision"],
+                interactive=False,
+                wrap=False
+            )

         with gr.TabItem("📝 About", elem_id="about-tab", id=1):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -151,10 +156,18 @@ with demo:
             test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
             result = gr.Markdown()

+            gr.Markdown("## Live Results")
+            live_results_table = gr.DataFrame(
+                value=RESULTS_DF,
+                headers=["Model", "Perplexity", "Average Score", "Type", "Precision"],
+                interactive=False,
+                wrap=False
+            )
+
             gr.Markdown("""
 ### Tips:
 - **Check stderr logs** in HF Spaces for detailed debugging information
-- **…
+- **Results will update automatically** in the table above after evaluation completes
 - **Example models to test**: `openai-community/gpt2`, `EleutherAI/gpt-neo-1.3B`, `openai-community/gpt2-large`
 - **Lower perplexity scores = better performance** (better at predicting text)

@@ -162,13 +175,13 @@ with demo:
 1. Enter a model name from Hugging Face Hub
 2. Click "Run Perplexity Test"
 3. Wait for evaluation to complete (may take a few minutes for large models)
-4. …
+4. Results will appear automatically in the table above!
 """)

             test_button.click(
                 run_perplexity_test,
                 [model_name, revision, precision],
-                [result]
+                [result, live_results_table]
             )

 demo.queue(default_concurrency_limit=5).launch()
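The evaluation itself is delegated to `run_dynamic_perplexity_eval` in `src/evaluation/dynamic_eval.py`, which this commit does not touch. As background for the "lower perplexity scores = better performance" tip above, here is a rough sketch of how perplexity is commonly computed with `transformers`; it is a generic illustration, not the repository's implementation, and the sample text is arbitrary.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def compute_perplexity(model_name: str, text: str, revision: str = "main") -> float:
    """Perplexity = exp(mean negative log-likelihood of the tokens)."""
    tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
    model = AutoModelForCausalLM.from_pretrained(model_name, revision=revision)
    model.eval()

    enc = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        # Passing labels makes the model return the mean cross-entropy loss.
        out = model(**enc, labels=enc["input_ids"])
    return float(torch.exp(out.loss))


if __name__ == "__main__":
    print(compute_perplexity("openai-community/gpt2",
                             "The quick brown fox jumps over the lazy dog."))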
src/display/formatting.py
CHANGED
@@ -3,8 +3,8 @@ def model_hyperlink(link, model_name):
 
 
 def make_clickable_model(model_name):
-    …
-    return …
+    # Just return the plain model name without HTML formatting
+    return model_name
 
 
 def styled_error(error):
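The formatting.py change means table cells now contain the bare model name. The removed body is not captured in this view; in comparable leaderboard templates, `make_clickable_model` typically builds a Hub URL and wraps it with `model_hyperlink`, roughly as in the hypothetical sketch below (the URL construction and the `model_hyperlink` body are assumptions, not code from this repository). Plain `gr.DataFrame` cells display such markup as literal text by default, which would explain dropping the HTML.

def model_hyperlink(link, model_name):
    # Hypothetical helper body: render an HTML anchor for the model page.
    return f'<a target="_blank" href="{link}">{model_name}</a>'


def make_clickable_model(model_name):
    # Hypothetical pre-change behavior: link each model to its Hub page.
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)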