Ahmed Ahmed committed
Commit f02d36b · 1 Parent(s): 3a2ac99

no more dynamic updates

Files changed (2)
  1. app.py +50 -37
  2. src/display/formatting.py +2 -2
app.py CHANGED
@@ -1,5 +1,4 @@
  import gradio as gr
- from gradio_leaderboard import Leaderboard
  import pandas as pd
  from huggingface_hub import snapshot_download, create_repo
  from huggingface_hub.utils import RepositoryNotFoundError
@@ -21,24 +20,26 @@ from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN, OWNER
  from src.populate import get_leaderboard_df
  from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval
  
- def init_leaderboard(dataframe):
-     if dataframe is None:
-         raise ValueError("Leaderboard DataFrame is None.")
- 
-     print("\n=== Initializing Leaderboard ===", flush=True)
-     print(f"DataFrame shape: {dataframe.shape}", flush=True)
-     print(f"DataFrame columns: {dataframe.columns.tolist()}", flush=True)
- 
-     return Leaderboard(
-         value=dataframe,
-         select_columns=[c.name for c in fields(AutoEvalColumn) if not c.hidden],
-         search_columns=[AutoEvalColumn.model.name],
-         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-         filter_columns=[
-             AutoEvalColumn.model_type.name,
-             AutoEvalColumn.precision.name,
-         ],
-     )
+ def create_results_dataframe():
+     """Create and return the results DataFrame for display"""
+     df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+     if df is None or df.empty:
+         # Return empty DataFrame with proper columns
+         return pd.DataFrame(columns=["Model", "Perplexity", "Average Score", "Type", "Precision"])
+ 
+     # Select and rename columns for display
+     display_df = df[[
+         AutoEvalColumn.model.name,
+         "Perplexity",  # This matches the task column name from Tasks.task0.value.col_name
+         AutoEvalColumn.average.name,
+         AutoEvalColumn.model_type.name,
+         AutoEvalColumn.precision.name,
+     ]].copy()
+ 
+     # Rename columns for better display
+     display_df.columns = ["Model", "Perplexity", "Average Score", "Type", "Precision"]
+ 
+     return display_df
  
  def run_perplexity_test(model_name, revision, precision):
      """Run perplexity evaluation on demand."""
@@ -47,7 +48,7 @@ def run_perplexity_test(model_name, revision, precision):
      import gradio as gr
  
      if not model_name:
-         return "Please enter a model name."
+         return "Please enter a model name.", gr.update()
  
      try:
          # Use stderr for more reliable logging in HF Spaces
@@ -62,24 +63,22 @@ def run_perplexity_test(model_name, revision, precision):
          sys.stderr.flush()
  
          if success:
-             sys.stderr.write("Evaluation succeeded - results saved to dataset\n")
+             sys.stderr.write("Evaluation succeeded - updating results table\n")
              sys.stderr.flush()
  
-             return f"""✅ **Perplexity evaluation completed successfully!**
+             # Get updated results
+             updated_df = create_results_dataframe()
+ 
+             success_msg = f"""✅ **Perplexity evaluation completed successfully!**
  
  **Model**: {model_name}
  **Perplexity Score**: {result:.4f}
  
- 🎉 **Results have been saved to the dataset.**
- 
- 📋 **To see your results in the leaderboard:**
- 1. Click on the **🏅 Leaderboard** tab above
- 2. Refresh the page (Ctrl+R or Cmd+R)
- 3. Your model should now appear in the rankings!
- 
- 💡 **Note**: Due to technical limitations with the leaderboard component, results cannot be updated dynamically. The refresh is necessary to see the latest rankings."""
+ 🎉 **Results have been saved and the table below has been updated!**"""
+ 
+             return success_msg, gr.update(value=updated_df)
          else:
-             return f"❌ **Evaluation failed**: {result}"
+             return f"❌ **Evaluation failed**: {result}", gr.update()
  
      except Exception as e:
          error_msg = str(e)
@@ -87,7 +86,7 @@ def run_perplexity_test(model_name, revision, precision):
          sys.stderr.write(f"Critical error in run_perplexity_test: {error_msg}\n")
          sys.stderr.write(f"Traceback: {traceback_str}\n")
          sys.stderr.flush()
-         return f"❌ **Critical error**: {error_msg}"
+         return f"❌ **Critical error**: {error_msg}", gr.update()
  
  # Initialize results repository and directory
  try:
@@ -117,8 +116,8 @@ except Exception as e:
      # Ensure local directory exists even if repo operations fail
      os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
  
- # Get initial leaderboard data
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+ # Get initial results data
+ RESULTS_DF = create_results_dataframe()
  
  # Create the Gradio interface
  demo = gr.Blocks(css=custom_css)
@@ -127,8 +126,14 @@ with demo:
      gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
  
      with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 Leaderboard", elem_id="leaderboard-tab", id=0):
-             leaderboard = init_leaderboard(LEADERBOARD_DF)
+         with gr.TabItem("🏅 Results", elem_id="results-tab", id=0):
+             gr.Markdown("## Model Evaluation Results")
+             results_table = gr.DataFrame(
+                 value=RESULTS_DF,
+                 headers=["Model", "Perplexity", "Average Score", "Type", "Precision"],
+                 interactive=False,
+                 wrap=False
+             )
  
          with gr.TabItem("📝 About", elem_id="about-tab", id=1):
              gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -151,10 +156,18 @@ with demo:
              test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
              result = gr.Markdown()
  
+             gr.Markdown("## Live Results")
+             live_results_table = gr.DataFrame(
+                 value=RESULTS_DF,
+                 headers=["Model", "Perplexity", "Average Score", "Type", "Precision"],
+                 interactive=False,
+                 wrap=False
+             )
+ 
              gr.Markdown("""
  ### Tips:
  - **Check stderr logs** in HF Spaces for detailed debugging information
- - **After evaluation completes**, click the 🏅 Leaderboard tab and refresh the page to see results
+ - **Results will update automatically** in the table above after evaluation completes
  - **Example models to test**: `openai-community/gpt2`, `EleutherAI/gpt-neo-1.3B`, `openai-community/gpt2-large`
  - **Lower perplexity scores = better performance** (better at predicting text)
  
@@ -162,13 +175,13 @@ with demo:
  1. Enter a model name from Hugging Face Hub
  2. Click "Run Perplexity Test"
  3. Wait for evaluation to complete (may take a few minutes for large models)
- 4. Go to 🏅 Leaderboard tab and refresh the page to see your results!
+ 4. Results will appear automatically in the table above!
  """)
  
              test_button.click(
                  run_perplexity_test,
                  [model_name, revision, precision],
-                 [result]
+                 [result, live_results_table]
              )
  
  demo.queue(default_concurrency_limit=5).launch()
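For readers unfamiliar with the Gradio wiring used above: a click handler returns one value per output component, and `gr.update(value=...)` refreshes a component in place. The sketch below reproduces that pattern in isolation; the component names and the placeholder evaluation function are illustrative only, not code from this repo.

```python
# Minimal, self-contained sketch of the "status message + refreshed table" pattern
# that app.py now uses. Names and data here are placeholders, not repo code.
import gradio as gr
import pandas as pd

EMPTY = pd.DataFrame(columns=["Model", "Perplexity"])

def fake_eval(model_name):
    """Pretend to evaluate a model; return one value per output component."""
    if not model_name:
        # gr.update() with no arguments leaves the table unchanged.
        return "Please enter a model name.", gr.update()
    refreshed = pd.DataFrame({"Model": [model_name], "Perplexity": [12.34]})
    return f"Finished evaluating **{model_name}**", gr.update(value=refreshed)

with gr.Blocks() as demo:
    model_box = gr.Textbox(label="Model")
    run_btn = gr.Button("Run")
    status = gr.Markdown()
    table = gr.DataFrame(value=EMPTY, interactive=False)
    # Two outputs -> the handler returns a 2-tuple, mirroring
    # [result, live_results_table] in the diff above.
    run_btn.click(fake_eval, inputs=[model_box], outputs=[status, table])

if __name__ == "__main__":
    demo.launch()
```

Because the failure paths return a bare `gr.update()`, the table keeps its previous contents instead of being cleared when an evaluation does not produce new results.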
src/display/formatting.py CHANGED
@@ -3,8 +3,8 @@ def model_hyperlink(link, model_name):
  
  
  def make_clickable_model(model_name):
-     link = f"https://huggingface.co/{model_name}"
-     return model_hyperlink(link, model_name)
+     # Just return the plain model name without HTML formatting
+     return model_name
  
  
  def styled_error(error):
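The `formatting.py` change pairs with the switch to `gr.DataFrame`: with the default column datatype, the HTML anchor produced by `model_hyperlink` would show up as raw markup rather than a link, so the commit returns the bare model name. If clickable model names were still wanted, one possible alternative (not what this commit does, and assuming the installed Gradio version accepts a `"markdown"` column datatype) would be to emit Markdown links instead:

```python
# Hypothetical alternative, NOT part of this commit: keep model names clickable
# by emitting Markdown links and declaring the column's datatype as "markdown".
import gradio as gr
import pandas as pd

def make_clickable_model(model_name: str) -> str:
    # Markdown link instead of the removed HTML <a> tag.
    return f"[{model_name}](https://huggingface.co/{model_name})"

# Placeholder row purely for demonstration.
df = pd.DataFrame({
    "Model": [make_clickable_model("openai-community/gpt2")],
    "Perplexity": [12.34],
})

with gr.Blocks() as demo:
    gr.DataFrame(
        value=df,
        # Assumes "markdown" is an accepted per-column datatype in the Gradio version in use.
        datatype=["markdown", "number"],
        interactive=False,
    )

if __name__ == "__main__":
    demo.launch()
```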