Enderchef committed
Commit ca30b1d · verified · 1 Parent(s): cda939c

Update app.py

Files changed (1)
  1. app.py +124 -38
app.py CHANGED
@@ -26,13 +26,12 @@ def get_all_benchmark_options():
     and a flattened list suitable for a Gradio dropdown.
     """
     all_options = {}
-    gr_dropdown_options = []
+    gr_dropdown_options = [] # This is for initial display only, not used for dynamic updates directly
 
     # Get subjects for MMLU
     try:
         mmlu_subjects = get_dataset_config_names(MMLU_DATASET, token=HF_TOKEN)
         all_options[MMLU_DATASET] = ["ALL"] + mmlu_subjects
-        gr_dropdown_options.extend([f"MMLU - {s}" for s in all_options[MMLU_DATASET]])
     except Exception as e:
         print(f"Warning: Could not load MMLU dataset configs. Error: {e}")
         all_options[MMLU_DATASET] = []
@@ -41,15 +40,19 @@ def get_all_benchmark_options():
     try:
         mmlu_pro_subjects = get_dataset_config_names(MMLU_PRO_DATASET, token=HF_TOKEN)
         all_options[MMLU_PRO_DATASET] = ["ALL"] + mmlu_pro_subjects
-        gr_dropdown_options.extend([f"MMLU-Pro - {s}" for s in all_options[MMLU_PRO_DATASET]])
     except Exception as e:
         print(f"Warning: Could not load MMLU-Pro dataset configs. It might not be accessible or available. Error: {e}")
         all_options[MMLU_PRO_DATASET] = []
 
+    # Flattened list for the initial state of the subject dropdown (e.g., MMLU subjects)
+    if MMLU_DATASET in all_options:
+        gr_dropdown_options.extend(all_options[MMLU_DATASET])
+
+
     return all_options, gr_dropdown_options
 
 # Initialize these once globally when the app starts
-ALL_BENCHMARK_SUBJECTS, GRADIO_DROPDOWN_OPTIONS = get_all_benchmark_options()
+ALL_BENCHMARK_SUBJECTS, INITIAL_GRADIO_DROPDOWN_OPTIONS = get_all_benchmark_options()
 
 @spaces.GPU() # Decorator to ensure this function runs on GPU if available
 def load_model(model_id):
@@ -186,7 +189,7 @@ def evaluate_single_subject(generator, dataset_id, subject, sample_count, progre
     return accuracy, subject_results
 
 @spaces.GPU() # Decorator to ensure this function runs on GPU if available
-def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=gr.Progress()):
+def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress()):
     """
     Main function to orchestrate the evaluation process.
     Handles single subject or 'ALL' subjects evaluation for MMLU/MMLU-Pro.
@@ -198,25 +201,15 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
         # Return updates to hide logs/debug and show empty results
         return "", gr.update(value="", visible=False), gr.update(visible=False), \
                gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
-
-    # Parse the selected benchmark and subject from the dropdown string
-    parts = selected_benchmark_subject.split(" - ")
-    if len(parts) != 2:
-        gr.Error("Invalid benchmark selection format. Please select from the dropdown.")
-        return "", gr.update(value="", visible=False), gr.update(visible=False), \
-               gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
 
-    benchmark_name = parts[0]
-    subject_name = parts[1]
-
     dataset_id_map = {
         "MMLU": MMLU_DATASET,
         "MMLU-Pro": MMLU_PRO_DATASET
     }
-    current_dataset_id = dataset_id_map.get(benchmark_name)
+    current_dataset_id = dataset_id_map.get(benchmark_category)
 
     if not current_dataset_id:
-        gr.Error(f"Unknown benchmark selected: {benchmark_name}. This should not happen.")
+        gr.Error(f"Unknown benchmark category selected: {benchmark_category}. This should not happen.")
         return "", gr.update(value="", visible=False), gr.update(visible=False), \
                gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
 
@@ -234,12 +227,12 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
             subjects_to_evaluate.remove("ALL")
 
         if not subjects_to_evaluate:
-            gr.Warning(f"No subjects found to evaluate for '{benchmark_name}'.")
+            gr.Warning(f"No subjects found to evaluate for '{benchmark_category}'.")
            return "", gr.update(value="", visible=False), gr.update(visible=False), \
                   gr.update(visible=False), gr.update(visible=False), gr.update(value="", visible=False)
 
-        for i, sub in enumerate(progress.tqdm(subjects_to_evaluate, desc=f"Evaluating ALL {benchmark_name} subjects")):
-            gr.Info(f"Evaluating {benchmark_name} - {sub} ({i+1}/{len(subjects_to_evaluate)})...")
+        for i, sub in enumerate(progress.tqdm(subjects_to_evaluate, desc=f"Evaluating ALL {benchmark_category} subjects")):
+            gr.Info(f"Evaluating {benchmark_category} - {sub} ({i+1}/{len(subjects_to_evaluate)})...")
             try:
                 accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, sub, sample_count, progress)
                 all_evaluation_results.extend(subject_details)
@@ -249,14 +242,14 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
 
                 total_correct_overall += num_correct_in_subject
                 total_samples_overall += num_evaluated_samples
-                eval_summary_lines.append(f"- {benchmark_name} - {sub}: {accuracy:.2f}% ({num_correct_in_subject}/{num_evaluated_samples} samples)")
+                eval_summary_lines.append(f"- {benchmark_category} - {sub}: {accuracy:.2f}% ({num_correct_in_subject}/{num_evaluated_samples} samples)")
             except Exception as e:
-                gr.Error(f"Skipping {benchmark_name} - {sub} due to an error: {e}")
-                eval_summary_lines.append(f"- {benchmark_name} - {sub}: Error during evaluation.")
+                gr.Error(f"Skipping {benchmark_category} - {sub} due to an error: {e}")
+                eval_summary_lines.append(f"- {benchmark_category} - {sub}: Error during evaluation.")
                 continue
 
         overall_accuracy = (total_correct_overall / total_samples_overall) * 100 if total_samples_overall > 0 else 0
-        score_string = f"Overall Average Accuracy for {benchmark_name}: {overall_accuracy:.2f}% across {total_samples_overall} total samples.\n\n"
+        score_string = f"Overall Average Accuracy for {benchmark_category}: {overall_accuracy:.2f}% across {total_samples_overall} total samples.\n\n"
         score_string += "Detailed breakdown:\n" + "\n".join(eval_summary_lines)
 
     else:
@@ -264,7 +257,7 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
         all_evaluation_results.extend(subject_details)
         overall_accuracy = accuracy
         num_evaluated_samples = len(subject_details)
-        score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."
+        score_string = f"Accuracy for {benchmark_category} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."
 
     # Format detailed results for display in the text box
     formatted_details = "\n\n".join([
@@ -283,7 +276,7 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
     # Record the evaluation result to a JSONL file for the leaderboard
     record = {
         "model_id": model_id,
-        "benchmark": benchmark_name,
+        "benchmark": benchmark_category,
        "subject": subject_name,
        "accuracy": overall_accuracy,
        "sample_count": total_samples_overall if subject_name == "ALL" else len(all_evaluation_results),
@@ -360,6 +353,24 @@ def load_leaderboard(benchmark_filter):
         traceback.print_exc() # Print full traceback for debugging
         return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
 
+def update_subject_dropdown_choices(benchmark_category):
+    """
+    Updates the choices for the subject dropdown based on the selected benchmark category.
+    """
+    dataset_id_map = {
+        "MMLU": MMLU_DATASET,
+        "MMLU-Pro": MMLU_PRO_DATASET
+    }
+    selected_dataset_id = dataset_id_map.get(benchmark_category)
+
+    if selected_dataset_id and selected_dataset_id in ALL_BENCHMARK_SUBJECTS:
+        new_choices = ALL_BENCHMARK_SUBJECTS[selected_dataset_id]
+        # Set default value to "ALL" if available, otherwise the first subject
+        default_value = "ALL" if "ALL" in new_choices else (new_choices[0] if new_choices else None)
+        return gr.update(choices=new_choices, value=default_value)
+    else:
+        return gr.update(choices=[], value=None)
+
 
 # --- Gradio Interface Definition ---
 with gr.Blocks(css="""
@@ -564,12 +575,69 @@ with gr.Blocks(css="""
        border-bottom-right-radius: 12px;
    }
 
-   /* Horizontal line for separation */
-   hr {
-       border: none;
-       border-top: 1px solid #e2e8f0;
-       margin: 30px 0;
+   /* Radio button group for leaderboard */
+   #leaderboard-toggle.gr-form {
+       display: flex;
+       justify-content: center;
+       padding: 0px 0px 20px 0px; /* Reduced padding for more compact look */
+   }
+   #leaderboard-toggle label.gr-radio-label {
+       font-size: 1.1em;
+       font-weight: 600;
+       color: #2d3748;
+       padding: 10px 20px;
+       border-radius: 8px;
+       background-color: #edf2f7; /* Light background for unselected */
+       border: 1px solid #e2e8f0;
+       cursor: pointer;
+       transition: all 0.3s ease;
+       margin: 0 5px; /* Spacing between radio buttons */
+   }
+   #leaderboard-toggle input[type="radio"]:checked + label.gr-radio-label {
+       background-color: #2f80ed; /* Blue for selected */
+       color: white;
+       border-color: #2f80ed;
+       box-shadow: 0 3px 10px rgba(47, 128, 237, 0.3);
+   }
+   #leaderboard-toggle input[type="radio"]:checked + label.gr-radio-label:hover {
+       background-color: #1a6dcd; /* Darker blue on hover */
+   }
+   #leaderboard-toggle label.gr-radio-label:hover {
+       background-color: #e2e8f0; /* Lighter grey on hover */
+   }
+
+   /* Radio button group for evaluation benchmark selection */
+   #eval-benchmark-selection {
+       display: flex;
+       justify-content: center;
+       margin-bottom: 20px; /* Space above dropdown */
+   }
+   #eval-benchmark-selection label.gr-radio-label {
+       font-size: 1.05em;
+       font-weight: 500;
+       color: #4a5568;
+       padding: 8px 15px;
+       border-radius: 6px;
+       background-color: #f0f4f7;
+       border: 1px solid #d9e3ed;
+       cursor: pointer;
+       transition: all 0.3s ease;
+       margin: 0 5px;
+   }
+   #eval-benchmark-selection input[type="radio"]:checked + label.gr-radio-label {
+       background-color: #48bb78; /* A pleasant green for evaluation selection */
+       color: white;
+       border-color: #48bb78;
+       box-shadow: 0 2px 8px rgba(72, 187, 120, 0.2);
    }
+   #eval-benchmark-selection input[type="radio"]:checked + label.gr-radio-label:hover {
+       background-color: #38a169;
+   }
+   #eval-benchmark-selection label.gr-radio-label:hover {
+       background-color: #e5edf2;
+   }
+
+
 """) as demo:
     gr.Markdown("""
     # 🤖 LLM Benchmark Evaluator
@@ -592,19 +660,30 @@ with gr.Blocks(css="""
                 placeholder="e.g., mistralai/Mistral-7B-Instruct-v0.2",
                 interactive=True
             )
+
+            # New Radio button for benchmark selection for evaluation
+            benchmark_selection_radio = gr.Radio(
+                ["MMLU", "MMLU-Pro"],
+                label="Select Benchmark Type",
+                value="MMLU", # Default selection
+                interactive=True,
+                container=False, # Important for custom styling placement
+                elem_id="eval-benchmark-selection"
+            )
+
            with gr.Row():
                benchmark_subject_dropdown = gr.Dropdown(
-                   label="Choose Benchmark and Subject",
-                   choices=GRADIO_DROPDOWN_OPTIONS,
-                   value="MMLU - ALL", # Default to MMLU ALL for initial load
+                   label="Choose Subject", # Label changed to be more concise
+                   choices=INITIAL_GRADIO_DROPDOWN_OPTIONS, # Initial choices (MMLU subjects)
+                   value="ALL", # Default to ALL for MMLU initially
                    interactive=True,
-                   min_width=400 # Ensure sufficient width
+                   min_width=400
                )
                sample_count_slider = gr.Slider(
                    label="Number of Samples per Subject (1-100)",
                    minimum=1,
                    maximum=100,
-                   value=10, # Default to 10 samples
+                   value=10,
                    step=1,
                    interactive=True,
                    min_width=200
@@ -648,7 +727,7 @@ with gr.Blocks(css="""
     # Define button click actions
     run_button.click(
         run_evaluation,
-        inputs=[model_id_input, benchmark_subject_dropdown, sample_count_slider],
+        inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider], # Updated inputs
         outputs=[
             acc_output,
             error_message_output, debug_error_column, # For error state
@@ -656,6 +735,13 @@ with gr.Blocks(css="""
         ]
     )
 
+    # Link benchmark selection radio to subject dropdown
+    benchmark_selection_radio.change(
+        update_subject_dropdown_choices,
+        inputs=[benchmark_selection_radio],
+        outputs=[benchmark_subject_dropdown]
+    )
+
    # Toggle visibility of detail_output
    show_details_button.click(
        lambda s: gr.update(visible=not s), # Toggle visibility
@@ -722,4 +808,4 @@ with gr.Blocks(css="""
     leaderboard_type_toggle.change(load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output])
 
 # Launch the Gradio app
-demo.launch()
+demo.launch()
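
The functional core of this commit is the dynamic-dropdown pattern: a gr.Radio picks the benchmark family, its .change event rewrites the subject gr.Dropdown choices via update_subject_dropdown_choices, and both values are then passed straight into run_evaluation. Below is a minimal, self-contained sketch of that wiring; the placeholder SUBJECTS dict and the stub run_evaluation are illustrative stand-ins for the Space's real get_dataset_config_names() lookups and GPU-backed evaluation, not the actual app code.

```python
import gradio as gr

# Placeholder subject lists -- the real app builds these from
# get_dataset_config_names() at startup (hypothetical values here).
SUBJECTS = {
    "MMLU": ["ALL", "abstract_algebra", "anatomy"],
    "MMLU-Pro": ["ALL", "biology", "business"],
}

def update_subject_dropdown_choices(benchmark_category):
    """Swap the dropdown's choices whenever the radio selection changes."""
    choices = SUBJECTS.get(benchmark_category, [])
    default = "ALL" if "ALL" in choices else (choices[0] if choices else None)
    return gr.update(choices=choices, value=default)

def run_evaluation(model_id, benchmark_category, subject_name, sample_count):
    """Stub standing in for the real GPU-backed evaluation."""
    return f"Would evaluate {model_id} on {benchmark_category} - {subject_name} ({sample_count} samples)."

with gr.Blocks() as demo:
    model_id_input = gr.Textbox(label="Model ID")
    benchmark_selection_radio = gr.Radio(["MMLU", "MMLU-Pro"], value="MMLU", label="Select Benchmark Type")
    benchmark_subject_dropdown = gr.Dropdown(choices=SUBJECTS["MMLU"], value="ALL", label="Choose Subject")
    sample_count_slider = gr.Slider(1, 100, value=10, step=1, label="Samples per Subject")
    run_button = gr.Button("Run")
    acc_output = gr.Textbox(label="Result")

    # The radio drives the dropdown's choices...
    benchmark_selection_radio.change(
        update_subject_dropdown_choices,
        inputs=[benchmark_selection_radio],
        outputs=[benchmark_subject_dropdown],
    )
    # ...and both widgets feed the evaluation function as separate inputs.
    run_button.click(
        run_evaluation,
        inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider],
        outputs=[acc_output],
    )

demo.launch()
```

Passing the radio and dropdown as separate inputs is what lets the commit drop the old string-splitting of "MMLU - subject" dropdown labels, which is why run_evaluation now takes benchmark_category and subject_name directly.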