Enderchef committed on
Commit 566e353 · verified · 1 Parent(s): ff1ae6f

Update app.py

Files changed (1)
  1. app.py +186 -97
app.py CHANGED
@@ -11,7 +11,6 @@ import spaces
11
  from datetime import datetime
12
 
13
  # --- Environment and Caching ---
14
-
15
  # It's good practice to ensure the cache directory exists.
16
  CACHE_DIR = "evaluation_cache"
17
  os.makedirs(CACHE_DIR, exist_ok=True)
@@ -26,14 +25,14 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
26
 
27
  # --- Constants for Benchmarks ---
28
  MMLU_DATASET = "cais/mmlu"
29
- MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"
 
30
  BENCHMARK_MAP = {
31
  "MMLU": MMLU_DATASET,
32
- "MMLU-Pro": MMLU_PRO_DATASET
33
  }
34
 
35
  # --- Data Loading and Preparation ---
36
-
37
  def get_all_benchmark_options():
38
  """
39
  Fetches and caches the available subjects (configs) for each benchmark dataset.
@@ -41,8 +40,9 @@ def get_all_benchmark_options():
41
  """
42
  if benchmark_subject_cache:
43
  return benchmark_subject_cache
44
-
45
  print("Fetching benchmark configurations for the first time...")
 
 
46
  for key, dataset_id in BENCHMARK_MAP.items():
47
  try:
48
  # Fetching dataset configurations requires authentication if the dataset is private
@@ -57,7 +57,6 @@ def get_all_benchmark_options():
57
  # Initialize the cache on startup
58
  ALL_BENCHMARK_SUBJECTS = get_all_benchmark_options()
59
 
60
-
61
  @spaces.GPU()
62
  def load_model(model_id):
63
  """
@@ -66,16 +65,14 @@ def load_model(model_id):
66
  """
67
  if not model_id:
68
  raise ValueError("Model ID cannot be empty.")
69
-
70
- gr.Info(f"Attempting to load model: {model_id}...")
71
  if model_id in model_cache:
72
  gr.Info(f"Model '{model_id}' found in cache.")
73
  return model_cache[model_id]
74
-
75
  try:
76
  # Use bfloat16 for better performance on modern GPUs
77
  dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
78
-
79
  tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
80
  model = AutoModelForCausalLM.from_pretrained(
81
  model_id,
@@ -84,7 +81,7 @@ def load_model(model_id):
84
  trust_remote_code=True,
85
  low_cpu_mem_usage=True, # Optimization for large models
86
  ).to("cuda" if torch.cuda.is_available() else "cpu")
87
-
88
  # Create the pipeline for text generation
89
  generator = pipeline(
90
  "text-generation",
@@ -92,7 +89,7 @@ def load_model(model_id):
92
  tokenizer=tokenizer,
93
  device=0 if torch.cuda.is_available() else -1
94
  )
95
-
96
  model_cache[model_id] = generator
97
  gr.Info(f"Model '{model_id}' loaded successfully.")
98
  return generator
@@ -100,9 +97,7 @@ def load_model(model_id):
100
  # Raise a more specific error to be caught by the main evaluation function
101
  raise RuntimeError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token (if required). Error: {e}")
102
 
103
-
104
  # --- Evaluation Logic ---
105
-
106
  def format_prompt(item):
107
  """Formats the MMLU question and choices into a standardized prompt."""
108
  prompt = f"Question: {item['question']}\n\nChoices:\nA. {item['choices'][0]}\nB. {item['choices'][1]}\nC. {item['choices'][2]}\nD. {item['choices'][3]}\n\nAnswer:"
@@ -121,12 +116,11 @@ def extract_predicted_letter(output_text):
121
  match = re.search(r"Answer:\s*([ABCD])", output_text.strip(), re.IGNORECASE)
122
  if match:
123
  return match.group(1).upper()
124
-
125
  # Fallback: if the model just outputs a letter
126
  match = re.search(r"^\s*([ABCD])\b", output_text.strip())
127
  if match:
128
  return match.group(1).upper()
129
-
130
  return None
131
 
132
  def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
@@ -150,23 +144,22 @@ def evaluate_single_subject(generator, dataset_id, subject, sample_count, progre
150
  for item in progress.tqdm(dataset, desc=f"Evaluating {subject}"):
151
  prompt, correct_answer_idx = format_prompt(item)
152
  expected_letter = get_choice_letter(correct_answer_idx)
153
-
154
  # The generated text is often just after the prompt. We need to slice it.
155
  full_prompt_text = generator.tokenizer.decode(generator.tokenizer.encode(prompt), skip_special_tokens=True)
156
-
157
  # Generate a short response, aiming for a single letter answer.
158
  # do_sample=False (greedy decoding) is crucial for reproducibility.
159
  raw_output = generator(prompt, max_new_tokens=5, do_sample=False, pad_token_id=generator.tokenizer.eos_token_id)[0]["generated_text"]
160
-
161
  # Isolate the newly generated part
162
  generated_text_only = raw_output[len(full_prompt_text):].strip()
163
-
164
  predicted_letter = extract_predicted_letter(generated_text_only)
165
  is_correct = (predicted_letter == expected_letter)
166
-
167
  if is_correct:
168
  correct_predictions += 1
169
-
170
  results_details.append({
171
  "Question": item['question'],
172
  "Correct": "✅" if is_correct else "❌",
@@ -174,11 +167,9 @@ def evaluate_single_subject(generator, dataset_id, subject, sample_count, progre
174
  "Predicted": predicted_letter or "N/A",
175
  "Model Output": generated_text_only
176
  })
177
-
178
  accuracy = (correct_predictions / num_samples) * 100 if num_samples > 0 else 0
179
  return accuracy, results_details
180
 
181
-
182
  @spaces.GPU()
183
  def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress(track_tqdm=True)):
184
  """
@@ -189,7 +180,7 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
189
  try:
190
  gr.Info("Starting evaluation...")
191
  generator = load_model(model_id)
192
-
193
  dataset_id = BENCHMARK_MAP.get(benchmark_category)
194
  if not dataset_id:
195
  raise ValueError(f"Invalid benchmark category: {benchmark_category}")
@@ -198,7 +189,7 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
198
  summary_lines = []
199
  total_correct = 0
200
  total_samples = 0
201
-
202
  subjects_to_run = []
203
  if subject_name == "ALL":
204
  # Exclude the "ALL" placeholder from the list of subjects to run
@@ -219,23 +210,22 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
219
  gr.Info(f"Evaluating {benchmark_category} - {subject} ({i+1}/{len(subjects_to_run)})...")
220
  try:
221
  accuracy, subject_details = evaluate_single_subject(generator, dataset_id, subject, sample_count, progress)
222
-
223
  all_results_details.extend(subject_details)
224
  num_correct = sum(1 for d in subject_details if d['Correct'] == "✅")
225
  num_evaluated = len(subject_details)
226
-
227
  total_correct += num_correct
228
  total_samples += num_evaluated
229
  summary_lines.append(f"- **{subject}**: {accuracy:.2f}% ({num_correct}/{num_evaluated})")
230
-
231
  except Exception as e:
232
  error_trace = traceback.format_exc()
233
  gr.Error(f"Skipping {subject} due to an error: {e}")
234
  summary_lines.append(f"- **{subject}**: Evaluation failed. See logs for details:\n```\n{error_trace}\n```")
235
  continue
236
-
237
  overall_accuracy = (total_correct / total_samples) * 100 if total_samples > 0 else 0
238
-
239
  # --- Prepare Outputs ---
240
  if subject_name == "ALL":
241
  result_summary = f"### Overall Average Accuracy: {overall_accuracy:.2f}%\n"
@@ -244,7 +234,7 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
244
  else:
245
  result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
246
  result_summary += f"({total_correct:,}/{total_samples:,} correct)"
247
-
248
  # Save results for leaderboard
249
  record = {
250
  "model_id": model_id,
@@ -256,11 +246,11 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
256
  }
257
  with open(EVAL_FILE, "a") as f:
258
  f.write(json.dumps(record) + "\n")
259
-
260
  gr.Info("Evaluation completed successfully!")
261
-
262
  df_details = pd.DataFrame(all_results_details)
263
-
264
  # Return a dictionary of component updates
265
  return {
266
  result_summary_output: gr.update(value=result_summary, visible=True),
@@ -268,12 +258,11 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
268
  details_box: gr.update(visible=True),
269
  detailed_results_df: gr.update(value=df_details)
270
  }
271
-
272
  except Exception as e:
273
  error_message = f"An unexpected error occurred during setup: {e}"
274
  error_details = traceback.format_exc()
275
  gr.Error(error_message)
276
-
277
  return {
278
  result_summary_output: gr.update(visible=False),
279
  error_box: gr.update(visible=True),
@@ -282,9 +271,7 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
282
  details_box: gr.update(visible=False)
283
  }
284
 
285
-
286
  # --- UI Helper Functions ---
287
-
288
  def update_subject_dropdown(benchmark_category):
289
  """Updates the subject dropdown choices based on the selected benchmark."""
290
  choices = ALL_BENCHMARK_SUBJECTS.get(benchmark_category, [])
@@ -300,7 +287,7 @@ def load_leaderboard(benchmark_filter, progress=gr.Progress()):
300
  try:
301
  if not os.path.exists(EVAL_FILE):
302
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
303
-
304
  df = pd.read_json(EVAL_FILE, lines=True)
305
  if df.empty:
306
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
@@ -308,22 +295,21 @@ def load_leaderboard(benchmark_filter, progress=gr.Progress()):
308
  # Coerce accuracy to numeric and filter valid entries
309
  df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
310
  df.dropna(subset=['accuracy'], inplace=True)
311
-
312
  # Filter by the selected benchmark (e.g., MMLU or MMLU-Pro)
313
  df_filtered = df[(df['benchmark'] == benchmark_filter) & (df['subject'] == 'ALL')].copy()
314
-
315
  if df_filtered.empty:
316
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
317
 
318
  # Find the latest evaluation for each model
319
  df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])
320
  latest_evals = df_filtered.loc[df_filtered.groupby('model_id')['timestamp'].idxmax()].copy()
321
-
322
  leaderboard_df = latest_evals.sort_values(by="accuracy", ascending=False).copy()
323
-
324
  # Add Rank
325
  leaderboard_df.insert(0, 'Rank', range(1, len(leaderboard_df) + 1))
326
-
327
  # Rename and format columns
328
  leaderboard_df.rename(columns={
329
  'model_id': 'Model ID',
@@ -331,67 +317,169 @@ def load_leaderboard(benchmark_filter, progress=gr.Progress()):
331
  'sample_count': 'Total Samples',
332
  'timestamp': 'Date'
333
  }, inplace=True)
334
-
335
  leaderboard_df['Avg. Accuracy (%)'] = leaderboard_df['Avg. Accuracy (%)'].map('{:.2f}'.format)
336
  leaderboard_df['Date'] = leaderboard_df['Date'].dt.strftime('%Y-%m-%d')
337
-
338
  progress(1, desc="Done.")
339
  return leaderboard_df[['Rank', 'Model ID', 'Avg. Accuracy (%)', 'Total Samples', 'Date']]
340
-
341
  except Exception as e:
342
  gr.Error(f"Error loading leaderboard: {e}")
343
  traceback.print_exc()
344
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
345
 
346
-
347
  # --- Gradio Interface Definition ---
348
-
349
- with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css="""
350
- /* --- Global & Layout --- */
351
- body { font-family: 'Inter', sans-serif; background-color: #f8f9fa; }
352
- .gradio-container { max-width: 1280px !important; margin: auto; }
353
- .gr-group { border-radius: 12px !important; box-shadow: 0 4px 12px rgba(0,0,0,0.05) !important; border: 1px solid #e9ecef !important; background-color: white; }
354
 
355
- /* --- Typography --- */
356
- h1 { text-align: center; font-size: 2.5rem !important; font-weight: 800; color: #212529; margin-bottom: 0.5rem; letter-spacing: -1.5px; }
357
- .subtitle { text-align: center; color: #6c757d; font-size: 1.1rem; margin-bottom: 2.5rem; max-width: 800px; margin-left: auto; margin-right: auto;}
358
-
359
- /* --- Buttons & Inputs --- */
360
- .gr-button { font-weight: 600 !important; transition: all 0.2s ease; }
361
- .gr-button-primary { box-shadow: 0 4px 10px rgba(59, 130, 246, 0.2); }
362
- .gr-button-primary:hover { transform: translateY(-2px); box-shadow: 0 6px 15px rgba(59, 130, 246, 0.3); }
363
 
364
  /* --- Custom Radio Buttons (Segmented Control) --- */
365
  #leaderboard-toggle-group { display: flex; justify-content: center; align-items: center; gap: 1rem; margin-bottom: 1.5rem; }
366
- #leaderboard-toggle { background-color: #e9ecef; padding: 5px; border-radius: 10px; display: inline-flex; }
367
  #leaderboard-toggle div.gr-form { display: flex; gap: 5px; }
368
  #leaderboard-toggle input[type='radio'] { display: none; }
369
- #leaderboard-toggle label { padding: 8px 16px; border-radius: 8px; cursor: pointer; transition: all 0.3s ease; font-weight: 500; color: #495057; background: transparent; border: none; box-shadow: none; }
370
- #leaderboard-toggle input[type='radio']:checked + label { background-color: white; color: #0d6efd; font-weight: 600; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
371
-
372
  /* --- Dataframe / Table Styling --- */
373
  .leaderboard-table .gr-dataframe table { border-collapse: collapse; width: 100%; }
374
- .leaderboard-table .gr-dataframe thead th { background-color: #f8f9fa !important; color: #495057 !important; font-weight: 600 !important; text-align: left; padding: 12px 15px; border-bottom: 2px solid #dee2e6; }
375
- .leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) { background-color: #fdfdff; }
376
- .leaderboard-table .gr-dataframe tbody tr:hover { background-color: #f0f6ff; }
377
- .leaderboard-table .gr-dataframe tbody td { padding: 12px 15px; border-bottom: 1px solid #e9ecef; }
378
- .leaderboard-table .gr-dataframe tbody td:first-child { font-weight: 700; color: #495057; }
379
-
380
 
381
  /* --- Error & Result Panes --- */
382
- #error-display-box { background-color: #fff3f3 !important; border-color: #ffc9c9 !important; }
383
- #result-summary-box { background-color: #f3f9ff !important; border-color: #cde4ff !important; }
384
- """) as demo:
385
  gr.Markdown("<h1>🏆 Open LLM Evaluator</h1>")
386
- gr.Markdown("<p class='subtitle'>Benchmark leading models on MMLU and MMLU-Pro. Your results contribute to a live leaderboard. Select a benchmark and run an evaluation, or view the current standings.</p>")
387
-
388
  with gr.Tabs() as tabs:
389
  # --- Leaderboard Tab ---
390
  with gr.TabItem("📊 Leaderboard", id=0):
391
  with gr.Column():
392
  with gr.Row(elem_id="leaderboard-toggle-group"):
 
393
  leaderboard_type_toggle = gr.Radio(
394
- ["MMLU", "MMLU-Pro"],
395
  label="Select Benchmark",
396
  value="MMLU",
397
  interactive=True,
@@ -400,15 +488,15 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
400
  show_label=False,
401
  )
402
  refresh_button = gr.Button("🔄 Refresh", size="sm")
403
-
404
  leaderboard_table_output = gr.DataFrame(
405
  headers=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"],
406
  interactive=False,
407
  datatype=["number", "str", "str", "number", "str"],
408
- row_count=15,
409
- elem_classes="leaderboard-table"
 
410
  )
411
-
412
  # --- Evaluation Tab ---
413
  with gr.TabItem("🚀 Run Evaluation", id=1):
414
  with gr.Row(variant='panel'):
@@ -418,10 +506,12 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
418
  model_id_input = gr.Textbox(
419
  label="Hugging Face Model ID",
420
  placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
421
- interactive=True
 
422
  )
 
423
  benchmark_selection_radio = gr.Radio(
424
- ["MMLU", "MMLU-Pro"],
425
  label="Benchmark",
426
  value="MMLU",
427
  interactive=True,
@@ -429,7 +519,8 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
429
  with gr.Row():
430
  benchmark_subject_dropdown = gr.Dropdown(
431
  label="Subject",
432
- choices=ALL_BENCHMARK_SUBJECTS.get("MMLU", []),
 
433
  value="ALL",
434
  interactive=True
435
  )
@@ -437,21 +528,20 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
437
  label="Samples per Subject",
438
  minimum=5, maximum=100, value=25, step=5, interactive=True
439
  )
440
-
441
  run_button = gr.Button("Start Evaluation", variant="primary", scale=1)
442
-
443
  with gr.Column(scale=3):
444
  gr.Markdown("### 2. View Results")
445
-
446
  # Panel for displaying the summary of results
447
  with gr.Group(visible=False) as result_summary_box:
448
  result_summary_output = gr.Markdown(elem_id="result-summary-box")
449
-
450
  # Panel for displaying errors
451
  with gr.Group(visible=False) as error_box:
452
  error_output = gr.Textbox(label="Error Message", interactive=False, elem_id="error-display-box")
453
  error_details_output = gr.Textbox(label="Error Details (Traceback)", interactive=False, lines=8)
454
-
455
  # Panel for detailed, row-by-row results
456
  with gr.Group(visible=False) as details_box:
457
  gr.Markdown("#### Detailed Evaluation Log")
@@ -459,20 +549,19 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
459
  headers=["Question", "Correct", "Expected", "Predicted", "Model Output"],
460
  datatype=["str", "str", "str", "str", "str"],
461
  interactive=False,
462
- row_count=10,
463
- col_count=5,
464
  wrap=True,
465
  )
466
 
467
- # --- Event Handlers & Logic ---
468
-
469
  # Update subject dropdown when benchmark type changes
470
  benchmark_selection_radio.change(
471
  fn=update_subject_dropdown,
472
  inputs=[benchmark_selection_radio],
473
  outputs=[benchmark_subject_dropdown]
474
  )
475
-
476
  # Main evaluation trigger
477
  run_button.click(
478
  fn=run_evaluation,
@@ -506,4 +595,4 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), cs
506
 
507
  # Launch the Gradio app
508
  if __name__ == "__main__":
509
- demo.launch(debug=True)
 
11
  from datetime import datetime
12
 
13
  # --- Environment and Caching ---
 
14
  # It's good practice to ensure the cache directory exists.
15
  CACHE_DIR = "evaluation_cache"
16
  os.makedirs(CACHE_DIR, exist_ok=True)
 
25
 
26
  # --- Constants for Benchmarks ---
27
  MMLU_DATASET = "cais/mmlu"
28
+ # Temporarily remove MMLU-Pro references
29
+ # MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"
30
  BENCHMARK_MAP = {
31
  "MMLU": MMLU_DATASET,
32
+ # "MMLU-Pro": MMLU_PRO_DATASET # Temporarily removed
33
  }
34
 
35
  # --- Data Loading and Preparation ---
 
36
  def get_all_benchmark_options():
37
  """
38
  Fetches and caches the available subjects (configs) for each benchmark dataset.
 
40
  """
41
  if benchmark_subject_cache:
42
  return benchmark_subject_cache
 
43
  print("Fetching benchmark configurations for the first time...")
44
+
45
+ # Only iterate over the allowed benchmarks (MMLU)
46
  for key, dataset_id in BENCHMARK_MAP.items():
47
  try:
48
  # Fetching dataset configurations requires authentication if the dataset is private
 
57
  # Initialize the cache on startup
58
  ALL_BENCHMARK_SUBJECTS = get_all_benchmark_options()
59
 
 
60
  @spaces.GPU()
61
  def load_model(model_id):
62
  """
 
65
  """
66
  if not model_id:
67
  raise ValueError("Model ID cannot be empty.")
68
+ gr.Info(f"Attempting to load model: {model_id}...")
 
69
  if model_id in model_cache:
70
  gr.Info(f"Model '{model_id}' found in cache.")
71
  return model_cache[model_id]
 
72
  try:
73
  # Use bfloat16 for better performance on modern GPUs
74
  dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
75
+
76
  tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
77
  model = AutoModelForCausalLM.from_pretrained(
78
  model_id,
 
81
  trust_remote_code=True,
82
  low_cpu_mem_usage=True, # Optimization for large models
83
  ).to("cuda" if torch.cuda.is_available() else "cpu")
84
+
85
  # Create the pipeline for text generation
86
  generator = pipeline(
87
  "text-generation",
 
89
  tokenizer=tokenizer,
90
  device=0 if torch.cuda.is_available() else -1
91
  )
92
+
93
  model_cache[model_id] = generator
94
  gr.Info(f"Model '{model_id}' loaded successfully.")
95
  return generator
 
97
  # Raise a more specific error to be caught by the main evaluation function
98
  raise RuntimeError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token (if required). Error: {e}")
99
 
 
100
  # --- Evaluation Logic ---
 
101
  def format_prompt(item):
102
  """Formats the MMLU question and choices into a standardized prompt."""
103
  prompt = f"Question: {item['question']}\n\nChoices:\nA. {item['choices'][0]}\nB. {item['choices'][1]}\nC. {item['choices'][2]}\nD. {item['choices'][3]}\n\nAnswer:"
 
116
  match = re.search(r"Answer:\s*([ABCD])", output_text.strip(), re.IGNORECASE)
117
  if match:
118
  return match.group(1).upper()
119
+
120
  # Fallback: if the model just outputs a letter
121
  match = re.search(r"^\s*([ABCD])\b", output_text.strip())
122
  if match:
123
  return match.group(1).upper()
 
124
  return None
125
 
126
  def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
 
144
  for item in progress.tqdm(dataset, desc=f"Evaluating {subject}"):
145
  prompt, correct_answer_idx = format_prompt(item)
146
  expected_letter = get_choice_letter(correct_answer_idx)
147
+
148
  # The generated text is often just after the prompt. We need to slice it.
149
  full_prompt_text = generator.tokenizer.decode(generator.tokenizer.encode(prompt), skip_special_tokens=True)
150
+
151
  # Generate a short response, aiming for a single letter answer.
152
  # do_sample=False (greedy decoding) is crucial for reproducibility.
153
  raw_output = generator(prompt, max_new_tokens=5, do_sample=False, pad_token_id=generator.tokenizer.eos_token_id)[0]["generated_text"]
154
+
155
  # Isolate the newly generated part
156
  generated_text_only = raw_output[len(full_prompt_text):].strip()
 
157
  predicted_letter = extract_predicted_letter(generated_text_only)
158
  is_correct = (predicted_letter == expected_letter)
159
+
160
  if is_correct:
161
  correct_predictions += 1
162
+
163
  results_details.append({
164
  "Correct": "✅" if is_correct else "❌",
165
  "Correct": "βœ…" if is_correct else "❌",
 
167
  "Predicted": predicted_letter or "N/A",
168
  "Model Output": generated_text_only
169
  })
 
170
  accuracy = (correct_predictions / num_samples) * 100 if num_samples > 0 else 0
171
  return accuracy, results_details
172
 
 
173
  @spaces.GPU()
174
  def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress(track_tqdm=True)):
175
  """
 
180
  try:
181
  gr.Info("Starting evaluation...")
182
  generator = load_model(model_id)
183
+
184
  dataset_id = BENCHMARK_MAP.get(benchmark_category)
185
  if not dataset_id:
186
  raise ValueError(f"Invalid benchmark category: {benchmark_category}")
 
189
  summary_lines = []
190
  total_correct = 0
191
  total_samples = 0
192
+
193
  subjects_to_run = []
194
  if subject_name == "ALL":
195
  # Exclude the "ALL" placeholder from the list of subjects to run
 
210
  gr.Info(f"Evaluating {benchmark_category} - {subject} ({i+1}/{len(subjects_to_run)})...")
211
  try:
212
  accuracy, subject_details = evaluate_single_subject(generator, dataset_id, subject, sample_count, progress)
213
+
214
  all_results_details.extend(subject_details)
215
  num_correct = sum(1 for d in subject_details if d['Correct'] == "✅")
216
  num_evaluated = len(subject_details)
 
217
  total_correct += num_correct
218
  total_samples += num_evaluated
219
  summary_lines.append(f"- **{subject}**: {accuracy:.2f}% ({num_correct}/{num_evaluated})")
220
+
221
  except Exception as e:
222
  error_trace = traceback.format_exc()
223
  gr.Error(f"Skipping {subject} due to an error: {e}")
224
  summary_lines.append(f"- **{subject}**: Evaluation failed. See logs for details:\n```\n{error_trace}\n```")
225
  continue
226
+
227
  overall_accuracy = (total_correct / total_samples) * 100 if total_samples > 0 else 0
228
+
229
  # --- Prepare Outputs ---
230
  if subject_name == "ALL":
231
  result_summary = f"### Overall Average Accuracy: {overall_accuracy:.2f}%\n"
 
234
  else:
235
  result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
236
  result_summary += f"({total_correct:,}/{total_samples:,} correct)"
237
+
238
  # Save results for leaderboard
239
  record = {
240
  "model_id": model_id,
 
246
  }
247
  with open(EVAL_FILE, "a") as f:
248
  f.write(json.dumps(record) + "\n")
249
+
250
  gr.Info("Evaluation completed successfully!")
251
+
252
  df_details = pd.DataFrame(all_results_details)
253
+
254
  # Return a dictionary of component updates
255
  return {
256
  result_summary_output: gr.update(value=result_summary, visible=True),
 
258
  details_box: gr.update(visible=True),
259
  detailed_results_df: gr.update(value=df_details)
260
  }
 
261
  except Exception as e:
262
  error_message = f"An unexpected error occurred during setup: {e}"
263
  error_details = traceback.format_exc()
264
  gr.Error(error_message)
265
+
266
  return {
267
  result_summary_output: gr.update(visible=False),
268
  error_box: gr.update(visible=True),
 
271
  details_box: gr.update(visible=False)
272
  }
273
 
 
274
  # --- UI Helper Functions ---
 
275
  def update_subject_dropdown(benchmark_category):
276
  """Updates the subject dropdown choices based on the selected benchmark."""
277
  choices = ALL_BENCHMARK_SUBJECTS.get(benchmark_category, [])
 
287
  try:
288
  if not os.path.exists(EVAL_FILE):
289
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
290
+
291
  df = pd.read_json(EVAL_FILE, lines=True)
292
  if df.empty:
293
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
 
295
  # Coerce accuracy to numeric and filter valid entries
296
  df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
297
  df.dropna(subset=['accuracy'], inplace=True)
298
+
299
  # Filter by the selected benchmark (e.g., MMLU or MMLU-Pro)
300
  df_filtered = df[(df['benchmark'] == benchmark_filter) & (df['subject'] == 'ALL')].copy()
301
+
302
  if df_filtered.empty:
303
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
304
 
305
  # Find the latest evaluation for each model
306
  df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])
307
  latest_evals = df_filtered.loc[df_filtered.groupby('model_id')['timestamp'].idxmax()].copy()
308
+
309
  leaderboard_df = latest_evals.sort_values(by="accuracy", ascending=False).copy()
310
+
311
  # Add Rank
312
  leaderboard_df.insert(0, 'Rank', range(1, len(leaderboard_df) + 1))
 
313
  # Rename and format columns
314
  leaderboard_df.rename(columns={
315
  'model_id': 'Model ID',
 
317
  'sample_count': 'Total Samples',
318
  'timestamp': 'Date'
319
  }, inplace=True)
320
+
321
  leaderboard_df['Avg. Accuracy (%)'] = leaderboard_df['Avg. Accuracy (%)'].map('{:.2f}'.format)
322
  leaderboard_df['Date'] = leaderboard_df['Date'].dt.strftime('%Y-%m-%d')
323
+
324
  progress(1, desc="Done.")
325
  return leaderboard_df[['Rank', 'Model ID', 'Avg. Accuracy (%)', 'Total Samples', 'Date']]
 
326
  except Exception as e:
327
  gr.Error(f"Error loading leaderboard: {e}")
328
  traceback.print_exc()
329
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
330
 
 
331
  # --- Gradio Interface Definition ---
332
+ # Black/Orange Theme and bigger to fit screen
333
+ custom_css = """
334
+ /* --- Global & Layout (Bigger to fit screen) --- */
335
+ body { font-family: 'Inter', sans-serif; background-color: #1a1a1a; color: #f0f0f0; } /* Dark background, light text */
336
+ .gradio-container { max-width: 95% !important; margin: auto; padding: 20px; } /* Wider container */
337
+ .gr-group {
338
+ border-radius: 12px !important;
339
+ box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important; /* Darker shadow */
340
+ border: 1px solid #333 !important; /* Darker border */
341
+ background-color: #2a2a2a; /* Darker group background */
342
+ }
343
+ .gr-panel {
344
+ border-radius: 12px !important;
345
+ box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important;
346
+ border: 1px solid #333 !important;
347
+ background-color: #2a2a2a;
348
+ }
349
+
350
+ /* --- Typography (Orange Hues) --- */
351
+ h1 { text-align: center; font-size: 3rem !important; font-weight: 800; color: #ff8c00; margin-bottom: 0.5rem; letter-spacing: -1.5px; } /* Orange title */
352
+ h3, h4 { color: #ffa500; } /* Orange headings */
353
+ .subtitle { text-align: center; color: #cccccc; font-size: 1.2rem; margin-bottom: 2.5rem; max-width: 900px; margin-left: auto; margin-right: auto;}
354
+ label { color: #f0f0f0 !important; } /* Label text color */
355
 
356
+ /* --- Tabs --- */
357
+ .gradio-tabs { background-color: #2a2a2a; border-radius: 12px; }
358
+ .gradio-tab-item { color: #f0f0f0; }
359
+ .gradio-tabs button {
360
+ background-color: #3a3a3a !important;
361
+ color: #f0f0f0 !important;
362
+ border-radius: 8px 8px 0 0 !important;
363
+ transition: all 0.3s ease;
364
+ }
365
+ .gradio-tabs button.selected {
366
+ background-color: #ff8c00 !important; /* Orange selected tab */
367
+ color: #1a1a1a !important; /* Dark text on orange */
368
+ font-weight: 700;
369
+ }
370
+ .gradio-tabs button:hover { background-color: #555 !important; }
371
+
372
+ /* --- Inputs --- */
373
+ .gr-textbox, .gr-dropdown, .gr-slider {
374
+ background-color: #3a3a3a !important;
375
+ color: #f0f0f0 !important;
376
+ border: 1px solid #555 !important;
377
+ border-radius: 8px !important;
378
+ }
379
+ .gr-textbox textarea, .gr-textbox input, .gr-dropdown input {
380
+ color: #f0f0f0 !important;
381
+ }
382
+ .gr-textbox.gr-text-input:focus-within {
383
+ border-color: #ff8c00 !important; /* Orange focus border */
384
+ box-shadow: 0 0 0 2px rgba(255, 140, 0, 0.5) !important;
385
+ }
386
+
387
+
388
+ /* --- Buttons --- */
389
+ .gr-button { font-weight: 600 !important; transition: all 0.2s ease; border-radius: 8px !important; }
390
+ .gr-button-primary {
391
+ background-color: #ff8c00 !important; /* Orange primary button */
392
+ color: #1a1a1a !important;
393
+ box-shadow: 0 4px 10px rgba(255, 140, 0, 0.3);
394
+ border: none;
395
+ }
396
+ .gr-button-primary:hover {
397
+ transform: translateY(-2px);
398
+ box-shadow: 0 6px 15px rgba(255, 140, 0, 0.5);
399
+ background-color: #ffa500 !important; /* Slightly lighter orange on hover */
400
+ }
401
+ .gr-button-secondary {
402
+ background-color: #444 !important;
403
+ color: #f0f0f0 !important;
404
+ border: 1px solid #555 !important;
405
+ }
406
+ .gr-button-secondary:hover {
407
+ background-color: #555 !important;
408
+ }
409
 
410
  /* --- Custom Radio Buttons (Segmented Control) --- */
411
  #leaderboard-toggle-group { display: flex; justify-content: center; align-items: center; gap: 1rem; margin-bottom: 1.5rem; }
412
+ #leaderboard-toggle { background-color: #3a3a3a; padding: 5px; border-radius: 10px; display: inline-flex; border: 1px solid #555; }
413
  #leaderboard-toggle div.gr-form { display: flex; gap: 5px; }
414
  #leaderboard-toggle input[type='radio'] { display: none; }
415
+ #leaderboard-toggle label {
416
+ padding: 8px 16px;
417
+ border-radius: 8px;
418
+ cursor: pointer;
419
+ transition: all 0.3s ease;
420
+ font-weight: 500;
421
+ color: #f0f0f0;
422
+ background: transparent;
423
+ border: none;
424
+ box-shadow: none;
425
+ }
426
+ #leaderboard-toggle input[type='radio']:checked + label {
427
+ background-color: #ff8c00; /* Orange selected */
428
+ color: #1a1a1a;
429
+ font-weight: 600;
430
+ box-shadow: 0 2px 5px rgba(255, 140, 0, 0.3);
431
+ }
432
+ #leaderboard-toggle label:hover {
433
+ background-color: #555;
434
+ }
435
+
436
  /* --- Dataframe / Table Styling --- */
437
  .leaderboard-table .gr-dataframe table { border-collapse: collapse; width: 100%; }
438
+ .leaderboard-table .gr-dataframe thead th {
439
+ background-color: #3a3a3a !important;
440
+ color: #ffa500 !important; /* Orange headers */
441
+ font-weight: 600 !important;
442
+ text-align: left;
443
+ padding: 12px 15px;
444
+ border-bottom: 2px solid #555;
445
+ }
446
+ .leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) { background-color: #2f2f2f; } /* Alternating row color */
447
+ .leaderboard-table .gr-dataframe tbody tr:hover { background-color: #4a4a4a; } /* Hover effect */
448
+ .leaderboard-table .gr-dataframe tbody td {
449
+ padding: 12px 15px;
450
+ border-bottom: 1px solid #3a3a3a;
451
+ color: #f0f0f0;
452
+ }
453
+ .leaderboard-table .gr-dataframe tbody td:first-child { font-weight: 700; color: #ffcc99; } /* Lighter orange for rank */
454
 
455
  /* --- Error & Result Panes --- */
456
+ #error-display-box {
457
+ background-color: #4a1e1e !important; /* Dark red for error */
458
+ border-color: #8c2f2f !important;
459
+ color: #ffc9c9 !important; /* Lighter red text */
460
+ }
461
+ #result-summary-box {
462
+ background-color: #1e3a2a !important; /* Dark green for success */
463
+ border-color: #2f8c4a !important;
464
+ color: #c9ffc9 !important; /* Lighter green text */
465
+ }
466
+ .gr-markdown p { color: #f0f0f0 !important; } /* Ensure markdown paragraph text is visible */
467
+ .gr-markdown strong { color: #ffa500 !important; } /* Strong text in orange */
468
+ .gradio-message { background-color: #ff8c00 !important; color: #1a1a1a !important; border: 1px solid #ff8c00 !important; } /* Gradio Info messages */
469
+ """
470
+
471
+ with gr.Blocks(theme=gr.themes.Base(), css=custom_css) as demo:
472
  gr.Markdown("<h1>🏆 Open LLM Evaluator</h1>")
473
+ gr.Markdown("<p class='subtitle'>Benchmark leading models on MMLU. Your results contribute to a live leaderboard. Select a benchmark and run an evaluation, or view the current standings.</p>")
474
+
475
  with gr.Tabs() as tabs:
476
  # --- Leaderboard Tab ---
477
  with gr.TabItem("📊 Leaderboard", id=0):
478
  with gr.Column():
479
  with gr.Row(elem_id="leaderboard-toggle-group"):
480
+ # Temporarily remove MMLU-Pro from radio options
481
  leaderboard_type_toggle = gr.Radio(
482
+ ["MMLU"],
483
  label="Select Benchmark",
484
  value="MMLU",
485
  interactive=True,
 
488
  show_label=False,
489
  )
490
  refresh_button = gr.Button("🔄 Refresh", size="sm")
 
491
  leaderboard_table_output = gr.DataFrame(
492
  headers=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"],
493
  interactive=False,
494
  datatype=["number", "str", "str", "number", "str"],
495
+ row_count=15, # Adjusted for more rows
496
+ elem_classes="leaderboard-table",
497
+ # Removed col_count to allow dynamic width
498
  )
499
+
500
  # --- Evaluation Tab ---
501
  with gr.TabItem("🚀 Run Evaluation", id=1):
502
  with gr.Row(variant='panel'):
 
506
  model_id_input = gr.Textbox(
507
  label="Hugging Face Model ID",
508
  placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
509
+ interactive=True,
510
+ scale=2 # Increased scale for textbox
511
  )
512
+ # Temporarily remove MMLU-Pro from radio options
513
  benchmark_selection_radio = gr.Radio(
514
+ ["MMLU"],
515
  label="Benchmark",
516
  value="MMLU",
517
  interactive=True,
 
519
  with gr.Row():
520
  benchmark_subject_dropdown = gr.Dropdown(
521
  label="Subject",
522
+ # Ensure only MMLU subjects are fetched
523
+ choices=ALL_BENCHMARK_SUBJECTS.get("MMLU", []),
524
  value="ALL",
525
  interactive=True
526
  )
 
528
  label="Samples per Subject",
529
  minimum=5, maximum=100, value=25, step=5, interactive=True
530
  )
 
531
  run_button = gr.Button("Start Evaluation", variant="primary", scale=1)
532
+
533
  with gr.Column(scale=3):
534
  gr.Markdown("### 2. View Results")
535
+
536
  # Panel for displaying the summary of results
537
  with gr.Group(visible=False) as result_summary_box:
538
  result_summary_output = gr.Markdown(elem_id="result-summary-box")
539
+
540
  # Panel for displaying errors
541
  with gr.Group(visible=False) as error_box:
542
  error_output = gr.Textbox(label="Error Message", interactive=False, elem_id="error-display-box")
543
  error_details_output = gr.Textbox(label="Error Details (Traceback)", interactive=False, lines=8)
544
+
545
  # Panel for detailed, row-by-row results
546
  with gr.Group(visible=False) as details_box:
547
  gr.Markdown("#### Detailed Evaluation Log")
 
549
  headers=["Question", "Correct", "Expected", "Predicted", "Model Output"],
550
  datatype=["str", "str", "str", "str", "str"],
551
  interactive=False,
552
+ row_count=10, # Adjusted for more rows
553
+ # Removed col_count to allow dynamic width
554
  wrap=True,
555
  )
556
 
557
+ # --- Event Handlers & Logic ---
 
558
  # Update subject dropdown when benchmark type changes
559
  benchmark_selection_radio.change(
560
  fn=update_subject_dropdown,
561
  inputs=[benchmark_selection_radio],
562
  outputs=[benchmark_subject_dropdown]
563
  )
564
+
565
  # Main evaluation trigger
566
  run_button.click(
567
  fn=run_evaluation,
 
595
 
596
  # Launch the Gradio app
597
  if __name__ == "__main__":
598
+ demo.launch(debug=True)
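
For quick local verification of the answer-parsing path touched by this commit, here is a minimal standalone sketch in Python. The `extract_predicted_letter` regexes are copied from the diff above; `get_choice_letter` is an assumption here, since its body does not appear in these hunks (an index-to-letter lookup is the most plausible behaviour).

```python
# Minimal sketch of app.py's answer-parsing path (assumptions noted inline).
import re

def get_choice_letter(idx):
    # Assumption: the diff only shows calls to this helper, not its body;
    # an index-to-letter lookup is the most plausible behaviour.
    return "ABCD"[idx] if 0 <= idx < 4 else None

def extract_predicted_letter(output_text):
    """Parse the model's completion for an A-D answer, as in the diff above."""
    match = re.search(r"Answer:\s*([ABCD])", output_text.strip(), re.IGNORECASE)
    if match:
        return match.group(1).upper()
    # Fallback: the model may emit just a bare letter.
    match = re.search(r"^\s*([ABCD])\b", output_text.strip())
    if match:
        return match.group(1).upper()
    return None

if __name__ == "__main__":
    print(get_choice_letter(2))                       # -> C
    print(extract_predicted_letter("Answer: c"))      # -> C
    print(extract_predicted_letter(" B. Because..."))  # -> B
    print(extract_predicted_letter("I am not sure"))  # -> None
```

Because the app generates with `do_sample=False` and `max_new_tokens=5`, completions are short and deterministic, so this two-stage regex is usually enough to recover a single letter.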