Enderchef committed on
Commit 2d01a29 · verified · 1 Parent(s): 02583ad

Update app.py

Files changed (1)
  1. app.py +11 -7
app.py CHANGED
@@ -268,14 +268,18 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
     score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."
 
     # Format detailed results for display in the text box
+    # The key change here is to wrap the entire multi-line string construction for each item
+    # within parentheses to ensure it's treated as a single element in the list comprehension.
     formatted_details = "\n\n".join([
-        f"### Question:\n{item['question']}\n\n"
-        f"**Choices:**\n" + "\n".join([f"{get_choice_letter(i)}. {c}" for i, c in enumerate(item['choices'])]) + "\n\n"
-        + (f"**Note:** Reasoning models are currently not fully supported for single-letter extraction. The original model output followed:\n" if item.get('is_reasoning_model_output') else "")
-        f"**Model Raw Output:** {item['model_raw_output']}\n"
-        f"**Expected Answer:** {item['expected_answer_letter']}\n"
-        f"**Predicted Answer:** {item['predicted_answer_letter']}\n"
-        f"**Correct:** {'Yes' if item['is_correct'] else 'No'}"
+        (
+            f"### Question:\n{item['question']}\n\n"
+            + f"**Choices:**\n" + "\n".join([f"{get_choice_letter(i)}. {c}" for i, c in enumerate(item['choices'])]) + "\n\n"
+            + (f"**Note:** Reasoning models are currently not fully supported for single-letter extraction. The original model output followed:\n" if item.get('is_reasoning_model_output') else "")
+            + f"**Model Raw Output:** {item['model_raw_output']}\n"
+            + f"**Expected Answer:** {item['expected_answer_letter']}\n"
+            + f"**Predicted Answer:** {item['predicted_answer_letter']}\n"
+            + f"**Correct:** {'Yes' if item['is_correct'] else 'No'}"
+        )
         for item in all_evaluation_results
     ])
 
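Why the parentheses and explicit "+" matter: Python only merges adjacent string literals implicitly, so once a parenthesized expression such as the conditional **Note** piece appears in the chain, the next f-string can no longer be glued on implicitly and the comprehension breaks. A minimal standalone sketch of the same pattern (the items list and field names below are hypothetical, not taken from app.py):

    items = [
        {"question": "2 + 2 = ?", "is_reasoning_model_output": True},
        {"question": "3 + 3 = ?", "is_reasoning_model_output": False},
    ]

    formatted = "\n\n".join([
        (
            # Explicit "+" between every piece, wrapped in parentheses so the
            # whole expression is a single element of the list comprehension.
            f"### Question:\n{item['question']}\n"
            + ("**Note:** reasoning model output\n" if item.get("is_reasoning_model_output") else "")
            + "**Correct:** Yes"
        )
        for item in items
    ])

    print(formatted)

Each item in the comprehension evaluates to one string, which is what "\n\n".join(...) expects.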