Hugging Face Space (Running on Zero) — commit "Update app.py". One file changed: `app.py`.
```diff
@@ -268,14 +268,18 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
     score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."

     # Format detailed results for display in the text box
-    [lines 272–278 removed — their original content was not captured in this extraction]
+    # The key change here is to wrap the entire multi-line string construction for each item
+    # within parentheses to ensure it's treated as a single element in the list comprehension.
     formatted_details = "\n\n".join([
+        (
+            f"### Question:\n{item['question']}\n\n"
+            + f"**Choices:**\n" + "\n".join([f"{get_choice_letter(i)}. {c}" for i, c in enumerate(item['choices'])]) + "\n\n"
+            + (f"**Note:** Reasoning models are currently not fully supported for single-letter extraction. The original model output followed:\n" if item.get('is_reasoning_model_output') else "")
+            + f"**Model Raw Output:** {item['model_raw_output']}\n"
+            + f"**Expected Answer:** {item['expected_answer_letter']}\n"
+            + f"**Predicted Answer:** {item['predicted_answer_letter']}\n"
+            + f"**Correct:** {'Yes' if item['is_correct'] else 'No'}"
+        )
         for item in all_evaluation_results
     ])
```