Update app.py
app.py CHANGED
@@ -4,6 +4,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from datasets import load_dataset
 import torch
 import re
+import json
+import pandas as pd
+import matplotlib.pyplot as plt
 
 # Cache to avoid reloading the model
 model_cache = {}
@@ -20,8 +23,7 @@ def load_model(model_id):
     return generator
 
 def format_prompt(item):
-    system_instruction = "\nOnly answer with a single letter: A, B, C, or D."
+    system_instruction = " Only answer with a single letter: A, B, C, or D."
     prompt = f"{item['question']}\nA. {item['choices'][0]}\nB. {item['choices'][1]}"
@@ -55,10 +57,23 @@ def evaluate(model_id, sample_count, config_name):
 
 def run(model_id, sample_count, config_name):
     score, details = evaluate(model_id, sample_count, config_name)
-    formatted = "\n".join([
-        …
+    formatted = "\n\n".join([
+        f"### Question:\n{q}\n\n**Model Answer:** {o}\n**Expected:** {a}\n**Predicted:** {g}\n**Correct:** {c}"
         for q, o, a, g, c in details
     ])
+    accuracy_value = float(score.split()[1][:-1])
+    record = {"model_id": model_id, "subject": config_name, "accuracy": accuracy_value}
+    with open("eval.jsonl", "a") as f:
+        f.write(json.dumps(record) + "\n")
     return score, formatted
 
 def save_text(text):
@@ -105,4 +120,23 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
     run_button.click(run, inputs=[model_id, sample_count, config_name], outputs=[acc_output, detail_output])
     download_button.click(save_text, inputs=detail_output, outputs=gr.File())
 
+    with gr.Row():
+        leaderboard_plot = gr.Plot(label="Leaderboard Chart")
+        leaderboard_table = gr.Dataframe(headers=["Model ID", "Subject", "Accuracy"], interactive=False)
+
+    def load_leaderboard():
+        try:
+            df = pd.read_json("eval.jsonl", lines=True)
+            df_sorted = df.sort_values(by="accuracy", ascending=False).head(10)
+            fig, ax = plt.subplots()
+            ax.barh(df_sorted['model_id'], df_sorted['accuracy'])
+            ax.set_xlabel("Accuracy")
+            ax.set_ylabel("Model")
+            ax.set_title("Top 10 Models")
+            return fig, df_sorted
+        except Exception as e:
+            return plt.figure(), pd.DataFrame(columns=["model_id", "subject", "accuracy"])
+
+    demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot, leaderboard_table])
+
 demo.launch()
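A note on the new persistence step in run(): accuracy_value = float(score.split()[1][:-1]) only works if evaluate() returns a score string shaped like "Accuracy: 85.0%" (it takes the second whitespace-separated token and strips a trailing "%"). That format is not visible in this diff, so the sketch below is an assumption; parse_accuracy is a hypothetical helper, not part of the commit, that fails loudly instead of raising a cryptic IndexError when the format drifts:

    import re

    def parse_accuracy(score: str) -> float:
        # Hypothetical helper, not part of this commit. Assumes the score
        # string contains a percentage such as "Accuracy: 85.0%".
        match = re.search(r"(\d+(?:\.\d+)?)\s*%", score)
        if match is None:
            raise ValueError(f"unrecognized score string: {score!r}")
        return float(match.group(1))

    assert parse_accuracy("Accuracy: 85.0%") == 85.0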
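The leaderboard treats eval.jsonl as an append-only store: run() writes one JSON object per line, and load_leaderboard() reads the whole file back with pandas. A minimal, self-contained sketch of that round trip, with made-up record values:

    import json
    import pandas as pd

    # Append one result per line, as run() does after each evaluation.
    record = {"model_id": "example/model", "subject": "abstract_algebra", "accuracy": 42.0}
    with open("eval.jsonl", "a") as f:
        f.write(json.dumps(record) + "\n")

    # Load every line into a DataFrame and rank it, as load_leaderboard() does.
    df = pd.read_json("eval.jsonl", lines=True)
    print(df.sort_values(by="accuracy", ascending=False).head(10))

One caveat: Space filesystems are ephemeral by default, so eval.jsonl is lost on restart; persistent storage (or pushing records to a dataset repo) would be needed for a durable leaderboard.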