Enderchef committed · Commit fc74a31 · verified · 1 Parent(s): 64a5484

Update app.py

Files changed (1): app.py (+38, -4)
app.py CHANGED
@@ -4,6 +4,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from datasets import load_dataset
 import torch
 import re
+import json
+import pandas as pd
+import matplotlib.pyplot as plt
 
 # Cache to avoid reloading the model
 model_cache = {}
@@ -20,8 +23,7 @@ def load_model(model_id):
     return generator
 
 def format_prompt(item):
-    system_instruction = "
-Only answer with a single letter: A, B, C, or D."
+    system_instruction = " Only answer with a single letter: A, B, C, or D."
     prompt = f"{item['question']}\nA. {item['choices'][0]}\nB. {item['choices'][1]}
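
Note: the leading space in the fixed system_instruction suggests it is concatenated directly onto the prompt. A minimal sketch of what format_prompt() presumably assembles; the example item, the C/D lines, and the concatenation are assumptions, since the hunk cuts off after choice B:

# Hypothetical MMLU-style item; 'question' and 'choices' match the diff context.
item = {"question": "What is 2 + 2?", "choices": ["3", "4", "5", "6"]}

system_instruction = " Only answer with a single letter: A, B, C, or D."
prompt = (
    f"{item['question']}\n"
    f"A. {item['choices'][0]}\n"
    f"B. {item['choices'][1]}\n"
    f"C. {item['choices'][2]}\n"  # C and D lines assumed; not visible in the hunk
    f"D. {item['choices'][3]}"
)
print(prompt + system_instruction)
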
@@ -55,10 +57,23 @@ def evaluate(model_id, sample_count, config_name):
 
 def run(model_id, sample_count, config_name):
     score, details = evaluate(model_id, sample_count, config_name)
     formatted = "\n\n".join([
         f"### Question:\n{q}\n\n**Model Answer:** {o}\n**Expected:** {a}\n**Predicted:** {g}\n**Correct:** {c}"
         for q, o, a, g, c in details
     ])
+    accuracy_value = float(score.split()[1][:-1])
+    record = {"model_id": model_id, "subject": config_name, "accuracy": accuracy_value}
+    with open("eval.jsonl", "a") as f:
+        f.write(json.dumps(record) + "\n")
     return score, formatted
 
 def save_text(text):
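
Note: the new accuracy parse assumes a fixed shape for the score string returned by evaluate(), which this diff does not show. A minimal sketch of that assumption, with a hypothetical score value:

import re

# Hypothetical score string; evaluate()'s exact wording is not in this diff.
score = "Accuracy: 85.0% (17/20 correct)"

token = score.split()[1]            # second whitespace-separated token -> "85.0%"
accuracy_value = float(token[:-1])  # strip the trailing '%' -> 85.0

# A more tolerant alternative (app.py already imports re):
match = re.search(r"([\d.]+)%", score)
accuracy_value = float(match.group(1)) if match else 0.0

If evaluate() ever rewords its score string, the split-based parse raises or returns garbage, so the regex variant may be the safer choice.
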
@@ -105,4 +120,23 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
     run_button.click(run, inputs=[model_id, sample_count, config_name], outputs=[acc_output, detail_output])
     download_button.click(save_text, inputs=detail_output, outputs=gr.File())
 
+    with gr.Row():
+        leaderboard_plot = gr.Plot(label="Leaderboard Chart")
+        leaderboard_table = gr.Dataframe(headers=["Model ID", "Subject", "Accuracy"], interactive=False)
+
+    def load_leaderboard():
+        try:
+            df = pd.read_json("eval.jsonl", lines=True)
+            df_sorted = df.sort_values(by="accuracy", ascending=False).head(10)
+            fig, ax = plt.subplots()
+            ax.barh(df_sorted['model_id'], df_sorted['accuracy'])
+            ax.set_xlabel("Accuracy")
+            ax.set_ylabel("Model")
+            ax.set_title("Top 10 Models")
+            return fig, df_sorted
+        except Exception as e:
+            return plt.figure(), pd.DataFrame(columns=["model_id", "subject", "accuracy"])
+
+    demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot, leaderboard_table])
+
 demo.launch()
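
Note: the leaderboard simply reads back whatever run() has appended to eval.jsonl. A self-contained sketch of that round trip, using hypothetical records in the same shape:

import json
import pandas as pd

# Append two hypothetical records in the shape run() writes.
with open("eval.jsonl", "a") as f:
    for rec in (
        {"model_id": "distilgpt2", "subject": "astronomy", "accuracy": 32.5},
        {"model_id": "gpt2", "subject": "astronomy", "accuracy": 41.0},
    ):
        f.write(json.dumps(rec) + "\n")

# Read them back exactly as load_leaderboard() does.
df = pd.read_json("eval.jsonl", lines=True)
print(df.sort_values(by="accuracy", ascending=False).head(10))

Because the file is append-only, repeated evaluations of the same model accumulate duplicate rows; load_leaderboard() does not deduplicate them.
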
 