Enderchef committed
Commit 903eadb · verified · 1 Parent(s): f6dce38

Update app.py

Files changed (1)
  1. app.py +23 -36
app.py CHANGED
@@ -1,7 +1,7 @@
  import os
  import gradio as gr
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
- from datasets import load_dataset
+ from datasets import load_dataset, get_dataset_config_names # Import get_dataset_config_names
  import torch
  import re
  import json
@@ -23,37 +23,26 @@ def load_model(model_id):
  return generator

  def format_prompt(item):
- system_instruction = " Only answer with a single letter: A, B, C, or D."
+ # Emphasize the single letter answer instruction to encourage concise output
+ system_instruction = "Respond ONLY with a single capital letter: A, B, C, or D. No other text."
  prompt = f"""{item['question']}
  A. {item['choices'][0]}
  B. {item['choices'][1]}
  C. {item['choices'][2]}
  D. {item['choices'][3]}
- Answer:{system_instruction}"""
+ Answer: {system_instruction}""" # Place instruction after 'Answer:' with a space
  return prompt, item['answer']

  def extract_choice_letter(output):
+ # This function should now be more reliable as max_new_tokens is set to 1
  match = re.search(r"\b([ABCD])\b", output.strip())
  return match.group(1) if match else None

- # Modified evaluate function to return accuracy as a float directly
  def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):
  if config_name == "ALL":
- subjects = [
- "abstract_algebra", "anatomy", "astronomy", "business_ethics", "college_biology",
- "college_chemistry", "college_computer_science", "college_mathematics", "college_medicine",
- "college_physics", "computer_security", "econometrics", "electrical_engineering",
- "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology",
- "high_school_chemistry", "high_school_computer_science", "high_school_european_history",
- "high_school_geography", "high_school_government_and_politics", "high_school_macroeconomics",
- "high_school_microeconomics", "high_school_physics", "high_school_psychology",
- "high_school_statistics", "high_school_us_history", "high_school_world_history", "human_aging",
- "human_sexuality", "international_law", "jurisprudence", "logical_fallacies", "machine_learning",
- "management", "marketing", "medical_genetics", "miscellaneous", "moral_disputes",
- "moral_scenarios", "nutrition", "philosophy", "prehistory", "professional_accounting",
- "professional_law", "professional_medicine", "professional_psychology", "public_relations",
- "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions"
- ]
+ # Dynamically get all MMLU subjects
+ subjects = get_dataset_config_names("cais/mmlu", token=HF_TOKEN)
+
  gen = load_model(model_id)
  total_correct = 0
  total_samples = 0
@@ -64,16 +53,16 @@ def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):
  correct = 0
  for j, item in enumerate(progress.tqdm(dataset, desc=f"Processing {subject} samples")):
  prompt, answer = format_prompt(item)
- output = gen(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]
+ # Crucial change: Limit generation to 1 new token
+ output = gen(prompt, max_new_tokens=1, do_sample=False)[0]["generated_text"]
  output_letter = extract_choice_letter(output)
  correct += output_letter == answer
  all_results.append((prompt, output.strip(), answer, output_letter, output_letter == answer))
- # No need to write subject-level record here, only aggregate
  total_correct += correct
  total_samples += len(dataset)
  avg_accuracy = total_correct / total_samples * 100
- # Return the float accuracy value
  return avg_accuracy, all_results
+
  gen = load_model(model_id)
  dataset = load_dataset("cais/mmlu", config_name, token=HF_TOKEN)["test"]
  dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
@@ -83,19 +72,17 @@ def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):

  for i, item in enumerate(progress.tqdm(dataset, desc=f"Processing {config_name} samples")):
  prompt, answer = format_prompt(item)
- output = gen(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]
+ # Crucial change: Limit generation to 1 new token
+ output = gen(prompt, max_new_tokens=1, do_sample=False)[0]["generated_text"]
  output_letter = extract_choice_letter(output)
  is_correct = output_letter == answer
  correct += is_correct
  results.append((prompt, output.strip(), answer, output_letter, is_correct))

  accuracy = correct / len(dataset) * 100
- # Return the float accuracy value
  return accuracy, results

- # Pass progress to evaluate function
  def run(model_id, sample_count, config_name, progress=gr.Progress()):
- # Receive accuracy_value directly as a float
  accuracy_value, details = evaluate(model_id, sample_count, config_name, progress)

  formatted = "\n\n".join([
@@ -103,17 +90,15 @@ def run(model_id, sample_count, config_name, progress=gr.Progress()):
  for q, o, a, g, c in details
  ])

- # Format the score string based on config_name
  if config_name == "ALL":
  score_string = f"Average Accuracy: {accuracy_value:.2f}% across all subjects"
  else:
- # Assuming len(details) corresponds to the number of samples processed for a single subject
  score_string = f"Accuracy: {accuracy_value:.2f}%, out of {len(details)} samples"

  record = {"model_id": model_id, "subject": config_name, "accuracy": accuracy_value}
  with open("eval.jsonl", "a") as f:
  f.write(json.dumps(record) + "\n")
- return score_string, formatted # Return the formatted string and details
+ return score_string, formatted

  def save_text(text):
  return "evaluation_results.txt", text
@@ -128,14 +113,17 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
  Enter your model ID, pick MMLU, choose a subject, and hit evaluate.
  """)

+ # Get all MMLU subject config names dynamically
+ mmlu_subjects = ["ALL"] + get_dataset_config_names("cais/mmlu", token=HF_TOKEN)
+
  with gr.Row():
  model_id = gr.Textbox(label="Your Hugging Face Model ID", placeholder="e.g., your-org/your-model")
  config_name = gr.Dropdown(
- label="Choose MMLU Subject",
- choices=["ALL"],
- value="ALL",
- interactive=False
- )
+ label="Choose MMLU Subject",
+ choices=mmlu_subjects, # Populate with all subjects
+ value="ALL",
+ interactive=True # Make interactive now that there are more choices
+ )
  sample_count = gr.Slider(label="Number of Samples", minimum=1, maximum=100, value=10, step=1)

  run_button = gr.Button("🚀 Run Evaluation")
@@ -143,7 +131,6 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
  detail_output = gr.Textbox(label="Evaluation Details", lines=20, interactive=False)
  download_button = gr.Button("📥 Download Full Evaluation")

- # Pass progress to the run function
  run_button.click(run, inputs=[model_id, sample_count, config_name], outputs=[acc_output, detail_output])
  download_button.click(save_text, inputs=detail_output, outputs=gr.File())

@@ -172,4 +159,4 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt

  demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot, leaderboard_table])

- demo.launch()
+ demo.launch()
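
A minimal sketch, not part of the commit, of how the two changes interact outside the Space: get_dataset_config_names replaces the hard-coded subject list, and max_new_tokens=1 forces the answer letter to be the first generated token. The model ID below is a small placeholder, and trimming the echoed prompt before matching is this sketch's own choice (the app searches the full generated text).

from datasets import get_dataset_config_names, load_dataset
from transformers import pipeline
import re

# Subjects are discovered at runtime instead of being hard-coded.
subjects = get_dataset_config_names("cais/mmlu")
print(len(subjects), subjects[:3])

# One MMLU item, formatted like the app's prompt.
item = load_dataset("cais/mmlu", "anatomy")["test"][0]
prompt = (
    f"{item['question']}\n"
    f"A. {item['choices'][0]}\nB. {item['choices'][1]}\n"
    f"C. {item['choices'][2]}\nD. {item['choices'][3]}\n"
    "Answer: Respond ONLY with a single capital letter: A, B, C, or D. No other text."
)

# Greedy decoding, capped at one new token (placeholder model for illustration).
gen = pipeline("text-generation", model="sshleifer/tiny-gpt2")
output = gen(prompt, max_new_tokens=1, do_sample=False)[0]["generated_text"]

# The pipeline echoes the prompt; look only at the newly generated text.
match = re.search(r"\b([ABCD])\b", output[len(prompt):].strip())
print(match.group(1) if match else None)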