Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import requests
|
3 |
+
import random
|
4 |
+
from datasets import load_dataset, Dataset
|
5 |
+
from typing import Dict, List
|
6 |
+
import re
|
7 |
+
import datetime
|
8 |
+
import pandas as pd
|
9 |
+
import os
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
|
12 |
+
load_dotenv()
|
13 |
+
|
14 |
+
def sanitize_theme_name(theme: str) -> str:
    """Convert a theme title into a lowercase, underscore-separated key.

    Characters other than word characters, whitespace and hyphens are
    removed; every run of hyphens and/or whitespace collapses to a single
    underscore; the result is lower-cased and stripped of leading and
    trailing underscores.

    Args:
        theme: Human-readable theme title.

    Returns:
        The sanitized key (may be empty if nothing usable remains).
    """
    # Drop punctuation first so it cannot end up adjacent to separators.
    sanitized = re.sub(r'[^\w\s-]', '', theme)
    sanitized = re.sub(r'[-\s]+', '_', sanitized)
    return sanitized.lower().strip('_')
|
18 |
+
|
19 |
+
def load_questions_from_dataset() -> Dict[str, List[Dict]]:
    """Load the urology exam questions and group them by sanitized theme.

    Reads the ``train`` split of ``SASLeaderboard/sas_opposition_exam_data``,
    keeps only rows whose ``theme`` is ``'FEA Urología'`` and converts each
    usable row into a question dict with the keys ``statement``, ``options``
    (``A``-``D``), ``real_answer``, ``theme``, ``sanitized_theme`` and
    ``version``.

    A row is skipped when it has no ``statement``, no ``correct_answer`` or
    fewer than three answers. Rows with exactly three answers are padded to
    four by repeating the last answer.

    Returns:
        Mapping of sanitized theme name to the list of question dicts.
    """
    dataset = load_dataset("SASLeaderboard/sas_opposition_exam_data")
    dataset = dataset['train'].filter(lambda x: x['theme'] == 'FEA Urología')

    questions_by_theme = {}
    skipped = 0
    loaded = 0

    for item in dataset:
        theme = item['theme']
        # Copy the list so the padding below never mutates the dataset row.
        answers = list(item.get('answers') or [])
        correct_answer = item.get('correct_answer', '')

        if (
            'statement' not in item
            or not correct_answer
            or len(answers) < 3
        ):
            skipped += 1
            continue

        # Pad three-option questions so options A-D always exist.
        answers += [answers[-1]] * (4 - len(answers))

        sanitized_theme = sanitize_theme_name(theme)

        question = {
            "statement": item['statement'],
            "options": {
                "A": answers[0],
                "B": answers[1],
                "C": answers[2],
                "D": answers[3]
            },
            "real_answer": correct_answer,
            "theme": theme,
            "sanitized_theme": sanitized_theme,
            "version": item.get('version', 'Default')
        }

        questions_by_theme.setdefault(sanitized_theme, []).append(question)
        loaded += 1

    print(f"Loaded {loaded} questions, skipped {skipped} invalid questions")
    return questions_by_theme
|
67 |
+
|
68 |
+
def ask_ai_model(api_key: str, model: str, question: Dict, timeout: float = 60.0) -> tuple:
    """Send one exam question to an OpenRouter chat model and parse its answer.

    Args:
        api_key: OpenRouter API key, sent as a Bearer token.
        model: OpenRouter model identifier.
        question: Question dict providing ``statement`` and ``options``
            (keys ``A``-``D``).
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block the whole exam run.

    Returns:
        ``(ai_response, ai_answer)``. On success ``ai_response`` is the model's
        text and ``ai_answer`` is whatever ``extract_answer_from_response``
        returns for it. On a non-200 status the pair is
        ``(error message, "API_ERROR")``; on any exception it is
        ``(error message, "REQUEST_ERROR")``.
    """
    prompt = f"""You are a medical expert taking a urology examination. Please analyze this question carefully and provide your answer.

Question: {question['statement']}

Options:
A) {question['options']['A']}
B) {question['options']['B']}
C) {question['options']['C']}
D) {question['options']['D']}

Please provide your answer in this exact format:
Answer: [A/B/C/D]

Then provide your reasoning."""

    try:
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

        data = {
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ]
        }

        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=timeout,
        )

        if response.status_code != 200:
            error_msg = f"API Error {response.status_code}: {response.text}"
            return error_msg, "API_ERROR"

        result = response.json()
        # Normalise a null "content" to "" so callers can always treat the
        # response as a string (e.g. slice it for logging).
        ai_response = result["choices"][0]["message"]["content"] or ""
        ai_answer = extract_answer_from_response(ai_response)
        return ai_response, ai_answer

    except Exception as e:
        # Deliberately broad: any failure on one question (network error,
        # timeout, malformed JSON, unexpected payload shape) is reported as a
        # sentinel instead of raising.
        error_msg = f"Request Error: {str(e)}"
        return error_msg, "REQUEST_ERROR"
|
114 |
+
|
115 |
+
def extract_answer_from_response(ai_response: str) -> str:
    """Extract the chosen option letter (A-D) from a model's free-text reply.

    Only *standalone* letters are considered, i.e. letters that are not part
    of a longer word, so ordinary words containing a/b/c/d are never mistaken
    for an answer. The strategies are tried in order:

    1. A line containing an ``answer:`` label (case-insensitive, markdown
       decoration such as ``**Answer:**`` allowed).
    2. A line containing the phrase ``answer is``.
    3. Within the first five lines, an explicit marker: ``option X``,
       ``choice X``, ``X)`` / ``(X)``, or a line starting with ``X.``.
    4. The whole response, but only if it mentions exactly one distinct
       standalone upper-case option letter.

    Args:
        ai_response: Raw text returned by the model.

    Returns:
        ``"A"``, ``"B"``, ``"C"`` or ``"D"``; ``"EMPTY_RESPONSE"`` when the
        input is empty; ``"NO_ANSWER_FOUND"`` when no strategy succeeds.
    """
    if not ai_response:
        return "EMPTY_RESPONSE"

    lines = ai_response.split('\n')

    def pick_letter(text):
        """Return the first standalone option letter in *text*, or None."""
        match = re.search(r'(?<![A-Za-z])[A-D](?![A-Za-z])', text)
        if match is None:
            # Accept a lower-case letter only when no word follows it, so the
            # article "a" ("a difficult case") is not read as option A.
            match = re.search(
                r'(?<![A-Za-z])[a-d](?![A-Za-z])(?!\s+[A-Za-z])', text
            )
        return match.group(0).upper() if match else None

    # 1. Explicit "Answer: X" label (the format requested in the prompt).
    for line in lines:
        labelled = re.search(
            r'(?<![A-Za-z])answer[\W_]*:(.*)', line, re.IGNORECASE
        )
        if labelled:
            letter = pick_letter(labelled.group(1))
            if letter:
                return letter

    # 2. "... answer is X ..." phrasing.
    for line in lines:
        phrase = re.search(r'answer is(.*)', line, re.IGNORECASE)
        if phrase:
            letter = pick_letter(phrase.group(1))
            if letter:
                return letter

    # 3. Explicit option markers near the top of the response.
    marker = re.compile(
        r'(?i:option|choice)\s+([A-D])(?![A-Za-z])'
        r'|(?<![A-Za-z])([A-D])\)'
        r'|^[\W_]*([A-D])\.'
    )
    for line in lines[:5]:
        found = marker.search(line)
        if found:
            return next(group for group in found.groups() if group)

    # 4. Last resort: accept a lone letter only when it is unambiguous;
    #    guessing here would bias the measured accuracy.
    candidates = set(re.findall(r'(?<![A-Za-z])[A-D](?![A-Za-z])', ai_response))
    if len(candidates) == 1:
        return candidates.pop()

    return "NO_ANSWER_FOUND"
|
154 |
+
|
155 |
+
def save_results_to_dataset(results: List[Dict], hf_token: str = None) -> str:
    """Append exam results to the ``SASLeaderboard/results`` Hub dataset.

    The existing ``train`` split (if the dataset already exists) is loaded,
    the new rows are appended, and the combined table is pushed back.

    Args:
        results: One dict per answered question; ``results[0]['model']`` is
            used in the commit message.
        hf_token: Hugging Face token. When empty, the ``HF_TOKEN``
            environment variable is used instead.

    Returns:
        A human-readable status message; failures are reported in the
        message rather than raised.
    """
    if not results:
        return "No results to save"

    if not hf_token:
        hf_token = os.getenv("HF_TOKEN")

    if not hf_token:
        return "❌ HuggingFace token not found. Please provide it in the interface or set HF_TOKEN environment variable"

    try:
        try:
            existing_dataset = load_dataset("SASLeaderboard/results", token=hf_token)
            existing_data = existing_dataset['train'].to_pandas()
        except FileNotFoundError:
            # Only a missing/empty dataset means "start fresh". Any other
            # failure (network, auth, ...) propagates to the outer handler so
            # previously stored results are never overwritten by a push that
            # contains only the new rows.
            # NOTE(review): assumes the installed `datasets` raises a
            # FileNotFoundError subclass for a missing repo - confirm.
            existing_data = None

        new_data = pd.DataFrame(results)

        if existing_data is not None:
            combined_data = pd.concat([existing_data, new_data], ignore_index=True)
        else:
            combined_data = new_data

        new_dataset = Dataset.from_pandas(combined_data)

        new_dataset.push_to_hub(
            "SASLeaderboard/results",
            token=hf_token,
            commit_message=f"Automated exam results for {results[0]['model']} - {len(results)} questions"
        )

        return f"✅ Successfully saved {len(results)} results to SASLeaderboard/results dataset"

    except Exception as e:
        return f"❌ Error saving results: {str(e)}"
|
191 |
+
|
192 |
+
def run_automated_exam(api_key: str, model: str, hf_token: str = ""):
    """Run every loaded question against a model, yielding progress messages.

    This is a generator: each yielded string is a status update. After all
    questions are asked, the per-question results are passed to
    ``save_results_to_dataset`` and a final summary is yielded.

    Args:
        api_key: OpenRouter API key; required.
        model: OpenRouter model identifier; required.
        hf_token: Hugging Face token forwarded to ``save_results_to_dataset``.

    Yields:
        Progress, warning, error and summary messages.
    """
    if not api_key:
        yield "❌ Please provide OpenRouter API key"
        return

    if not model:
        yield "❌ Please provide model name"
        return

    yield "🔄 Loading questions from dataset..."

    try:
        all_questions_by_theme = load_questions_from_dataset()

        all_questions = []
        for theme_questions in all_questions_by_theme.values():
            all_questions.extend(theme_questions)

        total_questions = len(all_questions)

        # Without this guard the final accuracy would divide by zero and the
        # user would only see an opaque "division by zero" error.
        if total_questions == 0:
            yield "❌ No questions were loaded from the dataset"
            return

        yield f"✅ Loaded {total_questions} questions from dataset"
        yield f"🚀 Starting automated exam with ALL {total_questions} questions for model: {model}"

        session_id = f"{model}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
        results = []
        correct_count = 0
        error_markers = {"API_ERROR", "REQUEST_ERROR", "EMPTY_RESPONSE", "NO_ANSWER_FOUND"}

        for i, question in enumerate(all_questions):

            ai_response, ai_answer = ask_ai_model(api_key, model, question)

            if ai_answer in error_markers:
                # `or ""` keeps the slice safe should the response be None.
                yield f"⚠️ Question {i+1}: Error getting answer - {ai_answer}. Response: {(ai_response or '')[:100]}..."

            # A failed question is still recorded and counts as incorrect.
            is_correct = ai_answer == question['real_answer']
            if is_correct:
                correct_count += 1

            result = {
                "session_id": session_id,
                "model": model,
                "question": question['statement'],
                "theme": question['theme'],
                "correct_answer": question['real_answer'],
                "ai_answer": ai_answer,
                "ai_response": ai_response,
                "is_correct": is_correct,
                "timestamp": datetime.datetime.now().isoformat(),
                "options_a": question['options']['A'],
                "options_b": question['options']['B'],
                "options_c": question['options']['C'],
                "options_d": question['options']['D']
            }

            results.append(result)

            current_accuracy = (correct_count / (i + 1)) * 100

            status_emoji = "✅" if is_correct else "❌"
            yield f"{status_emoji} Q{i+1}/{total_questions}: Accuracy: {correct_count}/{i+1} ({current_accuracy:.1f}%) | AI: {ai_answer} vs Correct: {question['real_answer']} | {question['statement'][:80]}..."

        yield "💾 Saving results to HuggingFace dataset..."

        save_result = save_results_to_dataset(results, hf_token)

        final_accuracy = (correct_count / len(results)) * 100
        yield f"""
## 🎯 Exam Complete!

**Final Results:**
- Model: {model}
- Total Questions: {len(results)}
- Correct Answers: {correct_count}
- Final Accuracy: {final_accuracy:.1f}%
- Session ID: {session_id}

**Save Status:** {save_result}

The automated exam has been completed successfully!
"""

    except Exception as e:
        yield f"❌ Error during automated exam: {str(e)}"
|
275 |
+
|
276 |
+
# Gradio UI: two inputs (API key, model name), a start button, and a read-only
# textbox that receives every message yielded by run_automated_exam.
with gr.Blocks(title="Automated Urology Exam System") as demo:
    gr.Markdown("# Automated Urology Exam System")
    gr.Markdown("This system automatically runs a complete urology exam for AI models using ALL available questions (~150) and saves results to the dataset.")

    with gr.Row():
        with gr.Column():
            gr.Markdown("**Get your API key:** [OpenRouter Keys](https://openrouter.ai/settings/keys)")
            api_key_input = gr.Textbox(
                label="OpenRouter API Key",
                type="password",
                placeholder="Enter your OpenRouter API key"
            )
        with gr.Column():
            gr.Markdown("**Find models:** [OpenRouter Models](https://openrouter.ai/models)")
            model_input = gr.Textbox(
                label="Model Name",
                placeholder="e.g., anthropic/claude-3-sonnet",
                value="anthropic/claude-3-sonnet"
            )

    with gr.Row():
        start_exam_btn = gr.Button("Start Automated Exam", variant="primary", size="lg")

    with gr.Row():
        progress_output = gr.Textbox(
            label="Exam Progress - Dont close this window",
            placeholder="Exam progress will be displayed here...",
            lines=15,
            max_lines=20,
            interactive=False
        )

    # Only the API key and model are wired in; run_automated_exam's hf_token
    # keeps its default, so saving relies on the HF_TOKEN environment variable.
    start_exam_btn.click(
        run_automated_exam,
        inputs=[api_key_input, model_input],
        outputs=[progress_output]
    )

if __name__ == "__main__":
    demo.launch()
|