RafaelJaime committed on
Commit
7a9b69c
·
verified ·
1 Parent(s): 6172280

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +315 -0
app.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import random
4
+ from datasets import load_dataset, Dataset
5
+ from typing import Dict, List
6
+ import re
7
+ import datetime
8
+ import pandas as pd
9
+ import os
10
+ from dotenv import load_dotenv
11
+
12
+ load_dotenv()
13
+
14
def sanitize_theme_name(theme: str) -> str:
    """Convert a human-readable theme label into a lowercase identifier.

    Characters other than word characters, whitespace and hyphens are
    removed. Every run of whitespace, hyphens and underscores is then
    collapsed into a single underscore, and the result is lower-cased with
    leading/trailing underscores stripped.

    Args:
        theme: Theme label as stored in the dataset.

    Returns:
        The sanitized name. Accented letters are preserved because the regex
        word class is Unicode-aware for ``str`` patterns. The result is an
        empty string when nothing but separators/punctuation was supplied.
    """
    without_punctuation = re.sub(r'[^\w\s-]', '', theme)
    # Underscores belong to the separator class so that mixed runs such as
    # " - _ " produce one underscore instead of several.
    collapsed = re.sub(r'[-\s_]+', '_', without_punctuation)
    return collapsed.lower().strip('_')
18
+
19
def load_questions_from_dataset(target_theme: str = "FEA Urología") -> Dict[str, List[Dict]]:
    """Load exam questions for one theme, grouped by sanitized theme name.

    Downloads the ``SASLeaderboard/sas_opposition_exam_data`` dataset, keeps
    the rows of its ``train`` split whose ``theme`` equals ``target_theme``
    and converts each usable row into a question dictionary.

    A row is skipped when it has no statement, no correct answer, or fewer
    than three answer options. Rows with exactly three options are padded to
    four by repeating the last option.

    Args:
        target_theme: Exact ``theme`` value to keep.

    Returns:
        Mapping of sanitized theme name to a list of question dicts with the
        keys ``statement``, ``options`` (``A``-``D``), ``real_answer``,
        ``theme``, ``sanitized_theme`` and ``version``.
    """
    dataset = load_dataset("SASLeaderboard/sas_opposition_exam_data")
    dataset = dataset['train'].filter(lambda row: row['theme'] == target_theme)

    questions_by_theme: Dict[str, List[Dict]] = {}
    skipped = 0
    loaded = 0

    for item in dataset:
        raw_answers = item.get('answers') or []
        correct_answer = item.get('correct_answer', '')
        statement = item.get('statement')

        if statement is None or not correct_answer or len(raw_answers) < 3:
            skipped += 1
            continue

        # Pad a copy so the dataset row itself is never mutated.
        answers = list(raw_answers)
        while len(answers) < 4:
            answers.append(answers[-1])

        theme = item['theme']
        sanitized_theme = sanitize_theme_name(theme)

        questions_by_theme.setdefault(sanitized_theme, []).append({
            "statement": statement,
            "options": {
                "A": answers[0],
                "B": answers[1],
                "C": answers[2],
                "D": answers[3],
            },
            "real_answer": correct_answer,
            "theme": theme,
            "sanitized_theme": sanitized_theme,
            "version": item.get('version', 'Default'),
        })
        loaded += 1

    print(f"Loaded {loaded} questions, skipped {skipped} invalid questions")
    return questions_by_theme
67
+
68
def ask_ai_model(api_key: str, model: str, question: Dict, timeout: float = 60.0) -> tuple:
    """Ask an OpenRouter chat model one multiple-choice question.

    Args:
        api_key: OpenRouter API key, sent as a bearer token.
        model: OpenRouter model identifier.
        question: Question dict with a ``statement`` and an ``options`` dict
            holding the keys ``A``-``D``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(response_text, answer)`` tuple. ``answer`` is the value produced
        by ``extract_answer_from_response`` on success, ``"API_ERROR"`` for a
        non-200 reply, or ``"REQUEST_ERROR"`` when the request fails or the
        reply body cannot be parsed. In the error cases ``response_text``
        holds a description of the error.

    Raises:
        KeyError: If ``question`` lacks ``statement`` or one of the options.
    """
    prompt = f"""You are a medical expert taking a urology examination. Please analyze this question carefully and provide your answer.

Question: {question['statement']}

Options:
A) {question['options']['A']}
B) {question['options']['B']}
C) {question['options']['C']}
D) {question['options']['D']}

Please provide your answer in this exact format:
Answer: [A/B/C/D]

Then provide your reasoning."""

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt},
        ],
    }

    try:
        # Without a timeout a stalled connection would block the exam forever.
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        return f"Request Error: {str(exc)}", "REQUEST_ERROR"

    if response.status_code != 200:
        return f"API Error {response.status_code}: {response.text}", "API_ERROR"

    try:
        ai_response = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        return f"Request Error: malformed API response ({exc})", "REQUEST_ERROR"

    # The API may return a null content; callers slice this value, so always
    # hand back a string.
    ai_response = ai_response or ""
    return ai_response, extract_answer_from_response(ai_response)
114
+
115
# A line of the form "Answer: B" / "## answer: (b)"; the letter must be a
# standalone token so that "Answer: Both ..." is not read as "B".
_ANSWER_LINE_RE = re.compile(
    r'^[\s#>]*answer\s*:\s*[\[(]?\s*([a-d])(?![a-z])',
    re.IGNORECASE | re.MULTILINE,
)
# "... answer/option/choice is [a few words] B" within one sentence. The
# keyword is case-insensitive, the letter must be an upper-case standalone
# token so that the article "a" is never taken as option A.
_ANSWER_IS_RE = re.compile(
    r'(?i:\b(?:answer|option|choice)\s+is)\b[^.\n]{0,30}?\b([A-D])\b'
)
# A line that starts with "B)", "(B)", "B." or "B:", or consists of "B" only.
_LINE_START_RE = re.compile(r'^\s*\(?([A-D])(?:[).:]|\s*$)', re.MULTILINE)
# "option B" / "choice (B)".
_OPTION_RE = re.compile(r'(?i:\b(?:option|choice)\s+)\(?([A-D])\b')
# A parenthesised letter such as "(B)".
_PAREN_RE = re.compile(r'\(([A-D])\)')


def extract_answer_from_response(ai_response: str) -> str:
    """Extract the chosen option letter from a model's free-text reply.

    The reply is searched with progressively looser patterns:

    1. an explicit ``Answer: X`` line anywhere in the reply;
    2. a phrase such as ``the answer is X`` anywhere in the reply;
    3. within the first five lines only: a line starting with the letter
       (``B)``, ``(B)``, ``B.``), then ``option X`` / ``choice X``, then a
       parenthesised ``(X)``.

    Markdown emphasis asterisks are ignored. Letters are only accepted as
    standalone tokens; a letter that merely occurs inside a word never
    counts as an answer.

    Args:
        ai_response: Raw text returned by the model.

    Returns:
        ``"A"``, ``"B"``, ``"C"`` or ``"D"`` when an answer is recognised,
        ``"EMPTY_RESPONSE"`` for an empty/blank/``None`` reply, otherwise
        ``"NO_ANSWER_FOUND"``.
    """
    if not ai_response or not ai_response.strip():
        return "EMPTY_RESPONSE"

    # Drop markdown emphasis so "**Answer:** B" reads as "Answer: B".
    text = ai_response.replace('*', '')

    match = _ANSWER_LINE_RE.search(text) or _ANSWER_IS_RE.search(text)

    if match is None:
        # Looser patterns are restricted to the opening lines, where a direct
        # answer would appear; later lines usually discuss every option.
        head = '\n'.join(text.split('\n')[:5])
        for pattern in (_LINE_START_RE, _OPTION_RE, _PAREN_RE):
            match = pattern.search(head)
            if match is not None:
                break

    if match is None:
        return "NO_ANSWER_FOUND"
    return match.group(1).upper()
154
+
155
def save_results_to_dataset(results: List[Dict], hf_token: str = None) -> str:
    """Append exam results to the ``SASLeaderboard/results`` Hub dataset.

    The existing ``train`` split is downloaded (when it can be loaded),
    concatenated with ``results`` and pushed back to the Hub.

    Args:
        results: One dict per answered question; the first entry must have a
            ``model`` key, which is used in the commit message.
        hf_token: HuggingFace token. Falls back to the ``HF_TOKEN``
            environment variable when empty or ``None``.

    Returns:
        A human-readable status message. Failures are reported in the
        returned string rather than raised.
    """
    if not results:
        return "No results to save"

    hf_token = hf_token or os.getenv("HF_TOKEN")
    if not hf_token:
        return "❌ HuggingFace token not found. Please provide it in the interface or set HF_TOKEN environment variable"

    try:
        try:
            # `token=` replaces the removed `use_auth_token=` keyword; with the
            # old keyword the call fails on current `datasets` releases and
            # the handler below would silently discard all earlier results.
            existing_dataset = load_dataset("SASLeaderboard/results", token=hf_token)
            existing_data = existing_dataset['train'].to_pandas()
        except Exception:
            # NOTE(review): any load failure (including a transient network
            # error) is treated as "no previous results", so the push below
            # replaces the dataset with only the new rows. Consider narrowing
            # this to the library's "dataset not found" error.
            existing_data = None

        new_data = pd.DataFrame(results)

        if existing_data is None:
            combined_data = new_data
        else:
            combined_data = pd.concat([existing_data, new_data], ignore_index=True)

        Dataset.from_pandas(combined_data).push_to_hub(
            "SASLeaderboard/results",
            token=hf_token,
            commit_message=f"Automated exam results for {results[0]['model']} - {len(results)} questions",
        )

        return f"✅ Successfully saved {len(results)} results to SASLeaderboard/results dataset"

    except Exception as e:
        return f"❌ Error saving results: {str(e)}"
191
+
192
def run_automated_exam(api_key: str, model: str, hf_token: str = ""):
    """Run every loaded question against ``model`` and stream progress.

    This is a generator: each yielded string is a status update (validation
    error, per-question result, save status, final summary).

    Args:
        api_key: OpenRouter API key.
        model: OpenRouter model identifier.
        hf_token: HuggingFace token forwarded to ``save_results_to_dataset``.

    Yields:
        Progress messages. Any unexpected exception is reported as a final
        error message instead of being raised.
    """
    if not api_key:
        yield "❌ Please provide OpenRouter API key"
        return

    if not model:
        yield "❌ Please provide model name"
        return

    yield "🔄 Loading questions from dataset..."

    try:
        all_questions_by_theme = load_questions_from_dataset()

        all_questions = [
            question
            for theme_questions in all_questions_by_theme.values()
            for question in theme_questions
        ]
        total_questions = len(all_questions)

        if total_questions == 0:
            # Without this guard the accuracy computation below would fail
            # with an unhelpful "division by zero" message.
            yield "❌ No questions were loaded from the dataset"
            return

        yield f"✅ Loaded {total_questions} questions from dataset"
        yield f"🚀 Starting automated exam with ALL {total_questions} questions for model: {model}"

        session_id = f"{model}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
        results = []
        correct_count = 0

        for i, question in enumerate(all_questions):
            ai_response, ai_answer = ask_ai_model(api_key, model, question)

            if ai_answer in ("API_ERROR", "REQUEST_ERROR", "EMPTY_RESPONSE", "NO_ANSWER_FOUND"):
                # `or ''` keeps the slice safe if the response text is None.
                yield f"⚠️ Question {i+1}: Error getting answer - {ai_answer}. Response: {(ai_response or '')[:100]}..."

            # Tolerate case/whitespace differences in the dataset's answer key;
            # ai_answer is always an upper-case letter or an error marker.
            expected = str(question['real_answer']).strip().upper()
            is_correct = ai_answer == expected
            if is_correct:
                correct_count += 1

            results.append({
                "session_id": session_id,
                "model": model,
                "question": question['statement'],
                "theme": question['theme'],
                "correct_answer": question['real_answer'],
                "ai_answer": ai_answer,
                "ai_response": ai_response,
                "is_correct": is_correct,
                "timestamp": datetime.datetime.now().isoformat(),
                "options_a": question['options']['A'],
                "options_b": question['options']['B'],
                "options_c": question['options']['C'],
                "options_d": question['options']['D'],
            })

            current_accuracy = (correct_count / (i + 1)) * 100

            status_emoji = "✅" if is_correct else "❌"
            yield f"{status_emoji} Q{i+1}/{total_questions}: Accuracy: {correct_count}/{i+1} ({current_accuracy:.1f}%) | AI: {ai_answer} vs Correct: {question['real_answer']} | {question['statement'][:80]}..."

        yield "💾 Saving results to HuggingFace dataset..."

        save_result = save_results_to_dataset(results, hf_token)

        final_accuracy = (correct_count / len(results)) * 100
        yield f"""
## 🎯 Exam Complete!

**Final Results:**
- Model: {model}
- Total Questions: {len(results)}
- Correct Answers: {correct_count}
- Final Accuracy: {final_accuracy:.1f}%
- Session ID: {session_id}

**Save Status:** {save_result}

The automated exam has been completed successfully!
"""

    except Exception as e:
        # Top-level boundary for the UI: report the failure as a message.
        yield f"❌ Error during automated exam: {str(e)}"
275
+
276
# Gradio front end: collects the OpenRouter credentials and model name (plus
# an optional HuggingFace token), then streams the progress messages yielded
# by run_automated_exam into a read-only textbox.
with gr.Blocks(title="Automated Urology Exam System") as demo:
    gr.Markdown("# Automated Urology Exam System")
    gr.Markdown("This system automatically runs a complete urology exam for AI models using ALL available questions (~150) and saves results to the dataset.")

    with gr.Row():
        with gr.Column():
            gr.Markdown("**Get your API key:** [OpenRouter Keys](https://openrouter.ai/settings/keys)")
            api_key_input = gr.Textbox(
                label="OpenRouter API Key",
                type="password",
                placeholder="Enter your OpenRouter API key"
            )
        with gr.Column():
            gr.Markdown("**Find models:** [OpenRouter Models](https://openrouter.ai/models)")
            model_input = gr.Textbox(
                label="Model Name",
                placeholder="e.g., anthropic/claude-3-sonnet",
                value="anthropic/claude-3-sonnet"
            )

    with gr.Row():
        # run_automated_exam accepts an hf_token and the save step tells the
        # user to "provide it in the interface", so the interface offers it.
        hf_token_input = gr.Textbox(
            label="HuggingFace Token (optional)",
            type="password",
            placeholder="Leave empty to use the HF_TOKEN environment variable"
        )

    with gr.Row():
        start_exam_btn = gr.Button("Start Automated Exam", variant="primary", size="lg")

    with gr.Row():
        progress_output = gr.Textbox(
            label="Exam Progress - Don't close this window",
            placeholder="Exam progress will be displayed here...",
            lines=15,
            max_lines=20,
            interactive=False
        )

    start_exam_btn.click(
        run_automated_exam,
        inputs=[api_key_input, model_input, hf_token_input],
        outputs=[progress_output]
    )

if __name__ == "__main__":
    demo.launch()