Enderchef committed
Commit 09fee22 · verified · 1 parent: ecf55e4

Create app.py

Files changed (1):
app.py  +259 -0
app.py ADDED
@@ -0,0 +1,259 @@
import gradio as gr
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import json
import os
from datetime import datetime
import time

# --- Configuration ---
QA_FILE = "qa.txt"
RESULTS_FILE = "Eval_results.jsonl"
JUDGE_MODEL_REPO = "google/flan-t5-base"  # A capable but relatively small model for judging

# --- Setup: Ensure files exist ---
if not os.path.exists(RESULTS_FILE):
    with open(RESULTS_FILE, "w") as f:
        pass  # Create an empty file if it doesn't exist

if not os.path.exists(QA_FILE):
    # Create a dummy qa.txt if it's missing, with a few example questions
    dummy_data = """ID,Question_Type,Question,Golden_Answer_Summary
1,Code,"Create a Python function that implements the Bubble Sort algorithm.","The function should take a list, use nested loops to compare adjacent elements, and swap them if they are in the wrong order. The outer loop runs n times, and the inner loop runs n-i-1 times."
2,Common Chat,"What is the capital of France?","The answer must be Paris."
3,Advanced Code,"Write a Python script that connects to a public FTP server, lists the files in the root directory, and then disconnects.","The script must import the `ftplib` library. It should create an FTP object, for example `FTP('ftp.dlptest.com')`, call the `login()` method, then `retrlines('LIST')` to print the directory listing, and finally `quit()` to close the connection."
"""
    with open(QA_FILE, "w") as f:
        f.write(dummy_data)

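# qa.txt is expected to be a CSV with the columns shown in the dummy data above:
# ID, Question_Type, Question, Golden_Answer_Summary. The evaluation loop below reads
# them via itertuples() as row.ID, row.Question and row.Golden_Answer_Summary, so a
# custom benchmark file should keep these exact column names.
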
# --- AI Judge Logic ---
def get_ai_judge_verdict(judge_pipeline, question, golden_summary, ai_answer):
    """
    Uses the AI Judge model to give a verdict on the tested model's answer.
    """
    system_instruction = f"""
    You are an expert evaluator for an AI model benchmark. Your task is to determine if the AI's answer is a correct and satisfactory response to the user's question. You must only respond with a single character: '1' for a correct/passing answer, or '0' for an incorrect/failing answer.

    A '1' means the AI's answer correctly addresses the main components of the question and is similar in spirit to the expected golden answer summary.
    A '0' means the AI's answer is factually wrong, does not address the question, is a refusal to answer, or is fundamentally incomplete.

    ---
    User Question:
    {question}

    Expected Golden Answer Summary:
    {golden_summary}

    ---
    AI Model's Answer:
    {ai_answer}
    ---

    Based on this, is the AI Model's Answer correct? Respond with only '1' or '0'.
    """
    try:
        response = judge_pipeline(system_instruction, max_new_tokens=5)
        # Extract the generated text and clean it up
        verdict = response[0]['generated_text'].strip()
        # Ensure the verdict is either '1' or '0'
        if '1' in verdict:
            return 1
        else:
            return 0
    except Exception:
        # If the judge fails for any reason, default to a failing grade
        return 0

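# Note: flan-t5-base is a seq2seq model, so the "text2text-generation" pipeline used for
# the judge returns only the generated continuation (it does not echo the prompt), and the
# lenient check in get_ai_judge_verdict counts any output containing a '1' as a pass.
# A hypothetical local smoke test (not part of the app flow) could look like:
#   judge = pipeline("text2text-generation", model=JUDGE_MODEL_REPO)
#   get_ai_judge_verdict(judge, "What is the capital of France?",
#                        "The answer must be Paris.", "Paris.")  # expected to return 1
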
# --- Core Evaluation Logic ---
def run_evaluation(model_repo, model_nickname, progress=gr.Progress()):
    """
    Loads a user-specified model, runs it against the benchmark, evaluates the answers
    using an AI judge, and saves the results.
    """
    if not model_repo or not model_nickname:
        gr.Warning("Model Repository and Nickname cannot be empty.")
        return pd.DataFrame(), None

    # Load benchmark questions
    try:
        questions_df = pd.read_csv(QA_FILE)
        # Use a small subset for quick demos if needed
        # questions_df = questions_df.head(3)
    except Exception as e:
        # gr.Error is an exception and only surfaces in the UI when raised
        raise gr.Error(f"Failed to load benchmark questions from {QA_FILE}: {e}")

    # --- Load Models ---
    progress(0, desc="Loading AI Judge Model...")
    try:
        judge_pipeline = pipeline("text2text-generation", model=JUDGE_MODEL_REPO, device_map="auto", torch_dtype=torch.bfloat16)
    except Exception as e:
        raise gr.Error(f"Failed to load AI Judge model '{JUDGE_MODEL_REPO}': {e}")

    progress(0.1, desc=f"Loading test model: {model_repo}")
    try:
        model_to_test_tokenizer = AutoTokenizer.from_pretrained(model_repo)
        model_to_test = AutoModelForCausalLM.from_pretrained(
            model_repo,
            device_map="auto",
            torch_dtype=torch.bfloat16  # bfloat16 is good for ZeroGPU
        )
        test_pipeline = pipeline(
            "text-generation",
            model=model_to_test,
            tokenizer=model_to_test_tokenizer,
            max_new_tokens=1024,  # Set a reasonable limit for code generation
            do_sample=True,
            temperature=0.7,
            top_p=0.95
        )
    except Exception as e:
        raise gr.Error(f"Failed to load the specified test model '{model_repo}': {e}")

    # --- Run Benchmark Loop ---
    detailed_results = []
    total_score = 0
    total_questions = len(questions_df)

    for i, row in enumerate(questions_df.itertuples()):
        progress_val = 0.1 + (0.8 * (i / total_questions))
        progress(progress_val, desc=f"Running Q{row.ID}/{total_questions}")

        # Generate answer from the model being tested
        try:
            prompt = f"Question: {row.Question}\n\nAnswer:"
            response = test_pipeline(prompt)
            ai_answer = response[0]['generated_text'].replace(prompt, "").strip()
        except Exception as e:
            ai_answer = f"Error during generation: {e}"

        # Get verdict from the AI Judge
        score = get_ai_judge_verdict(judge_pipeline, row.Question, row.Golden_Answer_Summary, ai_answer)
        total_score += score

        detailed_results.append({
            "ID": row.ID,
            "Question": row.Question,
            "AI_Answer": ai_answer,
            "Score": score
        })
        time.sleep(0.1)  # Small delay to allow UI to update

    # --- Finalize and Save Results ---
    progress(0.95, desc="Finalizing and saving...")
    final_score_percent = (total_score / total_questions) * 100 if total_questions > 0 else 0

    run_summary = {
        "model_nickname": model_nickname,
        "model_repo": model_repo,
        "score_percent": round(final_score_percent, 2),
        "timestamp": datetime.utcnow().isoformat(),
        "detailed_results": detailed_results
    }

    try:
        with open(RESULTS_FILE, "a") as f:
            f.write(json.dumps(run_summary) + "\n")
    except Exception as e:
        gr.Warning(f"Could not save results to {RESULTS_FILE}: {e}")

    progress(1, desc="Evaluation Complete!")
    return pd.DataFrame(detailed_results), gr.Markdown(f"**Overall Score: {final_score_percent:.2f}%**")


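# For reference, each evaluation run appends one JSON object per line to Eval_results.jsonl,
# which load_leaderboard() below reads back. An illustrative (not real) record:
#   {"model_nickname": "Gemma-2B-v1", "model_repo": "google/gemma-2b-it",
#    "score_percent": 66.67, "timestamp": "2025-01-01T00:00:00",
#    "detailed_results": [{"ID": 1, "Question": "...", "AI_Answer": "...", "Score": 1}]}
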
# --- Leaderboard Logic ---
def load_leaderboard():
    """
    Loads and displays the leaderboard from the results file.
    """
    if not os.path.exists(RESULTS_FILE) or os.path.getsize(RESULTS_FILE) == 0:
        return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])

    results_data = []
    with open(RESULTS_FILE, "r") as f:
        for line in f:
            try:
                data = json.loads(line)
                results_data.append({
                    "Model Nickname": data.get("model_nickname"),
                    "Score (%)": data.get("score_percent"),
                    "Model Repo": data.get("model_repo"),
                    "Date": datetime.fromisoformat(data.get("timestamp")).strftime('%Y-%m-%d %H:%M:%S')
                })
            except (json.JSONDecodeError, KeyError, TypeError, ValueError):
                # Skip corrupted or malformed lines (including missing/invalid timestamps)
                continue

    if not results_data:
        return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])

    leaderboard_df = pd.DataFrame(results_data)
    leaderboard_df = leaderboard_df.sort_values(by="Score (%)", ascending=False).reset_index(drop=True)
    leaderboard_df["Rank"] = leaderboard_df.index + 1

    # Reorder columns for display
    leaderboard_df = leaderboard_df[["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"]]
    return leaderboard_df


# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(), title="NPFL Benchmark") as demo:
    gr.Markdown("# NPFL (No Placeholders, Full Logic) AI Benchmark")

    with gr.Tabs():
        with gr.TabItem("Run Evaluation"):
            with gr.Row():
                with gr.Column(scale=2):
                    model_repo_input = gr.Textbox(
                        label="Hugging Face Model Repository",
                        placeholder="e.g., google/gemma-2b-it",
                        info="The model to be tested. Must be compatible with the text-generation pipeline."
                    )
                    model_nickname_input = gr.Textbox(
                        label="Model Nickname",
                        placeholder="e.g., Gemma-2B-v1",
                        info="A unique name to display on the leaderboard."
                    )
                    run_button = gr.Button("Start Evaluation", variant="primary")
                with gr.Column(scale=1):
                    final_score_output = gr.Markdown("**Overall Score: --**")

            gr.Markdown("---")
            gr.Markdown("### Detailed Run Results")
            results_output = gr.DataFrame(
                headers=["ID", "Question", "AI_Answer", "Score"],
                wrap=True,
                height=600
            )

        with gr.TabItem("Leaderboard"):
            leaderboard_refresh_button = gr.Button("Refresh Leaderboard")
            leaderboard_output = gr.DataFrame(
                headers=["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"],
                wrap=True,
                height=700
            )

    # --- Event Handlers ---
    run_button.click(
        fn=run_evaluation,
        inputs=[model_repo_input, model_nickname_input],
        outputs=[results_output, final_score_output]
    )

    leaderboard_refresh_button.click(
        fn=load_leaderboard,
        inputs=[],
        outputs=[leaderboard_output]
    )

    # Load leaderboard once on startup
    demo.load(load_leaderboard, None, leaderboard_output)


if __name__ == "__main__":
    demo.launch(debug=True)