Spaces:

Enderchef
/

NPFL-Leaderboard

Runtime error

App Files Community

Enderchef committed on Jun 18

Commit

09fee22

verified ·

1 Parent(s): ecf55e4

Create app.py

Browse files

Files changed (1) hide show

app.py +259 -0

app.py ADDED Viewed

	@@ -0,0 +1,259 @@

+import gradio as gr
+import pandas as pd
+import torch
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+import json
+import os
+from datetime import datetime
+import time
+# --- Configuration ---
+QA_FILE = "qa.txt"
+RESULTS_FILE = "Eval_results.jsonl"
+JUDGE_MODEL_REPO = "google/flan-t5-base" # Small seq2seq model used to grade answers
+# --- Setup: make sure both data files are present before the app starts ---
+# An empty results log is enough; runs are appended to it one JSON line at a time.
+if not os.path.exists(RESULTS_FILE):
+    open(RESULTS_FILE, "w").close()
+# Without a question file, seed a three-question example benchmark in CSV form.
+if not os.path.exists(QA_FILE):
+    dummy_data = """ID,Question_Type,Question,Golden_Answer_Summary
+1,Code,"Create a Python function that implements the Bubble Sort algorithm.","The function should take a list, use nested loops to compare adjacent elements, and swap them if they are in the wrong order. The outer loop runs n times, and the inner loop runs n-i-1 times."
+2,Common Chat,"What is the capital of France?","The answer must be Paris."
+3,Advanced Code,"Write a Python script that connects to a public FTP server, lists the files in the root directory, and then disconnects.","The script must import the `ftplib` library. It should create an FTP object, for example `FTP('ftp.dlptest.com')`, call the `login()` method, then `retrlines('LIST')` to print the directory listing, and finally `quit()` to close the connection."
+"""
+    with open(QA_FILE, "w") as qa_handle:
+        qa_handle.write(dummy_data)
+# --- AI Judge Logic ---
+def get_ai_judge_verdict(judge_pipeline, question, golden_summary, ai_answer):
+    """
+    Ask the AI Judge model to grade one answer and return the verdict as an int.
+    Args:
+        judge_pipeline: Callable invoked as judge_pipeline(prompt, max_new_tokens=5); its result is indexed as result[0]['generated_text'].
+        question: The benchmark question shown to the tested model.
+        golden_summary: Summary of what a correct answer must contain.
+        ai_answer: The tested model's answer to grade.
+    Returns:
+        1 if the judge's first binary digit is '1', otherwise 0. Also 0 when the judge raises or its output contains neither '0' nor '1'.
+    """
+    import logging
+    system_instruction = f"""
+You are an expert evaluator for an AI model benchmark. Your task is to determine if the AI's answer is a correct and satisfactory response to the user's question. You must only respond with a single character: '1' for a correct/passing answer, or '0' for an incorrect/failing answer.
+A '1' means the AI's answer correctly addresses the main components of the question and is similar in spirit to the expected golden answer summary.
+A '0' means the AI's answer is factually wrong, does not address the question, is a refusal to answer, or is fundamentally incomplete.
+---
+User Question:
+{question}
+Expected Golden Answer Summary:
+{golden_summary}
+---
+AI Model's Answer:
+{ai_answer}
+---
+Based on this, is the AI Model's Answer correct? Respond with only '1' or '0'.
+"""
+    try:
+        response = judge_pipeline(system_instruction, max_new_tokens=5)
+        verdict = response[0]['generated_text'].strip()
+    except Exception as exc:
+        # Deliberate best-effort: a broken judge must not abort the whole run, so the answer is failed, but the cause is logged instead of silently dropped.
+        logging.getLogger(__name__).warning("AI judge failed; scoring answer as 0: %s", exc)
+        return 0
+    # Decide on the FIRST binary digit. A plain substring test for '1' would pass outputs such as "0 (not 1)" that merely mention a 1.
+    for char in verdict:
+        if char in "01":
+            return int(char)
+    # No recognizable verdict: default to a failing grade.
+    return 0
+# --- Core Evaluation Logic ---
+def run_evaluation(model_repo, model_nickname, progress=gr.Progress()):
+    """
+    Benchmark a user-specified model and record the run.
+    Reads the questions from QA_FILE, loads the judge model and the model under test, generates one answer per question, scores each answer with get_ai_judge_verdict, and appends a JSON summary of the run to RESULTS_FILE.
+    Args:
+        model_repo: Hugging Face repository id of the model to test.
+        model_nickname: Display name stored with the run.
+        progress: Gradio progress tracker used to report each stage.
+    Returns:
+        A tuple (DataFrame of per-question results, Markdown component with the overall score). If either input is empty, a warning is shown and (empty DataFrame, None) is returned.
+    Raises:
+        gr.Error: If the question file, the judge model, or the test model cannot be loaded.
+    """
+    if not model_repo or not model_nickname:
+        gr.Warning("Model Repository and Nickname cannot be empty.")
+        return pd.DataFrame(), None
+    # Load benchmark questions
+    try:
+        questions_df = pd.read_csv(QA_FILE)
+    except Exception as e:
+        # gr.Error is an exception class: it only reaches the UI when raised. Merely constructing it (as before) showed nothing to the user.
+        raise gr.Error(f"Failed to load benchmark questions from {QA_FILE}: {e}") from e
+    # --- Load Models ---
+    progress(0, desc="Loading AI Judge Model...")
+    try:
+        judge_pipeline = pipeline("text2text-generation", model=JUDGE_MODEL_REPO, device_map="auto", torch_dtype=torch.bfloat16)
+    except Exception as e:
+        raise gr.Error(f"Failed to load AI Judge model '{JUDGE_MODEL_REPO}': {e}") from e
+    progress(0.1, desc=f"Loading test model: {model_repo}")
+    try:
+        model_to_test_tokenizer = AutoTokenizer.from_pretrained(model_repo)
+        model_to_test = AutoModelForCausalLM.from_pretrained(
+            model_repo,
+            device_map="auto",
+            torch_dtype=torch.bfloat16 # bfloat16 is good for ZeroGPU
+        )
+        test_pipeline = pipeline(
+            "text-generation",
+            model=model_to_test,
+            tokenizer=model_to_test_tokenizer,
+            max_new_tokens=1024, # Set a reasonable limit for code generation
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.95
+        )
+    except Exception as e:
+        raise gr.Error(f"Failed to load the specified test model '{model_repo}': {e}") from e
+    # --- Run Benchmark Loop ---
+    detailed_results = []
+    total_score = 0
+    total_questions = len(questions_df)
+    for i, row in enumerate(questions_df.itertuples()):
+        # The loop occupies the 10%-90% band of the progress bar.
+        progress_val = 0.1 + (0.8 * (i / total_questions))
+        progress(progress_val, desc=f"Running Q{row.ID}/{total_questions}")
+        # Generate answer from the model being tested
+        try:
+            prompt = f"Question: {row.Question}\n\nAnswer:"
+            response = test_pipeline(prompt)
+            generated = response[0]['generated_text']
+            # Strip only the echoed prompt at the start; a blanket replace() would also delete the prompt text if the model repeated it inside its answer.
+            if generated.startswith(prompt):
+                generated = generated[len(prompt):]
+            ai_answer = generated.strip()
+        except Exception as e:
+            # A generation failure is recorded as the answer text so the judge grades it and the run continues.
+            ai_answer = f"Error during generation: {e}"
+        # Get verdict from the AI Judge
+        score = get_ai_judge_verdict(judge_pipeline, row.Question, row.Golden_Answer_Summary, ai_answer)
+        total_score += score
+        detailed_results.append({
+            "ID": row.ID,
+            "Question": row.Question,
+            "AI_Answer": ai_answer,
+            "Score": score
+        })
+        time.sleep(0.1) # Small delay to allow UI to update
+    # --- Finalize and Save Results ---
+    progress(0.95, desc="Finalizing and saving...")
+    # Guard against an empty question file to avoid dividing by zero.
+    final_score_percent = (total_score / total_questions) * 100 if total_questions > 0 else 0
+    run_summary = {
+        "model_nickname": model_nickname,
+        "model_repo": model_repo,
+        "score_percent": round(final_score_percent, 2),
+        "timestamp": datetime.utcnow().isoformat(),
+        "detailed_results": detailed_results
+    }
+    try:
+        with open(RESULTS_FILE, "a") as f:
+            f.write(json.dumps(run_summary) + "\n")
+    except Exception as e:
+        # Saving is best-effort: the user still gets the results on screen.
+        gr.Warning(f"Could not save results to {RESULTS_FILE}: {e}")
+    progress(1, desc="Evaluation Complete!")
+    return pd.DataFrame(detailed_results), gr.Markdown(f"**Overall Score: {final_score_percent:.2f}%**")
+# --- Leaderboard Logic ---
+def load_leaderboard():
+    """
+    Build the leaderboard table from the runs stored in RESULTS_FILE.
+    Each line of the file is parsed as one JSON run summary. Lines that are not valid JSON, are not JSON objects, or carry a missing or unparsable timestamp are skipped.
+    Returns:
+        A DataFrame with the columns Rank, Model Nickname, Score (%), Date and Model Repo, sorted by score in descending order. The frame is empty (same columns) when the file is missing, empty, or has no usable line.
+    """
+    # Single column list so the empty and populated frames always agree with each other.
+    columns = ["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"]
+    if not os.path.exists(RESULTS_FILE) or os.path.getsize(RESULTS_FILE) == 0:
+        return pd.DataFrame(columns=columns)
+    results_data = []
+    with open(RESULTS_FILE, "r") as f:
+        for line in f:
+            try:
+                data = json.loads(line)
+                results_data.append({
+                    "Model Nickname": data.get("model_nickname"),
+                    "Score (%)": data.get("score_percent"),
+                    "Model Repo": data.get("model_repo"),
+                    "Date": datetime.fromisoformat(data.get("timestamp")).strftime('%Y-%m-%d %H:%M:%S')
+                })
+            except (ValueError, TypeError, AttributeError):
+                # ValueError: invalid JSON (json.JSONDecodeError is a subclass) or a malformed timestamp string.
+                # TypeError: timestamp missing or not a string. AttributeError: the line is valid JSON but not an object.
+                # Skip corrupted or malformed lines
+                continue
+    if not results_data:
+        return pd.DataFrame(columns=columns)
+    leaderboard_df = pd.DataFrame(results_data)
+    # Coerce scores to numbers so a stray non-numeric value becomes NaN (sorted last) instead of breaking the sort.
+    leaderboard_df["Score (%)"] = pd.to_numeric(leaderboard_df["Score (%)"], errors="coerce")
+    leaderboard_df = leaderboard_df.sort_values(by="Score (%)", ascending=False).reset_index(drop=True)
+    leaderboard_df["Rank"] = leaderboard_df.index + 1
+    # Reorder columns for display
+    leaderboard_df = leaderboard_df[columns]
+    return leaderboard_df
+# --- Gradio UI ---
+# Build the two-tab interface: one tab to launch an evaluation, one to view the leaderboard.
+with gr.Blocks(theme=gr.themes.Soft(), title="NPFL Benchmark") as demo:
+    gr.Markdown("# NPFL (No Placeholders, Full Logic) AI Benchmark")
+    with gr.Tabs():
+        with gr.TabItem("Run Evaluation"):
+            with gr.Row():
+                # Left column: the two text inputs and the start button.
+                with gr.Column(scale=2):
+                    model_repo_input = gr.Textbox(
+                        label="Hugging Face Model Repository",
+                        placeholder="e.g., google/gemma-2b-it",
+                        info="The model to be tested. Must be compatible with the text-generation pipeline."
+                    )
+                    model_nickname_input = gr.Textbox(
+                        label="Model Nickname",
+                        placeholder="e.g., Gemma-2B-v1",
+                        info="A unique name to display on the leaderboard."
+                    )
+                    run_button = gr.Button("Start Evaluation", variant="primary")
+                # Right column: overall score, replaced by the second output of run_evaluation.
+                with gr.Column(scale=1):
+                    final_score_output = gr.Markdown("**Overall Score: --**")
+            gr.Markdown("---")
+            gr.Markdown("### Detailed Run Results")
+            # Headers mirror the keys of the per-question dicts built in run_evaluation.
+            results_output = gr.DataFrame(
+                headers=["ID", "Question", "AI_Answer", "Score"],
+                wrap=True,
+                height=600
+            )
+        with gr.TabItem("Leaderboard"):
+            leaderboard_refresh_button = gr.Button("Refresh Leaderboard")
+            leaderboard_output = gr.DataFrame(
+                headers=["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"],
+                wrap=True,
+                height=700
+            )
+    # --- Event Handlers ---
+    # Start button: run_evaluation receives both text inputs and fills the results table and the score.
+    run_button.click(
+        fn=run_evaluation,
+        inputs=[model_repo_input, model_nickname_input],
+        outputs=[results_output, final_score_output]
+    )
+    # Refresh button: re-read the results file and redraw the leaderboard table.
+    leaderboard_refresh_button.click(
+        fn=load_leaderboard,
+        inputs=[],
+        outputs=[leaderboard_output]
+    )
+    # Load leaderboard once on startup
+    # NOTE(review): demo.load presumably fires on each page load in the browser rather than once per process - confirm against the Gradio version in use.
+    demo.load(load_leaderboard, None, leaderboard_output)
+# Launch the app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch(debug=True)