""" Agent Evaluation Runner""" import os import gradio as gr import requests import pandas as pd import json import time from agent.agent import chat_with_agent # --- Constants --- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space" # --- Agent Definition --- class BasicAgent: def __call__(self, question: str) -> str: print(f"Agent received question: {question}") # Get response from the agent using your LLM answer = chat_with_agent(question) return answer.strip() # Return just the clean answer def download_task_file(task_id, api_url): """Download file associated with a task ID""" url = f"{api_url}/files/{task_id}" try: response = requests.get(url) if response.status_code == 200: try: content = response.text if len(content) > 50000: # Limit to 50KB content = content[:50000] return content except UnicodeDecodeError: return f"[Binary file content - {len(response.content)} bytes]" elif response.status_code == 404: return None else: return None except Exception as e: return None def run_and_submit_all(username_input=""): """ Fetches all questions, runs the BasicAgent on them, submits all answers, and displays the results. """ # --- Determine HF Space Runtime URL and Repo URL --- space_id = os.getenv("SPACE_ID") # Get username from input if username_input: username = username_input.strip() print(f"Using provided username: {username}") else: print("No username provided.") return "Please provide a username.", None api_url = DEFAULT_API_URL questions_url = f"{api_url}/questions" submit_url = f"{api_url}/submit" # 1. Instantiate Agent try: agent = BasicAgent() except Exception as e: print(f"Error instantiating agent: {e}") return f"Error initializing agent: {e}", None agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "https://huggingface.co/spaces/kamil1300/agent_course/tree/main" print(agent_code) # 2. Fetch Questions print(f"Fetching questions from: {questions_url}") try: response = requests.get(questions_url, timeout=15) response.raise_for_status() questions_data = response.json() if not questions_data: print("Fetched questions list is empty.") return "Fetched questions list is empty or invalid format.", None # Limit to only 20 questions questions_data = questions_data[:20] print(f"Fetched {len(questions_data)} questions (limited to 20).") except Exception as e: print(f"Error fetching questions: {e}") return f"Error fetching questions: {e}", None # 3. Run your Agent results_log = [] answers_payload = [] print(f"Running agent on {len(questions_data)} questions...") for item in questions_data: task_id = item.get("task_id") question_text = item.get("question") if not task_id or question_text is None: print(f"Skipping item with missing task_id or question: {item}") continue try: # Download task file if available task_file_content = download_task_file(task_id, api_url) # Prepare the full context for the agent if task_file_content: full_context = f"Context/File Content:\n{task_file_content}\n\nQuestion: {question_text}" print(f"\n--- Question {task_id} ---") print(f"Question: {question_text}") print(f"File content length: {len(task_file_content)} characters") print(f"File content preview: {task_file_content[:200]}...") else: full_context = question_text print(f"\n--- Question {task_id} ---") print(f"Question: {question_text}") print("No file content available") # Get answer from your LLM agent with full context submitted_answer = agent(full_context) # Clean up the answer - extract only the final answer after "FINAL ANSWER:" if "FINAL ANSWER:" in submitted_answer: submitted_answer = submitted_answer.split("FINAL ANSWER:")[-1].strip() # Remove any extra explanations or context if "\n\n" in submitted_answer: submitted_answer = submitted_answer.split("\n\n")[0].strip() # Take only the first sentence if it's still too long if len(submitted_answer.split()) > 5: submitted_answer = submitted_answer.split('.')[0].strip() # Better answer cleaning submitted_answer = submitted_answer.strip() submitted_answer = submitted_answer.replace('"', '') # Remove quotes submitted_answer = submitted_answer.lower() # Standardize case # Print the answer for debugging print(f"Answer: {submitted_answer}") # Small delay to avoid overwhelming the API time.sleep(1) # Create answer entry in the required format answer_entry = { "task_id": task_id, "submitted_answer": submitted_answer } answers_payload.append(answer_entry) print(f"Answer Entry: {answer_entry}") print("-" * 50) # For display in the table, show truncated versions display_question = question_text[:200] + "..." if len(question_text) > 200 else question_text display_answer = submitted_answer[:200] + "..." if len(submitted_answer) > 200 else submitted_answer results_log.append({ "Task ID": task_id, "Question": display_question, "Model Answer": display_answer, "Score": "N/A" # No scoring since ground truth not available }) except Exception as e: print(f"Error running agent on task {task_id}: {e}") error_response = { "task_id": task_id, "submitted_answer": f"AGENT ERROR: {e}" } answers_payload.append(error_response) results_log.append({ "Task ID": task_id, "Question": question_text[:200] + "..." if question_text and len(question_text) > 200 else question_text, "Model Answer": f"AGENT ERROR: {e}", "Score": "ERROR" }) if not answers_payload: print("Agent did not produce any answers to submit.") return "Agent did not produce any answers to submit.", pd.DataFrame(results_log) # 4. Prepare Submission in the required format submission_data = { "username": username.strip(), "agent_code": agent_code, "answers": answers_payload } # Print the final submission format print("\n" + "="*60) print("FINAL SUBMISSION FORMAT:") print("="*60) print(json.dumps(submission_data, indent=2)) print("="*60) status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..." print(status_update) # 5. Submit print(f"Submitting {len(answers_payload)} answers to: {submit_url}") try: response = requests.post(submit_url, json=submission_data, timeout=60) response.raise_for_status() result_data = response.json() final_status = ( f"Submission Successful!\n" f"User: {result_data.get('username')}\n" f"Overall Score: {result_data.get('score', 'N/A')}% " f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n" f"Message: {result_data.get('message', 'No message received.')}" ) print("Submission successful.") results_df = pd.DataFrame(results_log) return final_status, results_df except Exception as e: status_message = f"Submission Failed: {e}" print(status_message) results_df = pd.DataFrame(results_log) return status_message, results_df # --- Build Gradio Interface --- with gr.Blocks() as demo: gr.Markdown("# Agent Evaluation Runner") gr.Markdown( """ **Instructions:** 1. Enter your Hugging Face username in the text box below. 2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score. **Note:** This will take some time as the agent processes all questions. """ ) username_input = gr.Textbox(label="Enter your Hugging Face username", placeholder="your_username") run_button = gr.Button("Run Evaluation & Submit All Answers") status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False) results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True) run_button.click( fn=run_and_submit_all, inputs=[username_input], outputs=[status_output, results_table] ) if __name__ == "__main__": print("\n" + "-"*30 + " App Starting " + "-"*30) space_host_startup = os.getenv("SPACE_HOST") space_id_startup = os.getenv("SPACE_ID") if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}") print(f" Runtime URL should be: https://{space_host_startup}.hf.space") else: print("ℹ️ SPACE_HOST environment variable not found (running locally?).") if space_id_startup: print(f"✅ SPACE_ID found: {space_id_startup}") print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}") print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main") else: print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.") print("-"*(60 + len(" App Starting ")) + "\n") print("Launching Gradio Interface for Agent Evaluation...") demo.launch(debug=True, share=True)