""" Agent Evaluation Runner ====================== This module implements a framework for evaluating LLM agents against a set of questions and submitting the results to a scoring server. Main components: - BasicAgent: The agent implementation that processes questions - Evaluation functions: For running and submitting results - Gradio interface: For user interaction """ import os import logging from typing import Tuple, List, Dict, Any, Optional import gradio as gr import requests import pandas as pd from langchain_core.messages import HumanMessage from agent import build_graph # Configure logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S" ) logger = logging.getLogger(__name__) # --- Constants --- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space" REQUEST_TIMEOUT = 60 # seconds class BasicAgent: """ A LangGraph-based agent that answers questions using a graph-based workflow. This agent takes natural language questions, processes them through a predefined graph workflow, and returns the answer. Attributes: graph: The LangGraph workflow that processes the questions """ def __init__(self): """Initialize the agent with a graph-based workflow.""" logger.info("Initializing BasicAgent") self.graph = build_graph() def __call__(self, question: str) -> str: """ Process a question and return an answer. Args: question: The natural language question to process Returns: The agent's answer to the question """ logger.info(f"Processing question (first 50 chars): {question[:50]}...") # Wrap the question in a HumanMessage from langchain_core messages = [HumanMessage(content=question)] # Process through the graph messages = self.graph.invoke({"messages": messages}) # Extract and clean the answer answer = messages['messages'][-1].content # Remove the "FINAL ANSWER:" prefix if present return answer[14:] if answer.startswith("FINAL ANSWER:") else answer def fetch_questions(api_url: str) -> List[Dict[str, Any]]: """ Fetch questions from the evaluation server. Args: api_url: Base URL of the evaluation API Returns: List of question data dictionaries Raises: requests.exceptions.RequestException: If there's an error fetching questions """ questions_url = f"{api_url}/questions" logger.info(f"Fetching questions from: {questions_url}") response = requests.get(questions_url, timeout=REQUEST_TIMEOUT) response.raise_for_status() questions_data = response.json() if not questions_data: raise ValueError("Fetched questions list is empty or invalid format") logger.info(f"Successfully fetched {len(questions_data)} questions") return questions_data def run_agent_on_questions( agent: BasicAgent, questions_data: List[Dict[str, Any]] ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: """ Run the agent on a list of questions. 
def run_agent_on_questions(
    agent: BasicAgent,
    questions_data: List[Dict[str, Any]]
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Run the agent on a list of questions.

    Args:
        agent: The agent to run
        questions_data: List of question data dictionaries

    Returns:
        Tuple of (answers_payload, results_log)
    """
    results_log = []
    answers_payload = []

    logger.info(f"Running agent on {len(questions_data)} questions...")

    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")

        if not task_id or question_text is None:
            logger.warning(f"Skipping item with missing task_id or question: {item}")
            continue

        try:
            submitted_answer = agent(question_text)

            # Prepare answer for submission
            answers_payload.append({
                "task_id": task_id,
                "submitted_answer": submitted_answer
            })

            # Log result for display
            results_log.append({
                "Task ID": task_id,
                "Question": question_text,
                "Submitted Answer": submitted_answer
            })
        except Exception as e:
            logger.error(f"Error running agent on task {task_id}: {e}", exc_info=True)

            # Log error in results
            results_log.append({
                "Task ID": task_id,
                "Question": question_text,
                "Submitted Answer": f"AGENT ERROR: {e}"
            })

    return answers_payload, results_log


def submit_answers(
    api_url: str,
    username: str,
    agent_code: str,
    answers_payload: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """
    Submit answers to the evaluation server.

    Args:
        api_url: Base URL of the evaluation API
        username: Hugging Face username
        agent_code: URL to the agent code repository
        answers_payload: List of answer dictionaries

    Returns:
        Response data from the server

    Raises:
        requests.exceptions.RequestException: If there's an error during submission
    """
    submit_url = f"{api_url}/submit"

    # Prepare submission data
    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload
    }

    logger.info(f"Submitting {len(answers_payload)} answers to: {submit_url}")

    # Submit answers
    response = requests.post(submit_url, json=submission_data, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    result_data = response.json()
    logger.info("Submission successful")

    return result_data

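
# For reference, a sketch of the JSON body that submit_answers POSTs to
# /submit, assembled from the arguments above. Field names come straight from
# the code; the example values are placeholders:
#
#   {
#       "username": "your-hf-username",
#       "agent_code": "https://huggingface.co/spaces/<space_id>/tree/main",
#       "answers": [
#           {"task_id": "example-task-1", "submitted_answer": "4"},
#       ]
#   }
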
def run_and_submit_all(profile: Optional[gr.OAuthProfile] = None) -> Tuple[str, Optional[pd.DataFrame]]:
    """
    Fetch all questions, run the BasicAgent on them, submit all answers,
    and display the results.

    Args:
        profile: Gradio OAuth profile containing user information

    Returns:
        Tuple of (status_message, results_dataframe); the dataframe is None
        when the user is not logged in
    """
    # Check if user is logged in
    if not profile:
        logger.warning("User not logged in")
        return "Please Login to Hugging Face with the button.", None

    username = profile.username
    logger.info(f"User logged in: {username}")

    # Get the space ID for linking to code
    space_id = os.getenv("SPACE_ID")
    api_url = DEFAULT_API_URL
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

    try:
        # 1. Instantiate Agent
        agent = BasicAgent()

        # 2. Fetch Questions
        questions_data = fetch_questions(api_url)

        # 3. Run Agent on Questions
        answers_payload, results_log = run_agent_on_questions(agent, questions_data)

        if not answers_payload:
            logger.warning("Agent did not produce any answers to submit")
            return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

        # 4. Submit Answers
        result_data = submit_answers(api_url, username, agent_code, answers_payload)

        # 5. Format and Return Results
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        results_df = pd.DataFrame(results_log)
        return final_status, results_df

    except requests.exceptions.HTTPError as e:
        # Handle HTTP errors with detailed error information
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except requests.exceptions.JSONDecodeError:
            error_detail += f" Response: {e.response.text[:500]}"

        status_message = f"Submission Failed: {error_detail}"
        logger.error(status_message)
        results_df = pd.DataFrame(results_log if 'results_log' in locals() else [])
        return status_message, results_df

    except requests.exceptions.Timeout:
        status_message = f"Submission Failed: The request timed out after {REQUEST_TIMEOUT} seconds"
        logger.error(status_message)
        results_df = pd.DataFrame(results_log if 'results_log' in locals() else [])
        return status_message, results_df

    except Exception as e:
        status_message = f"An unexpected error occurred: {str(e)}"
        logger.error(status_message, exc_info=True)
        results_df = pd.DataFrame(results_log if 'results_log' in locals() else [])
        return status_message, results_df


def create_gradio_interface() -> gr.Blocks:
    """
    Create and configure the Gradio interface.

    Returns:
        Configured Gradio Blocks interface
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Agent Evaluation Runner")
        gr.Markdown(
            """
            ## Instructions

            1. **Clone this space** and modify the code to define your agent's logic, tools, and dependencies
            2. **Log in to your Hugging Face account** using the button below (required for submission)
            3. **Run Evaluation** to fetch questions, run your agent, and submit answers

            ## Important Notes

            - The evaluation process may take several minutes to complete
            - This agent framework is intentionally minimal to allow for your own improvements
            - Consider implementing caching or async processing for better performance
            """
        )

        gr.LoginButton()

        run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")

        status_output = gr.Textbox(
            label="Run Status / Submission Result",
            lines=5,
            interactive=False
        )
        results_table = gr.DataFrame(
            label="Questions and Agent Answers",
            wrap=True
        )

        # No explicit `inputs` here: because run_and_submit_all annotates its
        # parameter as gr.OAuthProfile, Gradio injects the logged-in user's
        # profile automatically when a gr.LoginButton is present.
        run_button.click(
            fn=run_and_submit_all,
            outputs=[status_output, results_table]
        )

    return demo

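
# A minimal offline smoke test, bypassing the scoring server entirely. This is
# a sketch: the question dict is a placeholder populated only with the
# task_id/question keys that run_agent_on_questions consumes, and it assumes
# agent.build_graph and its model credentials are configured:
#
#   agent = BasicAgent()
#   payload, log = run_agent_on_questions(
#       agent, [{"task_id": "example-task-1", "question": "What is 2 + 2?"}]
#   )
#   print(payload[0]["submitted_answer"])
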
""" logger.info("-" * 30 + " App Starting " + "-" * 30) # Check for SPACE_HOST space_host = os.getenv("SPACE_HOST") if space_host: logger.info(f"✅ SPACE_HOST found: {space_host}") logger.info(f" Runtime URL should be: https://{space_host}.hf.space") else: logger.info("ℹ️ SPACE_HOST environment variable not found (running locally?).") # Check for SPACE_ID space_id = os.getenv("SPACE_ID") if space_id: logger.info(f"✅ SPACE_ID found: {space_id}") logger.info(f" Repo URL: https://huggingface.co/spaces/{space_id}") logger.info(f" Repo Tree URL: https://huggingface.co/spaces/{space_id}/tree/main") else: logger.info("ℹ️ SPACE_ID environment variable not found (running locally?).") logger.info("-" * (60 + len(" App Starting ")) + "\n") if __name__ == "__main__": # Check environment at startup check_environment() # Create and launch Gradio interface logger.info("Launching Gradio Interface for Agent Evaluation...") demo = create_gradio_interface() demo.launch(debug=True, share=False)