priyamarwaha committed
Commit a94fa9b · verified · 1 Parent(s): 81917a3

Upload 30 files

Files changed (31)
  1. .gitattributes +2 -0
  2. agent.py +262 -0
  3. answers/1f975693-876d-457b-a649-393859e79bf3.json +5 -0
  4. answers/2d83110e-a098-4ebb-9987-066c06fa42d0.json +5 -0
  5. answers/305ac316-eef6-4446-960a-92d80d542f82.json +5 -0
  6. answers/3cef3a44-215e-4aed-8e3b-b1e3f08063b7.json +5 -0
  7. answers/3f57289b-8c60-48be-bd80-01f8099ca449.json +5 -0
  8. answers/4fc2f1ae-8625-45b5-ab34-ad4433bc21f8.json +5 -0
  9. answers/5a0c1adf-205e-4841-a666-7c3ef95def9d.json +5 -0
  10. answers/6f37996b-2ac7-44b0-8e68-6d28256631b4.json +5 -0
  11. answers/7bd855d8-463d-4ed5-93ca-5fe35145f733.json +5 -0
  12. answers/840bfca7-4f7b-481a-8794-c560c340185d.json +5 -0
  13. answers/8e867cd7-cff9-4e6c-867a-ff5ddc2550be.json +5 -0
  14. answers/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.json +5 -0
  15. answers/9d191bce-651d-4746-be2d-7ef8ecadb9c2.json +5 -0
  16. answers/a0c07678-e491-4bbc-8f0b-07405144218f.json +5 -0
  17. answers/a1e91b78-d3d8-4675-bb8d-62741b4b68a6.json +5 -0
  18. answers/bda648d7-d618-4883-88f4-3466eabd860e.json +5 -0
  19. answers/cabe07ed-9eca-40ea-8ead-410ef5e83f91.json +5 -0
  20. answers/cca530fc-4052-43b2-b130-b30968d8aa44.json +5 -0
  21. answers/cf106601-ab4f-4af9-b045-5295fe67b37d.json +5 -0
  22. answers/f918266a-b3e0-4914-865d-4faa564f1aef.json +5 -0
  23. app.py +180 -105
  24. dataset_helper.py +172 -0
  25. downloads/1f975693-876d-457b-a649-393859e79bf3.mp3 +3 -0
  26. downloads/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx +0 -0
  27. downloads/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 +3 -0
  28. downloads/cca530fc-4052-43b2-b130-b30968d8aa44.png +0 -0
  29. downloads/f918266a-b3e0-4914-865d-4faa564f1aef.py +35 -0
  30. requirements.txt +16 -2
  31. tools.py +314 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ downloads/1f975693-876d-457b-a649-393859e79bf3.mp3 filter=lfs diff=lfs merge=lfs -text
+ downloads/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 filter=lfs diff=lfs merge=lfs -text
agent.py ADDED
@@ -0,0 +1,262 @@
+ # agent.py
+ import logging  # Import logging
+ import os  # For file/directory operations
+ import json  # For reading/writing JSON answer files
+ # import base64  # No longer needed here
+ from typing import TypedDict, Annotated, Optional, List
+
+ from dotenv import load_dotenv  # Import load_dotenv
+
+ from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, ToolMessage
+ from langchain_openai import ChatOpenAI
+ from langgraph.graph import StateGraph, START, END
+ from langgraph.graph.message import add_messages
+ from langgraph.prebuilt import ToolNode, tools_condition
+
+ from dataset_helper import download_file  # For potential use in file handling
+
+
+ # Get the logger instance configured in app.py
+ logger = logging.getLogger("eval_logger")
+
+ # Load environment variables from a .env file at the beginning.
+ # This will load OPENAI_API_KEY if it's set in a .env file in the root directory.
+ if load_dotenv():
+     logger.info(".env file loaded successfully by agent.py.")
+ else:
+     logger.info(".env file not found or empty in agent.py, relying on system environment variables.")
+
+ # Import tools AFTER .env might have been loaded
+ from tools import TOOLS
+
+ # --- Agent State Definition ---
+ class AgentState(TypedDict):
+     task_id: str
+     original_question: str
+     input_file_path: Optional[str]  # Path to the locally downloaded file, if any
+     messages: Annotated[list[AnyMessage], add_messages]
+     # Potentially add other fields like 'scratchpad' or 'intermediate_steps' if needed
+
+ # --- Tool Definitions --- MOVED TO tools.py ---
+ # vision_llm, extract_text_from_image, search_tool, TOOLS list are now in tools.py
+
+ # --- LangGraph Agent Class ---
+ class LangGraphAgent:
+     def __init__(self, api_url: str, answers_dir: str = "answers"):
+         logger.info("LangGraphAgent initializing...")
+         self.api_url = api_url  # Needed for download_file, though not directly by the graph
+         self.answers_dir = answers_dir
+         os.makedirs(self.answers_dir, exist_ok=True)
+         logger.info(f"Answers will be stored in: {os.path.abspath(self.answers_dir)}")
+
+         # Initialize LLM for the agent.
+         # Ensure OPENAI_API_KEY is set in your environment.
+         try:
+             self.llm = ChatOpenAI(model="gpt-4o", temperature=0)
+             # Bind tools imported from tools.py
+             self.agent_llm = self.llm.bind_tools(TOOLS, parallel_tool_calls=False)  # parallel_tool_calls=False as per example
+         except Exception as e:
+             logger.error(f"Failed to initialize agent LLM (ChatOpenAI with gpt-4o) or bind tools: {e}. Ensure OPENAI_API_KEY is set.", exc_info=True)
+             self.llm = None
+             self.agent_llm = None
+
+         # Build the graph
+         self.graph = self._build_graph()
+         logger.info("LangGraphAgent initialized successfully.")
+
+     def _save_answer(self, task_id: str, question: str, answer: str):
+         """Saves the generated answer to a JSON file."""
+         answer_payload = {"task_id": task_id, "question": question, "answer": answer}
+         answer_file_path = os.path.join(self.answers_dir, f"{task_id}.json")
+         try:
+             with open(answer_file_path, 'w') as f:
+                 json.dump(answer_payload, f, indent=4)
+             logger.info(f"Answer for task_id {task_id} saved to {answer_file_path}")
+         except IOError as e:
+             logger.error(f"Error saving answer for task_id {task_id} to {answer_file_path}: {e}", exc_info=True)
+
+     def _load_answer(self, task_id: str) -> str | None:
+         """Loads an answer from a JSON file if it exists."""
+         answer_file_path = os.path.join(self.answers_dir, f"{task_id}.json")
+         if os.path.exists(answer_file_path):
+             try:
+                 with open(answer_file_path, 'r') as f:
+                     answer_data = json.load(f)
+                 logger.info(f"Loaded existing answer for task_id {task_id} from {answer_file_path}")
+                 return answer_data.get("answer")
+             except (IOError, json.JSONDecodeError) as e:
+                 logger.error(f"Error loading answer for task_id {task_id} from {answer_file_path}: {e}", exc_info=True)
+         return None
+
+     # --- Graph Node Definitions ---
+     def _assistant_node(self, state: AgentState):
+         logger.info(f"_assistant_node called for task_id: {state['task_id']}. Current messages count: {len(state['messages'])}")
+         if not self.agent_llm:
+             logger.error("Agent LLM not initialized. Cannot proceed with assistant node.")
+             # Return a message indicating the error, which will be added to state by add_messages.
+             # This helps in debugging and ensures the flow continues to an extent.
+             error_message = SystemMessage(content="Error: Agent LLM not initialized. Cannot generate response.")
+             return {"messages": [error_message]}
+
+         system_prompt_parts = [
+             f"You are a helpful AI assistant for the GAIA benchmark. Your goal is to answer the user's question accurately and concisely. ",
+             f"The user's question is about task_id: {state['task_id']}.\n",
+             f"The original question is: {state['original_question']}\n"
+         ]
+
+         input_file_path = state.get('input_file_path')
+         original_question_text = state['original_question']
+
+         if input_file_path:
+             system_prompt_parts.append(f"A local file is available at path: {input_file_path}. ")
+             file_extension = os.path.splitext(input_file_path)[1].lower()
+             if file_extension in ['.png', '.jpg', '.jpeg', '.gif', '.webp']:
+                 system_prompt_parts.append(f"This file appears to be an image. You can use the 'analyse_image' tool to analyse it. This tool requires the 'img_path' (which is '{input_file_path}') and the 'question' (which is '{original_question_text}') to be passed as arguments. This tool works only for local image files. ")
+             elif file_extension in ['.mp3', '.wav', '.aac', '.flac', '.ogg', '.opus']:  # Common audio types for AssemblyAI
+                 system_prompt_parts.append(f"This file appears to be an audio file. You can use the 'analyse_audio' tool to analyse its content. This tool requires the 'audio_path' (which is '{input_file_path}') and the 'question' (which is '{original_question_text}') to be passed as arguments. This tool works only for local audio files and cannot process web URLs. ")
+             elif file_extension == '.py':
+                 system_prompt_parts.append(f"This file appears to be a Python script. You can use the 'execute_python_code_from_file' tool to understand its content and answer questions about it (e.g., predict its output or describe its functionality). This tool requires the 'file_path' (which is '{input_file_path}') and the 'question' (which is '{original_question_text}') as arguments. This tool analyses the code textually; it does not execute it. ")
+             elif file_extension in ['.xls', '.xlsx']:
+                 system_prompt_parts.append(f"This file appears to be an Excel file. To answer questions requiring calculations, data manipulation, or specific lookups: "
+                                            f"1. You should generate a Python script using the pandas library. "
+                                            f"2. Use the 'execute_pandas_script_for_excel' tool to run this script. "
+                                            f"3. The script will have access to a variable 'excel_file_path' which holds the path: '{input_file_path}'. Use this variable in your script to load the Excel file (e.g., pd.read_excel(excel_file_path)). "
+                                            f"4. Your generated Python script MUST end with a print() statement that outputs ONLY the final answer, precisely formatted. "
+                                            f"5. If you first need to understand the structure of the Excel file (sheet names, columns), you can use the 'analyse_excel_file' tool which provides a textual (CSV) representation of the data. But for computation, use 'execute_pandas_script_for_excel'. "
+                                            f"Pass the '{input_file_path}' as 'excel_file_path' and your generated script as 'python_code' to the 'execute_pandas_script_for_excel' tool. ")
+             else:
+                 system_prompt_parts.append(f"The provided file '{input_file_path}' is not a supported image, audio, Python, or Excel type for direct analysis with available tools. Do not attempt to use 'analyse_image', 'analyse_audio', 'execute_python_code_from_file', or 'analyse_excel_file'/'execute_pandas_script_for_excel' for this file. You may need to rely on web search or the question text itself. ")
+         else:
+             system_prompt_parts.append("No local file was provided with this question. ")
+
+         system_prompt_parts.append("If the question text itself contains a URL (e.g., a link to a YouTube video or other website), you should primarily use the 'web_search' tool to find information related to that URL and the question. For YouTube URLs, specifically rely on 'web_search' as direct transcript access is not available. ")
+         system_prompt_parts.append("You also have access to a 'web_search' tool for general information or if the question implies online content (e.g., a URL mentioned in the question text). ")
+         system_prompt_parts.append("If a tool fails or a file type is unsupported, do not try the same tool repeatedly on it. Use web_search or state you cannot answer if appropriate. ")
+         system_prompt_parts.append("Prioritize answering the question. If after about 5-7 tool execution cycles you cannot find a definitive answer, you MUST provide the best possible answer based on the information you have gathered or state CLEARLY that you cannot answer the question. DO NOT get stuck in overly long loops of tool use. Be decisive and conclude your reasoning.")
+         system_prompt_parts.append("When providing your final answer, it is crucial that it is ONLY the answer itself, with absolutely no additional conversation, explanations, or formatting like 'The answer is...' or 'Based on my findings...'. Be direct. ")
+         system_prompt_parts.append("The final answer format must be one of the following: ")
+         system_prompt_parts.append("1. A number (e.g., 42, 1000, 3.14). Do not use commas for thousands separators (e.g., write 1000 not 1,000). Do not use units like '$' or '%' unless the question explicitly asks for it in the answer format. ")
+         system_prompt_parts.append("2. As few words as possible (e.g., 'Paris', 'Mount Everest'). Do not use articles (a, an, the) unless part of a proper name. Avoid abbreviations (e.g., use 'Los Angeles' not 'LA') unless the question implies it. Write digits in plain text (e.g., 'two' instead of '2') unless the question asks for a numerical digit. ")
+         system_prompt_parts.append("3. A comma-separated list of numbers and/or strings (e.g., 'red,blue,green', '1,2,three', 'Tokyo,London,New York'). Apply the rules from 1 and 2 to each element in the list. Ensure there are no spaces after commas unless a list element itself naturally contains a space (e.g. a multi-word city name). ")
+         system_prompt_parts.append("Adhere to these formatting rules strictly for the final output.")
+         system_prompt_parts.append("You also have access to a 'wikipedia_tool' to get information from Wikipedia. It's good for general knowledge questions, facts, definitions, and summaries on a wide range of topics.")
+         system_prompt_parts.append("For questions specifically about the visual content of a YouTube video, use the 'analyse_youtube' tool. Provide the 'youtube_url' and the 'question'. This tool uses a Gemini multimodal model. If this tool fails or cannot answer, you can fall back to 'web_search' for general information about the video.")
+         system_prompt_parts.append("If you encounter a particularly complex question (e.g., historical queries with multiple constraints, or questions requiring deep, multi-step reasoning) and you are struggling to find a definitive answer after attempting with standard tools (like web_search, wikipedia_tool) for a few cycles (e.g., 2-3 attempts), you can use the 'deep_analysis_with_gemini' tool. Pass the original, full question to this tool. Use this as a strategic escalation for very challenging textual questions.")
+         system_prompt_parts.append("If a tool fails or a file type is unsupported, do not try the same tool repeatedly on it. Use web_search or state you cannot answer if appropriate. ")
+
+         system_prompt = "".join(system_prompt_parts)
+
+         messages_for_llm = [SystemMessage(content=system_prompt)] + state["messages"]
+
+         logger.debug(f"Messages being sent to LLM for task {state['task_id']}: {messages_for_llm}")
+         response_message = self.agent_llm.invoke(messages_for_llm)
+         logger.debug(f"LLM response for task {state['task_id']}: {response_message}")
+         return {"messages": [response_message]}  # LangGraph's add_messages will append this
+
+     def _build_graph(self) -> StateGraph:
+         logger.info("Building LangGraph...")
+         builder = StateGraph(AgentState)
+         builder.add_node("assistant", self._assistant_node)
+         tool_node = ToolNode(TOOLS)  # Create a ToolNode with all our tools
+         builder.add_node("tools", tool_node)
+
+         builder.add_edge(START, "assistant")
+         builder.add_conditional_edges(
+             "assistant",
+             tools_condition,  # LangGraph's prebuilt tools_condition
+             # END  # If no tool call, end. (Modified below to ensure final processing)
+         )
+         # builder.add_edge("tools", "assistant")  # Loop back from tools to assistant
+
+         # Modified flow: tools execute, then always go back to the assistant for summarization/final answer.
+         # If the assistant decided on no tool, tools_condition might route to END if not handled.
+         # We want the assistant to make the final decision to END.
+
+         # If the assistant calls a tool, route to tools.
+         # If the assistant does not call a tool, it should be the final answer.
+         # tools_condition will route to END if no tool calls are present in the AI message.
+         # So, if tools_condition routes to END, it means the assistant provided the final answer.
+
+         builder.add_edge("tools", "assistant")  # Always go back to assistant after a tool run
+
+         # graph = builder.compile(checkpointer=None, recursion_limit=35)  # Incorrect parameter
+         graph = builder.compile(checkpointer=None)  # Corrected: recursion_limit is passed at invoke time
+         logger.info("LangGraph built successfully.")
+         # try:
+         #     # For debugging: display graph structure if possible (requires graphviz)
+         #     # from IPython.display import Image, display
+         #     # display(Image(graph.get_graph(xray=True).draw_mermaid_png()))
+         #     logger.info("Graph visualization (mermaid PNG) can be generated if IPython and graphviz are available.")
+         # except Exception as e:
+         #     logger.warning(f"Could not generate graph visualization: {e}")
+         return graph
+
+     def __call__(self, task_id: str, question: str, file_name: str | None) -> tuple[str, bool]:
+         logger.info(f"LangGraphAgent __call__ for task_id: {task_id}")
+
+         # 1. Check for a cached answer first
+         cached_answer = self._load_answer(task_id)
+         if cached_answer is not None:
+             logger.info(f"Returning cached answer for {task_id}.")
+             return cached_answer, True
+
+         if not self.graph or not self.agent_llm:
+             logger.error("Agent graph or LLM not initialized. Cannot process question.")
+             return "Error: Agent not properly initialized.", False
+
+         # 2. Download the file if provided
+         local_file_path = None
+         if file_name:
+             logger.info(f"Associated file '{file_name}' for task {task_id}. Attempting download.")
+             local_file_path = download_file(self.api_url, task_id, file_name, download_dir="downloads")  # Ensure 'downloads' dir
+             if local_file_path:
+                 logger.info(f"File '{file_name}' available at {local_file_path} for task {task_id}.")
+             else:
+                 logger.error(f"Failed to download file '{file_name}' for task {task_id}.")
+                 # The agent might still try to answer, or this could be a hard failure depending on the question.
+
+         # 3. Invoke the graph
+         initial_state: AgentState = {
+             "task_id": task_id,
+             "original_question": question,
+             "input_file_path": local_file_path,
+             "messages": [HumanMessage(content=question)]
+         }
+
+         final_answer_content = f"Error: Agent did not produce a final answer for task {task_id}."  # Default error
+         try:
+             logger.info(f"Invoking graph for task_id: {task_id} with initial state.")
+             # Stream events for debugging if needed:
+             # for event in self.graph.stream(initial_state, stream_mode="values"):
+             #     logger.debug(f"Graph event for {task_id}: {event}")
+             #     final_state = event
+
+             final_state = self.graph.invoke(initial_state, config={'recursion_limit': 50})  # Increased to 50
+             logger.info(f"Graph invocation complete for task_id: {task_id}.")
+
+             if final_state and final_state.get("messages"):
+                 # The final answer should be the content of the last AI message that is not a tool call
+                 for msg in reversed(final_state["messages"]):
+                     if msg.type == "ai" and not msg.tool_calls:  # AI message without tool calls
+                         final_answer_content = msg.content
+                         logger.info(f"Extracted final answer for {task_id}: '{final_answer_content[:100]}...'")
+                         break
+                     elif msg.type == "system" and "Error: Agent LLM not initialized" in msg.content:  # Check for our specific error
+                         final_answer_content = msg.content
+                         break
+                 else:  # The loop finished without a break (no suitable AI message found)
+                     logger.warning(f"No suitable final AI message found for task {task_id}. Last messages: {final_state.get('messages')}")
+                     # Fallback or specific error message.
+                     # For now, use the last message content if any, or keep the default error.
+                     if final_state.get("messages"):
+                         final_answer_content = final_state["messages"][-1].content  # Best guess
+             else:
+                 logger.error(f"Graph did not return messages in final_state for task {task_id}. Final state: {final_state}")
+
+         except Exception as e:
+             logger.error(f"Error during LangGraph agent execution for task_id {task_id}: {e}", exc_info=True)
+             final_answer_content = f"Error during agent execution: {str(e)}"
+
+         # 4. Save and return the final answer
+         self._save_answer(task_id, question, final_answer_content)
+         return final_answer_content, False  # False because it's newly generated/processed by the graph
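
For orientation, here is a minimal usage sketch of the class added above, mirroring how app.py (changed later in this commit) drives it. The task_id and question values are illustrative placeholders, and OPENAI_API_KEY plus the tools.py module must be available:

# Illustrative usage sketch (not part of the committed files).
from agent import LangGraphAgent

agent = LangGraphAgent(api_url="https://agents-course-unit4-scoring.hf.space", answers_dir="answers")

# __call__ returns (answer, from_cache): answers are cached as answers/{task_id}.json,
# and any file attached to the task is downloaded into downloads/ before the graph runs.
answer, from_cache = agent(
    task_id="00000000-0000-0000-0000-000000000000",  # placeholder task id
    question="Example question text",                # placeholder question
    file_name=None,                                  # or the attached file's name
)
print(answer, from_cache)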
answers/1f975693-876d-457b-a649-393859e79bf3.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "1f975693-876d-457b-a649-393859e79bf3",
+     "question": "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
+     "answer": "132,133,134,197,245"
+ }
answers/2d83110e-a098-4ebb-9987-066c06fa42d0.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
+     "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
+     "answer": "right"
+ }
answers/305ac316-eef6-4446-960a-92d80d542f82.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "305ac316-eef6-4446-960a-92d80d542f82",
+     "question": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
+     "answer": "Piotr"
+ }
answers/3cef3a44-215e-4aed-8e3b-b1e3f08063b7.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
+     "question": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",
+     "answer": "broccoli,celery,lettuce,sweet potatoes,zucchini"
+ }
answers/3f57289b-8c60-48be-bd80-01f8099ca449.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
+     "question": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
+     "answer": "582"
+ }
answers/4fc2f1ae-8625-45b5-ab34-ad4433bc21f8.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
+     "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
+     "answer": "FunkMonk"
+ }
answers/5a0c1adf-205e-4841-a666-7c3ef95def9d.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
+     "question": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?",
+     "answer": "Claus"
+ }
answers/6f37996b-2ac7-44b0-8e68-6d28256631b4.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
+     "question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
+     "answer": "a,b,c,d,e"
+ }
answers/7bd855d8-463d-4ed5-93ca-5fe35145f733.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
+     "question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
+     "answer": "89706.00"
+ }
answers/840bfca7-4f7b-481a-8794-c560c340185d.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "840bfca7-4f7b-481a-8794-c560c340185d",
+     "question": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",
+     "answer": "NNX17AF26G"
+ }
answers/8e867cd7-cff9-4e6c-867a-ff5ddc2550be.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
+     "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
+     "answer": "5"
+ }
answers/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
+     "question": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",
+     "answer": "cornstarch, granulated sugar, lemon juice, ripe strawberries, vanilla extract"
+ }
answers/9d191bce-651d-4746-be2d-7ef8ecadb9c2.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
+     "question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
+     "answer": "Extremely."
+ }
answers/a0c07678-e491-4bbc-8f0b-07405144218f.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
+     "question": "Who are the pitchers with the number before and after Taish\u014d Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
+     "answer": "Hasegawa, VerHagen"
+ }
answers/a1e91b78-d3d8-4675-bb8d-62741b4b68a6.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
+     "question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
+     "answer": "I cannot determine the highest number of bird species on camera simultaneously in the video."
+ }
answers/bda648d7-d618-4883-88f4-3466eabd860e.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
+     "question": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.",
+     "answer": "Saint Petersburg"
+ }
answers/cabe07ed-9eca-40ea-8ead-410ef5e83f91.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
+     "question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
+     "answer": "Franco"
+ }
answers/cca530fc-4052-43b2-b130-b30968d8aa44.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
+     "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
+     "answer": "Qb1+"
+ }
answers/cf106601-ab4f-4af9-b045-5295fe67b37d.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
+     "question": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
+     "answer": "PAN"
+ }
answers/f918266a-b3e0-4914-865d-4faa564f1aef.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
+     "question": "What is the final numeric output from the attached Python code?",
+     "answer": "0"
+ }
app.py CHANGED
@@ -1,103 +1,82 @@
  import os
  import gradio as gr
- import requests
  import inspect
  import pandas as pd

- # (Keep Constants as is)
  # --- Constants ---
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

  # --- Basic Agent Definition ---
  # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
- class BasicAgent:
-     def __init__(self):
-         print("BasicAgent initialized.")
-     def __call__(self, question: str) -> str:
-         print(f"Agent received question (first 50 chars): {question[:50]}...")
-         fixed_answer = "This is a default answer."
-         print(f"Agent returning fixed answer: {fixed_answer}")
-         return fixed_answer
-
- def run_and_submit_all(profile: gr.OAuthProfile | None):
-     """
-     Fetches all questions, runs the BasicAgent on them, submits all answers,
-     and displays the results.
-     """
-     # --- Determine HF Space Runtime URL and Repo URL ---
-     space_id = os.getenv("SPACE_ID")  # Get the SPACE_ID for sending a link to the code
-
-     if profile:
-         username = f"{profile.username}"
-         print(f"User logged in: {username}")
-     else:
-         print("User not logged in.")
-         return "Please Login to Hugging Face with the button.", None
-
-     api_url = DEFAULT_API_URL
-     questions_url = f"{api_url}/questions"
-     submit_url = f"{api_url}/submit"

-     # 1. Instantiate Agent (modify this part to create your agent)
      try:
-         agent = BasicAgent()
-     except Exception as e:
-         print(f"Error instantiating agent: {e}")
-         return f"Error initializing agent: {e}", None
-     # In the case of an app running as a Hugging Face space, this link points toward your codebase (useful for others, so please keep it public)
-     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
-     print(agent_code)

-     # 2. Fetch Questions
-     print(f"Fetching questions from: {questions_url}")
-     try:
-         response = requests.get(questions_url, timeout=15)
-         response.raise_for_status()
-         questions_data = response.json()
-         if not questions_data:
-             print("Fetched questions list is empty.")
-             return "Fetched questions list is empty or invalid format.", None
-         print(f"Fetched {len(questions_data)} questions.")
-     except requests.exceptions.RequestException as e:
-         print(f"Error fetching questions: {e}")
-         return f"Error fetching questions: {e}", None
-     except requests.exceptions.JSONDecodeError as e:
-         print(f"Error decoding JSON response from questions endpoint: {e}")
-         print(f"Response text: {response.text[:500]}")
-         return f"Error decoding server response for questions: {e}", None
-     except Exception as e:
-         print(f"An unexpected error occurred fetching questions: {e}")
-         return f"An unexpected error occurred fetching questions: {e}", None
-
-     # 3. Run your Agent
-     results_log = []
-     answers_payload = []
-     print(f"Running agent on {len(questions_data)} questions...")
-     for item in questions_data:
-         task_id = item.get("task_id")
-         question_text = item.get("question")
-         if not task_id or question_text is None:
-             print(f"Skipping item with missing task_id or question: {item}")
-             continue
-         try:
-             submitted_answer = agent(question_text)
-             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
-             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
-         except Exception as e:
-             print(f"Error running agent on task {task_id}: {e}")
-             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
-
-     if not answers_payload:
-         print("Agent did not produce any answers to submit.")
-         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

-     # 4. Prepare Submission
-     submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
-     status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
-     print(status_update)

-     # 5. Submit
-     print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
      try:
          response = requests.post(submit_url, json=submission_data, timeout=60)
          response.raise_for_status()
@@ -109,7 +88,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
              f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
              f"Message: {result_data.get('message', 'No message received.')}"
          )
-         print("Submission successful.")
          results_df = pd.DataFrame(results_log)
          return final_status, results_df
      except requests.exceptions.HTTPError as e:
@@ -120,27 +99,125 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
          except requests.exceptions.JSONDecodeError:
              error_detail += f" Response: {e.response.text[:500]}"
          status_message = f"Submission Failed: {error_detail}"
-         print(status_message)
          results_df = pd.DataFrame(results_log)
          return status_message, results_df
      except requests.exceptions.Timeout:
          status_message = "Submission Failed: The request timed out."
-         print(status_message)
          results_df = pd.DataFrame(results_log)
          return status_message, results_df
      except requests.exceptions.RequestException as e:
          status_message = f"Submission Failed: Network error - {e}"
-         print(status_message)
          results_df = pd.DataFrame(results_log)
          return status_message, results_df
      except Exception as e:
          status_message = f"An unexpected error occurred during submission: {e}"
-         print(status_message)
          results_df = pd.DataFrame(results_log)
          return status_message, results_df


- # --- Build Gradio Interface using Blocks ---
  with gr.Blocks() as demo:
      gr.Markdown("# Basic Agent Evaluation Runner")
      gr.Markdown(
@@ -163,7 +240,6 @@ with gr.Blocks() as demo:
      run_button = gr.Button("Run Evaluation & Submit All Answers")

      status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
-     # Removed max_rows=10 from DataFrame constructor
      results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

      run_button.click(
@@ -172,25 +248,24 @@ with gr.Blocks() as demo:
      )

  if __name__ == "__main__":
-     print("\n" + "-"*30 + " App Starting " + "-"*30)
-     # Check for SPACE_HOST and SPACE_ID at startup for information
      space_host_startup = os.getenv("SPACE_HOST")
-     space_id_startup = os.getenv("SPACE_ID")  # Get SPACE_ID at startup

      if space_host_startup:
-         print(f"SPACE_HOST found: {space_host_startup}")
-         print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
      else:
-         print("ℹ️ SPACE_HOST environment variable not found (running locally?).")

-     if space_id_startup:  # Print repo URLs if SPACE_ID is found
-         print(f"SPACE_ID found: {space_id_startup}")
-         print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
-         print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
      else:
-         print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")

-     print("-"*(60 + len(" App Starting ")) + "\n")

-     print("Launching Gradio Interface for Basic Agent Evaluation...")
      demo.launch(debug=True, share=False)
 
  import os
  import gradio as gr
  import inspect
  import pandas as pd
+ import requests
+ import logging
+ import datetime
+ import json  # Added for saving submission data
+
+
+ log_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+ log_file_name = f"evaluation_run_{log_timestamp}.log"
+
+
+ logger = logging.getLogger("eval_logger")
+ logger.setLevel(logging.INFO)
+ file_handler = logging.FileHandler(log_file_name)
+ formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(module)s - %(funcName)s - %(lineno)d - %(message)s')
+ file_handler.setFormatter(formatter)
+ logger.addHandler(file_handler)
+
+
+ logger.info("Logging setup complete. Log file: %s", log_file_name)
+
+
+ from dataset_helper import fetch_all_questions, download_file  # fetch_random_question is also available if needed
+
+
+ from agent import LangGraphAgent

  # --- Constants ---
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

  # --- Basic Agent Definition ---
  # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
+ # class BasicAgent:  # Moved to agent.py
+ #     def __init__(self, api_url: str):
+ #         print("BasicAgent initialized.")
+ #         self.api_url = api_url  # Store api_url for potential use in downloading files
+ #
+ #     def __call__(self, task_id: str, question: str, file_name: str | None) -> str:
+ #         print(f"Agent received task_id: {task_id}, question (first 50 chars): {question[:50]}...")
+ #         if file_name:
+ #             print(f"Question has an associated file: {file_name}")
+ #             # Example: Download the file if needed by the agent's logic
+ #             # local_file_path = download_file(self.api_url, task_id, file_name)
+ #             # if local_file_path:
+ #             #     print(f"File {file_name} downloaded to {local_file_path}")
+ #             #     # Agent would then process this file
+ #             # else:
+ #             #     print(f"Failed to download {file_name} for task {task_id}")
+ #             #     return "Error: Could not download associated file."
+ #
+ #         # Current placeholder answer
+ #         fixed_answer = "This is a default answer from BasicAgent."
+ #         print(f"Agent returning fixed answer: {fixed_answer}")
+ #         return fixed_answer

+ def _submit_answers_to_api(submit_url: str, submission_data: dict, results_log: list, logger_instance: logging.Logger) -> tuple[str, pd.DataFrame]:
+     """Handles the submission of answers to the API and processes the response."""
      try:
+         submission_log_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+         submission_file_name = f"submission_payload_{submission_log_timestamp}.json"
+
+         # Create a 'submissions' directory if it doesn't exist
+         submissions_dir = "submissions"
+         if not os.path.exists(submissions_dir):
+             os.makedirs(submissions_dir)
+             logger_instance.info(f"Created directory: {submissions_dir}")

+         submission_file_path = os.path.join(submissions_dir, submission_file_name)

+         with open(submission_file_path, 'w') as f:
+             json.dump(submission_data, f, indent=4)
+         logger_instance.info(f"Submission payload saved to: {submission_file_path}")
+     except Exception as e:
+         logger_instance.error(f"Failed to save submission payload: {e}", exc_info=True)

+     logger_instance.info(f"Submitting {len(submission_data.get('answers', []))} answers to: {submit_url}")
      try:
          response = requests.post(submit_url, json=submission_data, timeout=60)
          response.raise_for_status()

              f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
              f"Message: {result_data.get('message', 'No message received.')}"
          )
+         logger_instance.info(f"Submission successful: {final_status}")
          results_df = pd.DataFrame(results_log)
          return final_status, results_df
      except requests.exceptions.HTTPError as e:

          except requests.exceptions.JSONDecodeError:
              error_detail += f" Response: {e.response.text[:500]}"
          status_message = f"Submission Failed: {error_detail}"
+         logger_instance.error(status_message, exc_info=True)
          results_df = pd.DataFrame(results_log)
          return status_message, results_df
      except requests.exceptions.Timeout:
          status_message = "Submission Failed: The request timed out."
+         logger_instance.error(status_message, exc_info=True)
          results_df = pd.DataFrame(results_log)
          return status_message, results_df
      except requests.exceptions.RequestException as e:
          status_message = f"Submission Failed: Network error - {e}"
+         logger_instance.error(status_message, exc_info=True)
          results_df = pd.DataFrame(results_log)
          return status_message, results_df
      except Exception as e:
          status_message = f"An unexpected error occurred during submission: {e}"
+         logger_instance.error(status_message, exc_info=True)
          results_df = pd.DataFrame(results_log)
          return status_message, results_df

+ def run_and_submit_all(profile: gr.OAuthProfile | None):
+     """
+     Fetches all questions, runs the agent on them, submits all answers,
+     and displays the results.
+     """
+     logger.info("run_and_submit_all started.")
+     space_id = os.getenv("SPACE_ID")
+
+     if profile:
+         username = f"{profile.username}"
+         logger.info(f"User logged in: {username}")
+     else:
+         logger.warning("User not logged in.")
+         return "Please Login to Hugging Face with the button.", None
+
+     api_url = DEFAULT_API_URL
+     submit_url = f"{api_url}/submit"
+
+     try:
+         logger.info("Initializing agent...")
+         global agent
+         agent = LangGraphAgent(api_url=DEFAULT_API_URL, answers_dir="answers")
+         logger.info("Agent initialized.")
+     except Exception as e:
+         logger.error(f"Error instantiating agent: {e}", exc_info=True)
+         return f"Error initializing agent: {e}", None
+
+     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
+     logger.info(f"Agent code URL: {agent_code}")
+
+     logger.info(f"Fetching questions using dataset_helper from: {api_url}")
+     questions_data = fetch_all_questions(api_url)
+
+     if questions_data is None:
+         logger.error("Failed to fetch questions via dataset_helper. questions_data is None.")
+         return "Error fetching questions. Please check the logs.", None
+
+     total_questions_fetched = len(questions_data)
+     logger.info(f"Fetched {total_questions_fetched} questions via dataset_helper.")
+     if not questions_data:
+         logger.warning("Fetched questions list is empty (0 questions).")
+         return "Fetched questions list is empty. Nothing to process.", pd.DataFrame(results_log if 'results_log' in locals() else [])
+
+     results_log = []
+     answers_payload = []
+     successful_answers_count = 0
+     answers_from_cache_count = 0
+     logger.info(f"Running agent on {total_questions_fetched} questions...")
+     for item_index, item in enumerate(questions_data):
+         task_id = item.get("task_id")
+         question_text = item.get("question")
+         file_name = item.get("file_name")
+         logger.info(f"Processing question {item_index + 1}/{total_questions_fetched}, task_id: {task_id}")
+
+         if not task_id or question_text is None:
+             logger.warning(f"Skipping item {item_index + 1} with missing task_id or question: {item}")
+             results_log.append({"Task ID": task_id if task_id else "MISSING_ID", "Question": question_text if question_text else "MISSING_QUESTION", "Associated File": file_name if file_name else "None", "Submitted Answer": "SKIPPED - Missing data", "From Cache": "N/A"})
+             continue
+         try:
+             submitted_answer_tuple = agent(task_id, question_text, file_name)  # Returns (answer, from_cache)
+             submitted_answer, from_cache = submitted_answer_tuple
+
+             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
+             results_log.append({"Task ID": task_id, "Question": question_text, "Associated File": file_name if file_name else "None", "Submitted Answer": submitted_answer, "From Cache": from_cache})
+             successful_answers_count += 1
+             if from_cache:
+                 answers_from_cache_count += 1
+                 logger.info(f"Agent successfully processed task_id: {task_id} (from cache)")
+             else:
+                 logger.info(f"Agent successfully processed task_id: {task_id} (newly generated)")
+         except Exception as e:
+             logger.error(f"Error running agent on task {task_id}: {e}", exc_info=True)
+             results_log.append({"Task ID": task_id, "Question": question_text, "Associated File": file_name if file_name else "None", "Submitted Answer": f"AGENT ERROR: {e}", "From Cache": False})
+
+     logger.info(f"Agent finished processing. Successfully generated/retrieved answers for {successful_answers_count}/{total_questions_fetched} questions. {answers_from_cache_count} answers were from cache.")
+
+     if not answers_payload:
+         logger.warning("Agent did not produce any answers to submit (all attempts might have failed or been skipped).")
+         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
+
+     submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
+
+     summary_line = f"Agent processed {total_questions_fetched} questions. Successfully generated/retrieved {successful_answers_count} answers ({answers_from_cache_count} from cache)."
+     logger.info(summary_line)
+
+     # --- TEMPORARILY BYPASS SUBMISSION FOR TESTING ---
+     # logger.warning("SUBMISSION TO API IS CURRENTLY BYPASSED FOR TESTING.")
+     # bypassed_status_message = (
+     #     f"SUBMISSION BYPASSED. {summary_line}\\n"
+     #     f"User: '{username}'. Results log is available. Submission data prepared but not sent."
+     # )
+     # results_df = pd.DataFrame(results_log)
+     # return bypassed_status_message, results_df
+     # --- END OF TEMPORARY BYPASS ---
+
+     # Call the refactored submission method, passing the global logger instance.
+     # Note: if re-enabling the bypass above, ensure summary_line is incorporated into _submit_answers_to_api or its return message.
+     return _submit_answers_to_api(submit_url, submission_data, results_log, logger)
+

  with gr.Blocks() as demo:
      gr.Markdown("# Basic Agent Evaluation Runner")
      gr.Markdown(

      run_button = gr.Button("Run Evaluation & Submit All Answers")

      status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
      results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

      run_button.click(

      )

  if __name__ == "__main__":
+     logger.info("App Starting...")
      space_host_startup = os.getenv("SPACE_HOST")
+     space_id_startup = os.getenv("SPACE_ID")

      if space_host_startup:
+         logger.info(f"SPACE_HOST found: {space_host_startup}")
+         logger.info(f" Runtime URL should be: https://{space_host_startup}.hf.space")
      else:
+         logger.info("ℹ️ SPACE_HOST environment variable not found (running locally?).")

+     if space_id_startup:
+         logger.info(f"SPACE_ID found: {space_id_startup}")
+         logger.info(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
+         logger.info(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
      else:
+         logger.info("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")

+     logger.info("-"*(60 + len(" App Starting ")) + "\n")

+     logger.info("Launching Gradio Interface for Basic Agent Evaluation...")
      demo.launch(debug=True, share=False)
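
For reference, the payload that run_and_submit_all builds and _submit_answers_to_api saves under submissions/ before POSTing it to {api_url}/submit has the shape sketched below; the concrete values here are illustrative placeholders:

# Illustrative payload shape (not part of the committed files).
submission_data = {
    "username": "hf-username",                                         # from the Gradio OAuth profile
    "agent_code": "https://huggingface.co/spaces/SPACE_ID/tree/main",  # link to the Space's code
    "answers": [
        {"task_id": "00000000-0000-0000-0000-000000000000", "submitted_answer": "42"},
    ],
}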
dataset_helper.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import shutil
4
+ import logging
5
+
6
+ logger = logging.getLogger("eval_logger")
7
+
8
+
9
+ def fetch_all_questions(api_url: str) -> list[dict] | None:
10
+ """
11
+ Fetches all questions from the API.
12
+
13
+ Args:
14
+ api_url: The base URL of the scoring API.
15
+
16
+ Returns:
17
+ A list of question dictionaries, or None if an error occurs.
18
+ """
19
+ questions_url = f"{api_url}/questions"
20
+ logger.info(f"Fetching all questions from: {questions_url}")
21
+ try:
22
+ response = requests.get(questions_url, timeout=15)
23
+ response.raise_for_status()
24
+ questions_data = response.json()
25
+ if not questions_data:
26
+ logger.warning("Fetched questions list is empty.")
27
+ return None
28
+ logger.info(f"Fetched {len(questions_data)} questions successfully.")
29
+ return questions_data
30
+ except requests.exceptions.RequestException as e:
31
+ logger.error(f"Error fetching all questions: {e}", exc_info=True)
32
+ return None
33
+ except requests.exceptions.JSONDecodeError as e:
34
+ logger.error(f"Error decoding JSON response from questions endpoint: {e}", exc_info=True)
35
+ logger.error(f"Response text: {response.text[:500] if response else 'No response'}")
36
+ return None
37
+ except Exception as e:
38
+ logger.error(f"An unexpected error occurred fetching all questions: {e}", exc_info=True)
39
+ return None
40
+
41
+ def fetch_random_question(api_url: str) -> dict | None:
42
+ """
43
+ Fetches a single random question from the API.
44
+
45
+ Args:
46
+ api_url: The base URL of the scoring API.
47
+
48
+ Returns:
49
+ A dictionary representing a single question, or None if an error occurs.
50
+ """
51
+ random_question_url = f"{api_url}/random-question"
52
+ logger.info(f"Fetching random question from: {random_question_url}")
53
+ try:
54
+ response = requests.get(random_question_url, timeout=15)
55
+ response.raise_for_status()
56
+ question_data = response.json()
57
+ if not question_data:
58
+ logger.warning("Fetched random question is empty.")
59
+ return None
60
+ logger.info(f"Fetched random question successfully: {question_data.get('task_id')}")
61
+ return question_data
62
+ except requests.exceptions.RequestException as e:
63
+ logger.error(f"Error fetching random question: {e}", exc_info=True)
64
+ return None
65
+ except requests.exceptions.JSONDecodeError as e:
66
+ logger.error(f"Error decoding JSON response from random question endpoint: {e}", exc_info=True)
67
+ logger.error(f"Response text: {response.text[:500] if response else 'No response'}")
68
+ return None
69
+ except Exception as e:
70
+ logger.error(f"An unexpected error occurred fetching random question: {e}", exc_info=True)
71
+ return None
72
+
73
+ def download_file(api_url: str, task_id: str, file_name: str, download_dir: str = "downloads") -> str | None:
74
+ """
75
+ Downloads a specific file associated with a given task ID.
76
+
77
+ Args:
78
+ api_url: The base URL of the scoring API.
79
+ task_id: The ID of the task for which to download the file.
80
+ file_name: The name of the file to be saved.
81
+ download_dir: The directory where the file should be saved. Defaults to "downloads".
82
+
83
+ Returns:
84
+ The local path to the downloaded file, or None if an error occurs.
85
+ """
86
+ if not file_name:
87
+ logger.info(f"No file_name provided for task_id {task_id}. Skipping download.")
88
+ return None
89
+
90
+ file_url = f"{api_url}/files/{task_id}"
91
+
92
+ os.makedirs(download_dir, exist_ok=True)
93
+
94
+ local_file_path = os.path.join(download_dir, file_name)
95
+
96
+ if os.path.exists(local_file_path):
97
+ logger.info(f"File already exists at {local_file_path}. Skipping download.")
98
+ return local_file_path
99
+
100
+ logger.info(f"Downloading file for task_id {task_id} from: {file_url} to {local_file_path}")
101
+ try:
102
+ with requests.get(file_url, stream=True, timeout=30) as r:
103
+ r.raise_for_status()
104
+ with open(local_file_path, 'wb') as f:
105
+ shutil.copyfileobj(r.raw, f)
106
+ logger.info(f"File downloaded successfully: {local_file_path}")
107
+ return local_file_path
108
+ except requests.exceptions.RequestException as e:
109
+ logger.error(f"Error downloading file for task_id {task_id}: {e}", exc_info=True)
110
+ if os.path.exists(local_file_path):
111
+ os.remove(local_file_path)
112
+ return None
113
+ except Exception as e:
114
+ logger.error(f"An unexpected error occurred downloading file for task_id {task_id}: {e}", exc_info=True)
115
+ if os.path.exists(local_file_path):
116
+ os.remove(local_file_path)
117
+ return None
118
+
119
+ if __name__ == '__main__':
120
+ print("--- Testing dataset_helper.py directly ---")
121
+ print("NOTE: For full logging, run through app.py. This direct test uses print statements.")
122
+
123
+ test_api_url = "https://agents-course-unit4-scoring.hf.space"
124
+
125
+ print("\n--- Testing fetch_all_questions ---")
126
+ questions = fetch_all_questions(test_api_url)
127
+ if questions:
128
+ print(f"Successfully fetched {len(questions)} questions. First question task_id: {questions[0].get('task_id')}")
129
+ else:
130
+ print("Failed to fetch all questions.")
131
+
132
+ print("\n--- Testing fetch_random_question ---")
133
+ random_q = fetch_random_question(test_api_url)
134
+ if random_q:
135
+ print(f"Successfully fetched random question: {random_q.get('question')[:50]}...")
136
+ else:
137
+ print("Failed to fetch random question.")
138
+
139
+ print("\n--- Testing download_file (example with a known task_id and file_name if available) ---")
140
+ if questions:
141
+ test_task_with_file = None
142
+ test_file_name = None
143
+ for q_item in questions:
144
+ if q_item.get("file_name"):
145
+ test_task_with_file = q_item.get("task_id")
146
+ test_file_name = q_item.get("file_name")
147
+ break
148
+
149
+ if test_task_with_file and test_file_name:
150
+ print(f"Attempting to download file for task_id: {test_task_with_file}, file_name: {test_file_name}")
151
+ downloaded_path = download_file(test_api_url, test_task_with_file, test_file_name)
152
+ if downloaded_path:
153
+ print(f"File downloaded to: {downloaded_path}")
154
+ else:
155
+ print(f"Failed to download file for task_id: {test_task_with_file}")
156
+ else:
157
+ print("No question with an associated file found in the first batch of questions to test download.")
158
+ else:
159
+ print("Skipping download_file test as fetching questions failed.")
160
+
161
+ print("\n--- Testing download_file (with a task_id that might not have a file or invalid file_name) ---")
162
+ if questions and questions[0].get("file_name") == "":
163
+ task_id_no_file = questions[0].get("task_id")
164
+ file_name_empty = questions[0].get("file_name")
165
+ print(f"Attempting to download file for task_id: {task_id_no_file} (expected to skip due to empty file_name)")
166
+ path_no_file = download_file(test_api_url, task_id_no_file, file_name_empty)
167
+ if path_no_file is None:
168
+ print("Correctly skipped download or failed as expected for task with no file_name.")
169
+ else:
170
+ print(f"Unexpectedly downloaded something to {path_no_file} for a task with no file_name.")
171
+ else:
172
+ print("Skipping test for task with no file_name (either no questions or first question has a file).")
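A minimal usage sketch of these helpers (illustrative only, not part of this commit): it assumes the same scoring endpoint as the test block above and a hypothetical driver loop that fetches every question and downloads any attachment before handing both to an agent.

    from dataset_helper import fetch_all_questions, download_file

    API_URL = "https://agents-course-unit4-scoring.hf.space"  # same endpoint as the test block above

    for q in fetch_all_questions(API_URL) or []:
        task_id = q.get("task_id")
        file_name = q.get("file_name")
        # download_file returns None when file_name is empty or the download fails
        local_path = download_file(API_URL, task_id, file_name) if file_name else None
        print(task_id, (q.get("question") or "")[:50], local_path)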
downloads/1f975693-876d-457b-a649-393859e79bf3.mp3 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:200f767e732b49efef5c05d128903ee4d2c34e66fdce7f5593ac123b2e637673
3
+ size 280868
downloads/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx ADDED
Binary file (5.29 kB).
 
downloads/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b218c951c1f888f0bbe6f46c080f57afc7c9348fffc7ba4da35749ff1e2ac40f
3
+ size 179304
downloads/cca530fc-4052-43b2-b130-b30968d8aa44.png ADDED
downloads/f918266a-b3e0-4914-865d-4faa564f1aef.py ADDED
@@ -0,0 +1,35 @@
1
+ from random import randint
2
+ import time
3
+
4
+ class UhOh(Exception):
5
+ pass
6
+
7
+ class Hmm:
8
+ def __init__(self):
9
+ self.value = randint(-100, 100)
10
+
11
+ def Yeah(self):
12
+ if self.value == 0:
13
+ return True
14
+ else:
15
+ raise UhOh()
16
+
17
+ def Okay():
18
+ while True:
19
+ yield Hmm()
20
+
21
+ def keep_trying(go, first_try=True):
22
+ maybe = next(go)
23
+ try:
24
+ if maybe.Yeah():
25
+ return maybe.value
26
+ except UhOh:
27
+ if first_try:
28
+ print("Working...")
29
+ print("Please wait patiently...")
30
+ time.sleep(0.1)
31
+ return keep_trying(go, first_try=False)
32
+
33
+ if __name__ == "__main__":
34
+ go = Okay()
35
+ print(f"{keep_trying(go)}")
requirements.txt CHANGED
@@ -1,2 +1,16 @@
1
- gradio
2
- requests
1
+ gradio[oauth]>=4.44.1
2
+ requests
3
+ pandas
4
+ langchain
5
+ langgraph
6
+ langchain_openai
7
+ langchain_core
8
+ langchain_community
9
+ duckduckgo-search
10
+ python-dotenv
11
+ assemblyai
12
+ wikipedia
13
+ openpyxl
14
+ tabulate
15
+ youtube-transcript-api
16
+ langchain-google-genai
tools.py ADDED
@@ -0,0 +1,314 @@
1
+ import logging
2
+ import os
3
+ import base64
4
+ import pandas as pd
5
+ import io
6
+ import contextlib
7
+
8
+ from langchain_core.messages import HumanMessage
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain_community.tools import DuckDuckGoSearchRun
11
+ from langchain_community.document_loaders import AssemblyAIAudioTranscriptLoader
12
+ from langchain_community.tools import WikipediaQueryRun
13
+ from langchain_community.utilities import WikipediaAPIWrapper
14
+ from langchain_core.tools import tool
15
+ from langchain_google_genai import ChatGoogleGenerativeAI
16
+
17
+ logger = logging.getLogger("eval_logger")
18
+
19
+
20
+ try:
21
+ tools_llm = ChatOpenAI(model="gpt-4o", temperature=0)
22
+ except Exception as e:
23
+ logger.error(f"Failed to initialize tools_llm (OpenAI gpt-4o) in tools.py: {e}. Ensure OPENAI_API_KEY is set.", exc_info=True)
24
+ tools_llm = None
25
+
26
+
27
+ GEMINI_SHARED_MODEL_NAME = "gemini-2.5-pro-preview-05-06"
28
+ try:
29
+ gemini_llm = ChatGoogleGenerativeAI(
30
+ model=GEMINI_SHARED_MODEL_NAME,
31
+ temperature=0,
32
+ timeout=360 # 6-minute timeout
33
+ )
34
+ logger.info(f"Successfully initialized shared Gemini model: {GEMINI_SHARED_MODEL_NAME} with a 360s timeout.")
35
+ except Exception as e:
36
+ logger.error(f"Failed to initialize shared_gemini_llm in tools.py (model: {GEMINI_SHARED_MODEL_NAME}): {e}. Ensure GOOGLE_API_KEY is set and valid, and the model name is correct/available.", exc_info=True)
37
+ gemini_llm = None
38
+
39
+
46
+ @tool
47
+ def analyse_image(img_path: str, question: str) -> str:
48
+ """
49
+ Analyses a **locally stored** image file to answer a specific question using a multimodal model.
50
+ IMPORTANT: This tool expects a local file path for 'img_path' and cannot process web URLs directly.
51
+ Args:
52
+ img_path: Local path to the image file (e.g., /path/to/your/image.png).
53
+ question: The question the user is trying to answer by analysing this image.
54
+ Returns:
55
+ A string containing the relevant information extracted from the image to answer the question,
56
+ or an error message if analysis fails.
57
+ """
58
+ if not tools_llm:
59
+ return "Error: Vision LLM (gpt-4o) not initialized in tools.py. Cannot analyse image."
60
+ if not os.path.exists(img_path):
61
+ # This check is more critical now that we emphasize local paths.
62
+ return f"Error: Image file not found at local path: {img_path}. This tool requires a local file path."
63
+
64
+ logger.info(f"Attempting to analyse image: {img_path} for question: '{question}'")
65
+ try:
66
+ with open(img_path, "rb") as image_file:
67
+ image_bytes = image_file.read()
68
+ image_base64 = base64.b64encode(image_bytes).decode("utf-8")
69
+
70
+ image_type = os.path.splitext(img_path)[1].lower()
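+ # image_type keeps its leading dot here (e.g. '.png'); the dot is stripped below when building the data: URL MIME type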
71
+ if image_type == '.jpg':
72
+ image_type = '.jpeg'
73
+ if image_type not in ['.png', '.jpeg', '.gif', '.webp']:
74
+ return f"Error: Unsupported image type '{image_type}' for gpt-4o vision. Supported: PNG, JPEG, GIF, WEBP."
75
+
76
+ prompt_text = f"Analyse this image to answer the following question: '{question}'. Focus on extracting only the information directly relevant to this question. Return only the extracted information, with no additional explanations or commentary."
77
+ message = HumanMessage(
78
+ content=[
79
+ {"type": "text", "text": prompt_text},
80
+ {"type": "image_url", "image_url": {"url": f"data:image/{image_type[1:]};base64,{image_base64}"}},
81
+ ]
82
+ )
83
+ response = tools_llm.invoke([message])
84
+ extracted_text = response.content
85
+ logger.info(f"Successfully analysed {img_path} for question '{question}'. Response length: {len(extracted_text)}")
86
+ return extracted_text.strip()
87
+ except Exception as e:
88
+ logger.error(f"Error analysing image {img_path} for question '{question}': {e}", exc_info=True)
89
+ return f"Error during image analysis for question '{question}': {str(e)}"
90
+
91
+ @tool
92
+ def analyse_audio(audio_path: str, question: str) -> str:
93
+ """
94
+ Transcribes a **locally stored** audio file using AssemblyAI and then analyses the transcript
95
+ with a multimodal model (gpt-4o) to answer a specific question.
96
+ IMPORTANT: This tool expects a local file path for 'audio_path' (e.g., /path/to/your/audio.mp3)
97
+ and **cannot process web URLs (like YouTube links) directly.**
98
+ Args:
99
+ audio_path: Local path to the audio file (e.g., /path/to/your/audio.mp3).
100
+ question: The question the user is trying to answer by analysing this audio.
101
+ Returns:
102
+ A string containing the relevant information extracted from the audio to answer the question,
103
+ or an error message if analysis fails.
104
+ """
105
+ logger.info(f"Attempting to analyse audio from local path: {audio_path} for question: '{question}'")
106
+ if not tools_llm: # tools_llm (gpt-4o) handles the Q&A step on the transcript
107
+ return "Error: LLM (gpt-4o) for Q&A not initialized in tools.py. Cannot analyse audio transcript."
108
+ if not audio_path:
109
+ return "Error: Audio file path not provided."
110
+ if not os.path.exists(audio_path):
111
+ return f"Error: Audio file not found at local path: {audio_path}. This tool requires a local file path."
112
+
113
+ try:
114
+ logger.info(f"Loading/transcribing audio from local file: {audio_path} using AssemblyAI.")
115
+ loader = AssemblyAIAudioTranscriptLoader(file_path=audio_path) # AssemblyAI loader primarily works with local paths for reliability.
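+ # No api_key is passed explicitly, so this relies on ASSEMBLYAI_API_KEY being set in the environment (see the error handling below)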
116
+ docs = loader.load()
117
+
118
+ if not docs or not docs[0].page_content:
119
+ logger.error(f"AssemblyAI transcription failed or returned empty for {audio_path}.")
120
+ return f"Error: Transcription failed or returned empty content for {audio_path}."
121
+
122
+ transcript = docs[0].page_content
123
+ logger.info(f"Successfully transcribed audio from {audio_path}. Transcript length: {len(transcript)}")
124
+
125
+ qa_prompt_text = (
126
+ f"The following is a transcript of an audio file: \n\nTranscript:\n{transcript}\n\n---\n\n"
127
+ f"Based SOLELY on the information in the transcript above, answer the following question: '{question}'. "
128
+ f"Provide only the direct answer as extracted or inferred from the transcript, with no additional commentary."
129
+ )
130
+
131
+ message = HumanMessage(content=qa_prompt_text)
132
+ response = tools_llm.invoke([message])
133
+ answer = response.content
134
+ logger.info(f"Successfully analysed transcript from {audio_path} for question '{question}'. Answer length: {len(answer)}")
135
+ return answer.strip()
136
+
137
+ except Exception as e:
138
+ logger.error(f"Error analysing audio {audio_path} for question '{question}': {e}", exc_info=True)
139
+ if "api key" in str(e).lower() or "authenticate" in str(e).lower():
140
+ return f"Error during audio analysis: AssemblyAI authentication failed. Please check your ASSEMBLYAI_API_KEY. Original error: {str(e)}"
141
+ return f"Error during audio analysis for question '{question}': {str(e)}"
142
+
143
+ @tool
144
+ def execute_python_code_from_file(file_path: str, question: str) -> str:
145
+ """
146
+ Reads the content of a **locally stored** Python file and uses a powerful LLM (gpt-4o)
147
+ to answer a specific question about the Python code (e.g., its output, functionality, or errors).
148
+ IMPORTANT: This tool expects a local file path for 'file_path' and cannot process web URLs directly.
149
+ It does NOT actually execute the code, but rather analyses it textually.
150
+ Args:
151
+ file_path: Local path to the Python file (e.g., /path/to/your/script.py).
152
+ question: The question the user is trying to answer about this Python code.
153
+ Returns:
154
+ A string containing the LLM's analysis or answer about the Python code, or an error message.
155
+ """
156
+ logger.info(f"Attempting to analyse Python file: {file_path} for question: '{question}'")
157
+ if not tools_llm: # tools_llm (gpt-4o) performs the code analysis
158
+ return "Error: LLM (gpt-4o) for code analysis not initialized in tools.py."
159
+ if not file_path:
160
+ return "Error: Python file path not provided."
161
+ if not os.path.exists(file_path):
162
+ return f"Error: Python file not found at local path: {file_path}. This tool requires a local file path."
163
+ if not file_path.lower().endswith('.py'):
164
+ return f"Error: File at {file_path} is not a Python (.py) file."
165
+
166
+ try:
167
+ with open(file_path, 'r', encoding='utf-8') as f:
168
+ python_code_content = f.read()
169
+
170
+ logger.info(f"Successfully read Python file {file_path}. Content length: {len(python_code_content)}")
171
+
172
+ analysis_prompt_text = (
173
+ f"The following is the content of a Python file: \n\nPython Code:\n```python\n{python_code_content}\n```\n\n---\n\n"
174
+ f"Based SOLELY on the Python code provided above, answer the following question: '{question}'. "
175
+ f"If the question asks for the output, predict the output. If it asks about functionality, describe it. "
176
+ f"Provide only the direct answer or analysis, with no additional commentary or explanations unless the question asks for it."
177
+ )
178
+
179
+ message = HumanMessage(content=analysis_prompt_text)
180
+ response = tools_llm.invoke([message]) # tools_llm (gpt-4o) performs this analysis
181
+ answer = response.content
182
+ logger.info(f"Successfully analysed Python code from {file_path} for question '{question}'. Answer length: {len(answer)}")
183
+ return answer.strip()
184
+
185
+ except Exception as e:
186
+ logger.error(f"Error analysing Python file {file_path} for question '{question}': {e}", exc_info=True)
187
+ return f"Error during Python file analysis for question '{question}': {str(e)}"
188
+
189
+ @tool
190
+ def execute_pandas_script_for_excel(excel_file_path: str, python_code: str) -> str:
191
+ """
192
+ Executes a given Python script (which should use pandas) to perform analysis on an Excel file.
193
+ The script MUST load the Excel file using the provided 'excel_file_path' variable.
194
+ The script MUST print its final answer to standard output. The print output will be returned as the result.
195
+ This tool is for calculations, data manipulation, and specific lookups within the Excel file.
196
+
197
+ Args:
198
+ excel_file_path: The path to the Excel file that the script will process.
199
+ python_code: A string containing the Python script to execute.
200
+ Example:
201
+ '''
202
+ import pandas as pd
203
+ df = pd.read_excel(excel_file_path, sheet_name=0)
204
+ # Perform analysis ...
205
+ final_answer = df["SomeColumn"].sum() # Example operation
206
+ print(final_answer)
207
+ '''
208
+ Returns:
209
+ The standard output from the executed script (which should be the answer), or an error message if execution fails.
210
+ """
211
+ logger.info(f"Attempting to execute pandas script for Excel file: {excel_file_path}")
212
+ logger.debug(f"Python code to execute:\n{python_code}")
213
+
214
+ if not os.path.exists(excel_file_path):
215
+ return f"Error: Excel file not found at {excel_file_path}"
216
+
217
+ # Prepare the execution namespace for exec, exposing pandas and the Excel file path to the script
218
+ local_namespace = {
219
+ "pd": pd,
220
+ "excel_file_path": excel_file_path,
221
+ "__builtins__": __builtins__ # Ensure basic builtins are available
222
+ }
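+ # Note: the script receives the full __builtins__, i.e. it runs unsandboxed; only code generated by the agent itself should be passed in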
223
+
224
+ # Capture stdout
225
+ stdout_capture = io.StringIO()
226
+ try:
227
+ with contextlib.redirect_stdout(stdout_capture):
228
+ exec(python_code, local_namespace) # pass one namespace as globals so pd and excel_file_path remain visible inside any functions the script defines
229
+ output = stdout_capture.getvalue().strip()
230
+ logger.info(f"Successfully executed pandas script. Output: '{output}'")
231
+ if not output: # If the script printed nothing, it might indicate an issue or missing print().
232
+ return "Script executed successfully but produced no output. Ensure the script prints the final answer."
233
+ return output
234
+ except Exception as e:
235
+ logger.error(f"Error executing pandas script: {e}", exc_info=True)
236
+ # Provide a more detailed error message back to the LLM
237
+ import traceback
238
+ tb_str = traceback.format_exc()
239
+ return f"Error during script execution: {str(e)}\nTraceback:\n{tb_str}"
240
+
241
+ @tool
242
+ def analyse_youtube(youtube_url: str, question: str) -> str:
243
+ """
244
+ Analyzes a YouTube video to answer a specific question.
245
+ This tool is intended for questions that require understanding the visual content of the video.
246
+ It embeds the YouTube URL in a text prompt for the shared Gemini model, so the answer depends on that model being able to access the video at the URL.
247
+
248
+ Args:
249
+ youtube_url: The full URL of the YouTube video (e.g., https://www.youtube.com/watch?v=...).
250
+ question: The question to answer based on the video's content.
251
+ Returns:
252
+ A string containing the answer from the shared Gemini model, or an error message if analysis fails.
253
+ """
254
+ logger.info(f"Attempting to analyse YouTube video: {youtube_url} with shared Gemini model ({GEMINI_SHARED_MODEL_NAME}) for question: '{question}'")
255
+ if not gemini_llm:
256
+ return f"Error: Shared Gemini LLM ({GEMINI_SHARED_MODEL_NAME}) not initialized in tools.py. Cannot analyse YouTube video."
257
+
258
+ try:
259
+
260
+ prompt = f"Video URL: {youtube_url}\n\nQuestion: {question}\n\nBased on the video at the URL, please provide the answer."
261
+ message = HumanMessage(content=prompt)
262
+
263
+ response = gemini_llm.invoke([message])
264
+ answer = response.content
265
+ logger.info(f"Successfully analysed YouTube video {youtube_url} with shared Gemini. Answer: {answer[:200]}...")
266
+ return answer.strip()
267
+ except Exception as e:
268
+ logger.error(f"Error analysing YouTube video {youtube_url} with shared Gemini ({GEMINI_SHARED_MODEL_NAME}): {e}", exc_info=True)
269
+ return f"Error during YouTube video analysis with shared Gemini: {str(e)}"
270
+
271
+ @tool
272
+ def deep_analysis_with_gemini(question: str) -> str:
273
+ """
274
+ Performs a deep analysis of a complex question using a powerful shared Gemini model.
275
+ Use this tool for questions that are multifaceted, require deep reasoning,
276
+ or for historical queries where standard search tools might be insufficient after initial attempts.
277
+ This tool directly passes the question to a shared Gemini model for a comprehensive answer.
278
+
279
+ Args:
280
+ question: The complex question to be analyzed.
281
+ Returns:
282
+ A string containing the detailed answer from the shared Gemini model, or an error message.
283
+ """
284
+ logger.info(f"Attempting deep analysis with shared Gemini model ({GEMINI_SHARED_MODEL_NAME}) for question: '{question}'")
285
+ if not gemini_llm:
286
+ return f"Error: Shared Gemini LLM ({GEMINI_SHARED_MODEL_NAME}) not initialized in tools.py."
287
+
288
+ try:
289
+ message = HumanMessage(content=question)
290
+ response = gemini_llm.invoke([message])
291
+ answer = response.content
292
+ logger.info(f"Successfully performed deep analysis with shared Gemini. Answer length: {len(answer)}")
293
+ return answer.strip()
294
+ except Exception as e:
295
+ logger.error(f"Error during deep analysis with shared Gemini ({GEMINI_SHARED_MODEL_NAME}): {e}", exc_info=True)
296
+ return f"Error during deep analysis with shared Gemini: {str(e)}"
297
+
298
+ # Initialize other tools
299
+ search_tool = DuckDuckGoSearchRun()
300
+
301
+ wikipedia_tool = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
302
+
303
+ TOOLS = [
304
+ analyse_image,
305
+ search_tool,
306
+ analyse_audio,
307
+ execute_python_code_from_file,
308
+ wikipedia_tool,
309
+ execute_pandas_script_for_excel,
310
+ analyse_youtube,
311
+ deep_analysis_with_gemini,
312
+ ]
313
+
314
+ logger.info(f"Tools initialized in tools.py: {[tool.name if hasattr(tool, 'name') else tool.__name__ for tool in TOOLS]}")
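A minimal sketch of how the exported TOOLS list could be wired into an agent (illustrative assumptions: a gpt-4o chat model and LangGraph's prebuilt ReAct helper; the project's actual agent graph may be wired differently):

    from langchain_openai import ChatOpenAI
    from langgraph.prebuilt import create_react_agent

    from tools import TOOLS

    llm = ChatOpenAI(model="gpt-4o", temperature=0)
    agent = create_react_agent(llm, TOOLS)  # bind the tool belt to the model

    result = agent.invoke({"messages": [("user", "Summarise the attached spreadsheet.")]})
    print(result["messages"][-1].content)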