Initial commit
- app.py +229 -31
- requirements.txt +17 -1
- tools.py +461 -0
app.py
CHANGED
@@ -1,34 +1,201 @@
 import os
 import gradio as gr
+import litellm
 import requests
 import inspect
 import pandas as pd
 
+from doctest import debug
+from dotenv import load_dotenv
+from smolagents import (
+    CodeAgent,
+    # HfApiModel,
+    LiteLLMModel,
+    # OpenAIServerModel,
+    Tool,
+    FinalAnswerTool,
+)
+
+from tools import (
+    DuckDuckGoSearchTool,
+    FileDownloaderTool,
+    HtmlTableExtractorTool,
+    ImagesAnalyzerTool,
+    LoadTextFileTool,
+    LoadXlsxFileTool,
+    RelevantInfoRetrieverTool,
+    ReverseStringTool,
+    # SpeechToTextTool,
+    VideoAnalyzerTool,
+    VisitWebpageTool,
+    WebpageTablesContextRetrieverTool,
+    # YoutubeTranscriptTool,
+    WikipediaSearchTool,
+    YoutubeVideoDownloaderTool,
+)
+
+load_dotenv()
+
+
+HF_TOKEN = os.getenv("HF_U1ACAPP_TOKEN")
+
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
+
+LLM_API_BASE = os.getenv("LLM_API_BASE")
+LLM_API_KEY = os.getenv("LLM_API_KEY")
+LLM_MODEL_ID = os.getenv("LLM_MODEL_ID")
+
+# Tools to use
+reverse_string_tool = ReverseStringTool()
+# speech_to_text_tool = SpeechToTextTool()
+transcriber_tool = Tool.from_space(
+    space_id="hf-audio/whisper-large-v3-turbo",
+    name="transcriber",
+    description="Transcribe an audio file or youtube video either from path or from url",
+)
+
+
+wikipedia_search_tool = WikipediaSearchTool()
+web_search_tool = DuckDuckGoSearchTool()
+visit_webpage_tool = VisitWebpageTool()
+relevant_info_tool = RelevantInfoRetrieverTool()
+youtube_video_downloader_tool = YoutubeVideoDownloaderTool()
+video_analyzer_tool = VideoAnalyzerTool()
+images_analyzer_tool = ImagesAnalyzerTool()
+file_downloader_tool = FileDownloaderTool()
+load_xls_file_tool = LoadXlsxFileTool()
+load_text_file_tool = LoadTextFileTool()
+webpage_tables_context_retriever_tool = WebpageTablesContextRetrieverTool()
+html_table_extractor_tool = HtmlTableExtractorTool()
+
+transcriber_tool.device = "cpu"
+
+final_answer_tool = FinalAnswerTool()
+final_answer_tool.description = """Returns the final answer that adheres strictly to the following guidelines:
+- Includes ONLY explicitly requested content in the exact format specified
+- Never includes:
+    * Explanations, reasoning blocks, or step-by-step working
+    * Measurements, units, or abbreviations unless required by the task
+    * Any content not specified in the task
+- Matches requested formats precisely (e.g., CSV lists as "a, b, c")
+- Preserves all specified delimiters, brackets, or structures when requested
+- No Markdown, code blocks, or rich formatting unless explicitly asked
+- In comma separated lists makes sure that there is a space character after each comma
+- Provides ONLY the final output with:
+    * No introductory text
+    * No closing remarks
+    * No supplemental information
+"""
+
+
 # --- Basic Agent Definition ---
 # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
+
+        # model = OpenAIServerModel(
+        #     model_id="qwen/qwen2.5-vl-7b",
+        #     api_base="http://localhost:1234/v1",
+        #     api_key="not-needed",
+        #     max_tokens=8192,
+        # )
+
+        model = LiteLLMModel(
+            model_id=LLM_MODEL_ID,
+            api_base=LLM_API_BASE,
+            api_key=LLM_API_KEY,
+            num_ctx=8192,
+            # flatten_messages_as_text=False,
+        )
+
+        # model = HfApiModel(
+        #     max_tokens=4096,
+        #     temperature=0.5,
+        #     provider="novita",
+        #     model_id="Qwen/Qwen3-32B",
+        #     custom_role_conversions=None,
+        #     token=HF_TOKEN,
+        # )
+
+        self.agent = CodeAgent(
+            tools=[
+                file_downloader_tool,
+                reverse_string_tool,
+                wikipedia_search_tool,
+                # youtube_transcript_tool,
+                web_search_tool,
+                visit_webpage_tool,
+                youtube_video_downloader_tool,
+                transcriber_tool,
+                video_analyzer_tool,
+                images_analyzer_tool,
+                webpage_tables_context_retriever_tool,
+                html_table_extractor_tool,
+                load_xls_file_tool,
+                load_text_file_tool,
+                final_answer_tool,
+                # relevant_info_tool,
+            ],
+            model=model,
+            # executor_type="e2b",
+            additional_authorized_imports=[
+                "bs4",
+                "datetime",
+                "json",
+                "numpy",
+                "pandas",
+                "requests",
+                "lxml",
+                # "youtube_dl",
+            ],
+            add_base_tools=True,  # Add any additional base tools
+            planning_interval=3,  # Enable planning every 3 steps
+            # max_steps=12,
+        )
+
+    def __call__(
+        self, question: str, task_id: str = None, attached_file: bool = False
+    ) -> str:
+        """Calling the agent
+        :param question: the initial query
+        :type question: str
+        :param task_id: Required if attached_file is True; used to retrieve the file, defaults to None
+        :type task_id: str, optional
+        :param attached_file: If True, file content for task_id is appended to the question, defaults to False
+        :type attached_file: bool, optional
+        :raises ValueError: If attached_file is True but task_id is not provided.
+        :return: the agent's answer
+        :rtype: str
+        """
+
         print(f"Agent received question (first 50 chars): {question[:50]}...")
+        if attached_file and not task_id:
+            raise ValueError("task_id must be provided when attached_file is True")
+
+        additional_args = None
+
+        if attached_file:
+            file_url = f"{DEFAULT_API_URL}/files/{task_id}"
+            additional_args = {"file_url": file_url}
+
+        agent_answer = self.agent.run(question, additional_args=additional_args)
+        return agent_answer
+
 
-def run_and_submit_all( profile: gr.OAuthProfile | None):
+def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
     and displays the results.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
-    space_id = os.getenv("SPACE_ID")
+    space_id = os.getenv("SPACE_ID")  # Get the SPACE_ID for sending link to the code
 
     if profile:
-        username= f"{profile.username}"
+        username = f"{profile.username}"
         print(f"User logged in: {username}")
     else:
         print("User not logged in.")
@@ -55,16 +222,16 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         response.raise_for_status()
         questions_data = response.json()
         if not questions_data:
+            print("Fetched questions list is empty.")
+            return "Fetched questions list is empty or invalid format.", None
         print(f"Fetched {len(questions_data)} questions.")
     except requests.exceptions.RequestException as e:
         print(f"Error fetching questions: {e}")
         return f"Error fetching questions: {e}", None
     except requests.exceptions.JSONDecodeError as e:
+        print(f"Error decoding JSON response from questions endpoint: {e}")
+        print(f"Response text: {response.text[:500]}")
+        return f"Error decoding server response for questions: {e}", None
     except Exception as e:
         print(f"An unexpected error occurred fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
@@ -76,26 +243,54 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
     for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
+
        if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
+            file_attached = item.get("file_name", "") != ""
+            submitted_answer = agent(question_text, task_id, file_attached)
+            answers_payload.append(
+                {"task_id": task_id, "submitted_answer": submitted_answer}
+            )
+            results_log.append(
+                {
+                    "Task ID": task_id,
+                    "Question": question_text,
+                    "Submitted Answer": submitted_answer,
+                }
+            )
         except Exception as e:
+            print(f"Error running agent on task {task_id}: {e}")
+            results_log.append(
+                {
+                    "Task ID": task_id,
+                    "Question": question_text,
+                    "Submitted Answer": f"AGENT ERROR: {e}",
+                }
+            )
 
     if not answers_payload:
         print("Agent did not produce any answers to submit.")
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
 
-    # 4. Prepare Submission
-    submission_data = {
+    # 4. Prepare Submission
+    submission_data = {
+        "username": username.strip(),
+        "agent_code": agent_code,
+        "answers": answers_payload,
+    }
     status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
     print(status_update)
 
+    try:
+        import json
+
+        with open("answers.json", "w", encoding="utf-8") as ans_fp:
+            json.dump(answers_payload, ans_fp)
+    except Exception as e:
+        print(f"Could not save answers to a file: {e}.")
+
     # 5. Submit
     print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
     try:
@@ -162,20 +357,19 @@ with gr.Blocks() as demo:
 
     run_button = gr.Button("Run Evaluation & Submit All Answers")
 
-    status_output = gr.Textbox(
+    status_output = gr.Textbox(
+        label="Run Status / Submission Result", lines=5, interactive=False
+    )
     # Removed max_rows=10 from DataFrame constructor
     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
 
-    run_button.click(
-        fn=run_and_submit_all,
-        outputs=[status_output, results_table]
-    )
+    run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
 
 if __name__ == "__main__":
-    print("\n" + "-"*30 + " App Starting " + "-"*30)
+    print("\n" + "-" * 30 + " App Starting " + "-" * 30)
     # Check for SPACE_HOST and SPACE_ID at startup for information
     space_host_startup = os.getenv("SPACE_HOST")
-    space_id_startup = os.getenv("SPACE_ID")
+    space_id_startup = os.getenv("SPACE_ID")  # Get SPACE_ID at startup
 
     if space_host_startup:
         print(f"✅ SPACE_HOST found: {space_host_startup}")
@@ -183,14 +377,18 @@ if __name__ == "__main__":
     else:
         print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
 
-    if space_id_startup:
+    if space_id_startup:  # Print repo URLs if SPACE_ID is found
         print(f"✅ SPACE_ID found: {space_id_startup}")
         print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
-        print(
+        print(
+            f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main"
+        )
     else:
-        print(
+        print(
+            "ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined."
+        )
 
-    print("-"*(60 + len(" App Starting ")) + "\n")
+    print("-" * (60 + len(" App Starting ")) + "\n")
 
     print("Launching Gradio Interface for Basic Agent Evaluation...")
-    demo.launch(debug=True, share=False)
+    demo.launch(debug=True, share=False)
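For orientation: the grading loop above calls the agent once per task as agent(question_text, task_id, file_attached). A minimal local sketch of the same calling convention (assuming the LLM_* environment variables are set, e.g. via a .env file; the questions and task id below are hypothetical):

# Local smoke test for BasicAgent -- a sketch, not part of the Space.
# Importing app also instantiates all module-level tools, so the first run is slow.
from app import BasicAgent

agent = BasicAgent()

# Question with no attached file:
print(agent("What is the capital of France?"))

# Question with an attached file; "some-task-id" is a hypothetical id.
# BasicAgent builds f"{DEFAULT_API_URL}/files/{task_id}" and passes it to
# CodeAgent.run() via additional_args.
print(
    agent(
        "Summarize the attached spreadsheet.",
        task_id="some-task-id",
        attached_file=True,
    )
)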
requirements.txt
CHANGED
@@ -1,2 +1,18 @@
+bs4
 gradio
+gradio[oauth]
+python-dotenv
+requests
+smolagents
+smolagents[litellm, toolkit, transformers, e2b]
+openpyxl
+opencv-python
+protobuf
+sentencepiece
+soundfile
+torch
+transformers
+youtube-transcript-api
+yt-dlp
+langchain-community
+wikipedia-api
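To reproduce the environment: pip install -r requirements.txt. Note that smolagents is listed twice, once bare and once as smolagents[litellm, toolkit, transformers, e2b]; the extras form alone is sufficient, since an extras install also pulls in the base package.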
tools.py
ADDED
@@ -0,0 +1,461 @@
import os
import tempfile

from typing import Dict, List, Optional

from bs4 import BeautifulSoup
import yt_dlp
import pandas as pd
import requests
import torch

from langchain_community.document_loaders import YoutubeLoader
from langchain_community.retrievers import BM25Retriever
from langchain_community.tools import BearlyInterpreterTool
from langchain.docstore.document import Document
from smolagents import (
    DuckDuckGoSearchTool,
    SpeechToTextTool,
    Tool,
    VisitWebpageTool,
    WikipediaSearchTool,
)
from transformers import AutoProcessor, AutoModelForImageTextToText


class RelevantInfoRetrieverTool(Tool):
    name = "relevant_info_retriever"
    description = "Retrieves information relevant to the query."
    inputs = {
        "query": {
            "type": "string",
            "description": "The query for which to retrieve information.",
        },
        "docs": {
            "type": "string",
            "description": "The source documents from which to choose in order to retrieve relevant information",
        },
    }
    output_type = "string"

    def forward(self, query: str, docs: List[Document]):
        self.retriever = BM25Retriever.from_documents(docs)
        results = self.retriever.get_relevant_documents(query)
        if results:
            return "\n\n".join([doc.page_content for doc in results])
        else:
            return "No relevant information found."


class YoutubeTranscriptTool(Tool):
    name = "youtube_transcript"
    description = "Fetches youtube video's transcript."
    inputs = {
        "youtube_url": {
            "type": "string",
            "description": "The youtube video url",
        },
        "source_langs": {
            "type": "array",
            "description": "A list of language codes in a descending priority for the video transcript.",
            "items": {"type": "string"},
            "default": ["en"],
            "required": False,
            "nullable": True,
        },
        "target_lang": {
            "type": "string",
            "description": "The language to which the transcript will be translated.",
            "default": "en",
            "required": False,
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(
        self,
        youtube_url: str,
        source_langs: Optional[List[str]] = ["en"],
        target_lang: Optional[str] = "en",
    ):
        try:
            loader = YoutubeLoader.from_youtube_url(
                youtube_url,
                add_video_info=True,
                language=source_langs,
                translation=target_lang,
                # transcript_format=TranscriptFormat.CHUNKS,
                # chunk_size_seconds=30,
            )
            transcript_docs = loader.load()
            return transcript_docs

        except Exception as e:
            return f"Error fetching video's transcript: {e}"


class ReverseStringTool(Tool):
    name = "reverse_string"
    description = "Reverses the input string."
    inputs = {
        "string": {
            "type": "string",
            "description": "The string that needs to be reversed.",
        }
    }
    output_type = "string"

    def forward(self, string: str):
        try:
            return string[-1::-1]
        except Exception as e:
            return f"Error reversing string: {e}"


class SmolVLM2:
    """The parent class for visual analyzer tools (using SmolVLM2-500M-Video model)"""

    def __init__(self):
        """Initializations for the analyzer tool"""
        model_path = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
        device = "cpu"  # "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = AutoProcessor.from_pretrained(model_path)
        self.model = AutoModelForImageTextToText.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            # _attn_implementation="flash_attention_2",
        ).to(device)


class ImagesAnalyzerTool(Tool, SmolVLM2):
    name = "image_analyzer"
    description = "Analyzes each input image according to the query"
    inputs = {
        "query": {
            "type": "string",
            "description": "The query according to which the image will be analyzed.",
        },
        "images_urls": {
            "type": "array",
            "description": "A list of strings containing the images' urls",
            "items": {"type": "string"},
        },
    }
    output_type = "string"

    def __init__(self):
        Tool.__init__(self)
        SmolVLM2.__init__(self)

    def forward(self, query: str, images_urls: List[str]):

        try:

            # Image message entities for the different images' urls
            image_message_ents = [{"type": "image", "url": iu} for iu in images_urls]

            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": query,
                        },
                    ]
                    + image_message_ents,
                },
            ]

            inputs = self.processor.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
            ).to(self.model.device, dtype=torch.bfloat16)

            generated_ids = self.model.generate(
                **inputs, do_sample=False, max_new_tokens=64
            )
            generated_texts = self.processor.batch_decode(
                generated_ids,
                skip_special_tokens=True,
            )
            return generated_texts[0]
        except Exception as e:
            return f"Error analyzing image(s): {e}"


class VideoAnalyzerTool(Tool, SmolVLM2):
    name = "video_analyzer"
    description = "Analyzes video at a specified path according to the query"
    inputs = {
        "query": {
            "type": "string",
            "description": "The query according to which the video will be analyzed.",
        },
        "video_path": {
            "type": "string",
            "description": "A string containing the video path",
        },
    }
    output_type = "string"

    def __init__(self):
        Tool.__init__(self)
        SmolVLM2.__init__(self)

    def forward(self, query: str, video_path: str) -> str:
        try:
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "video", "path": video_path},
                        {"type": "text", "text": query},
                    ],
                },
            ]

            inputs = self.processor.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
            ).to(self.model.device, dtype=torch.bfloat16)

            generated_ids = self.model.generate(
                **inputs, do_sample=False, max_new_tokens=64
            )
            generated_texts = self.processor.batch_decode(
                generated_ids,
                skip_special_tokens=True,
            )

            return generated_texts[0]
        except Exception as e:
            return f"Error analyzing video: {e}"
        finally:
            # Cleanup if needed
            if video_path and os.path.exists(video_path):
                os.remove(video_path)


class FileDownloaderTool(Tool):
    name = "file_downloader"
    description = "Downloads a file returning the name of the temporarily saved file"
    inputs = {
        "file_url": {
            "type": "string",
            "description": "The url from which the file shall be downloaded.",
        },
    }
    output_type = "string"

    def forward(self, file_url: str) -> str:
        response = requests.get(file_url, stream=True)
        response.raise_for_status()
        original_filename = (
            response.headers.get("content-disposition", "")
            .split("=", -1)[-1]
            .strip('"')
        )

        # Even if original_filename is empty or there is no extension, ext will be ""
        ext = os.path.splitext(original_filename)[-1]

        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp_file:
            for chunk in response.iter_content(chunk_size=8192):
                tmp_file.write(chunk)
            return tmp_file.name


class YoutubeVideoDownloaderTool(Tool):
    name = "youtube_video_downloader"
    description = "Downloads the video from the specified url and returns the path where the video was saved"
    inputs = {
        "video_url": {
            "type": "string",
            "description": "A string containing the video url",
        },
    }
    output_type = "string"

    def forward(self, video_url: str) -> str:
        try:
            saved_video_path = ""
            temp_dir = tempfile.gettempdir()
            ydl_opts = {
                "outtmpl": f"{temp_dir}/%(title)s.%(ext)s",  # Absolute or relative path
                "quiet": True,
            }

            # Download youtube video as a file in tmp directory
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(video_url, download=True)
                saved_video_path = ydl.prepare_filename(info)
            return saved_video_path
        except Exception as e:
            return f"Error downloading video: {e}"


class LoadXlsxFileTool(Tool):
    name = "load_xlsx_file"
    description = "This tool loads xlsx file into pandas and returns it"
    inputs = {"file_path": {"type": "string", "description": "File path"}}
    output_type = "object"

    def forward(self, file_path: str) -> object:
        return pd.read_excel(file_path)


class LoadTextFileTool(Tool):
    name = "load_text_file"
    description = "This tool loads any text file"
    inputs = {"file_path": {"type": "string", "description": "File path"}}
    output_type = "string"

    def forward(self, file_path: str) -> str:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()


class WebpageTablesContextRetrieverTool(Tool):
    name = "webpage_tables_context_retriever"
    description = """Retrieves structural context for all tables on a webpage.
    Returns table indexes with captions, headers, and surrounding text to help identify relevant tables.
    Use this first to determine which table index to extract."""
    inputs = {
        "url": {"type": "string", "description": "The URL of the webpage to analyze"}
    }
    output_type = "object"

    def forward(self, url: str) -> Dict:
        """Retrieve context information for all tables on the page"""
        try:
            response = requests.get(url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            tables = soup.find_all("table")
            if not tables:
                return {
                    "status": "success",
                    "tables": [],
                    "message": "No tables found on page",
                    "url": url,
                }

            results = []
            for i, table in enumerate(tables):
                context = {
                    "index": i,
                    "id": table.get("id", ""),
                    "class": " ".join(table.get("class", [])),
                    "summary": table.get("summary", ""),
                    "caption": self._get_table_caption(table),
                    "preceding_header": self._get_preceding_header(table),
                    "surrounding_text": self._get_surrounding_text(table),
                }
                results.append(context)

            return {
                "status": "success",
                "tables": results,
                "url": url,
                "message": f"Found {len(results)} tables with context information",
                "suggestion": "Use html_table_extractor with the most relevant index",
            }

        except Exception as e:
            return {
                "status": "error",
                "url": url,
                "message": f"Failed to retrieve table contexts: {str(e)}",
            }

    def _get_table_caption(self, table) -> str:
        """Extract table caption text if available"""
        caption = table.find("caption")
        return caption.get_text(strip=True) if caption else ""

    def _get_preceding_header(self, table) -> str:
        """Find the nearest preceding heading"""
        for tag in table.find_all_previous(["h1", "h2", "h3", "h4", "h5", "h6"]):
            return tag.get_text(strip=True)
        return ""

    def _get_surrounding_text(self, table, chars=150) -> str:
        """Get relevant text around the table"""
        prev_text = " ".join(
            t.strip()
            for t in table.find_all_previous(string=True, limit=3)
            if t.strip()
        )
        next_text = " ".join(
            t.strip() for t in table.find_all_next(string=True, limit=3) if t.strip()
        )
        return f"...{prev_text[-chars:]} [TABLE] {next_text[:chars]}..."


class HtmlTableExtractorTool(Tool):
    name = "html_table_extractor"
    description = """Extracts a specific HTML table as structured data.
    Use after webpage_tables_context_retriever to get the correct table index."""
    inputs = {
        "page_url": {
            "type": "string",
            "description": "The webpage URL containing the table",
        },
        "table_index": {
            "type": "integer",
            "description": "0-based index of the table to extract (from webpage_tables_context_retriever)",
        },
    }
    output_type = "object"

    def forward(self, page_url: str, table_index: int) -> Dict:
        """Extract a specific table by index"""
        try:
            # First verify the URL is accessible
            test_request = requests.head(page_url, timeout=5)
            test_request.raise_for_status()

            # Read all tables
            tables = pd.read_html(page_url)

            if not tables:
                return {
                    "status": "error",
                    "message": "No tables found at URL",
                    "url": page_url,
                }

            # Validate index
            if table_index < 0 or table_index >= len(tables):
                return {
                    "status": "error",
                    "message": f"Invalid table index {table_index}. Page has {len(tables)} tables.",
                    "url": page_url,
                    "available_indexes": list(range(len(tables))),
                }

            # Convert DataFrame to JSON-serializable format
            df = tables[table_index]
            return {
                "status": "success",
                "table_index": table_index,
                "table_data": df,
                "url": page_url,
            }

        except Exception as e:
            return {
                "status": "error",
                "message": f"Table extraction failed: {str(e)}",
                "url": page_url,
                "table_index": table_index,
            }
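The two table tools are designed to be chained: webpage_tables_context_retriever first, to pick a table index from captions, headers, and surrounding text, then html_table_extractor with that index. A minimal sketch of the flow outside the agent, calling forward() directly (the URL is an arbitrary example):

# Sketch: two-step table extraction, bypassing the CodeAgent.
from tools import HtmlTableExtractorTool, WebpageTablesContextRetrieverTool

url = "https://en.wikipedia.org/wiki/Python_(programming_language)"  # example URL

context_tool = WebpageTablesContextRetrieverTool()
extractor_tool = HtmlTableExtractorTool()

# 1. List every table with enough context to choose an index.
ctx = context_tool.forward(url)
for t in ctx.get("tables", []):
    print(t["index"], t["caption"] or t["preceding_header"])

# 2. Extract the chosen table (index 0 here) as a pandas DataFrame.
result = extractor_tool.forward(url, table_index=0)
if result["status"] == "success":
    print(result["table_data"].head())
else:
    print(result["message"])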