super_agent

Sleeping

App Files Files Community

lezaf commited on Jun 15

Commit

448903c

1 Parent(s): d3b88d9

Add agent implementation

Browse files

Files changed (7) hide show

.gitignore +5 -1
agent.py +187 -0
app.py +94 -14
requirements.txt +0 -0
subset_task_ids.txt +11 -0
system_prompt.txt +56 -5
tools.py +267 -0

.gitignore CHANGED Viewed

	@@ -1 +1,5 @@
1	- venv/

+.venv/
+.env
+# Python cache files
+__pycache__/
+.dist/

agent.py ADDED Viewed

	@@ -0,0 +1,187 @@

+from io import BytesIO
+import os
+import getpass
+import requests
+from dotenv import load_dotenv
+from langgraph.graph import StateGraph, MessagesState, START
+from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
+from langchain_openai import ChatOpenAI
+from langchain_core.messages import HumanMessage, SystemMessage
+from langgraph.prebuilt import ToolNode, tools_condition
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langfuse.langchain import CallbackHandler
+from tools import *
+# Load variables from a local .env file. override=True lets values from .env
+# replace variables that are already set in the process environment.
+load_dotenv(override=True)
+# Provider passed to build_agent() when this module is executed as a script.
+PROVIDER="google"
+# Langfuse callback handler; handed to agent.invoke() via config["callbacks"]
+# in the script section at the bottom of this module.
+langfuse_handler = CallbackHandler()
+# Tools bound to the LLM and executed by the graph's ToolNode (see build_agent).
+# Commented-out entries are tools defined in tools.py that are currently disabled.
+tools = [
+    # add_numbers,
+    add_numbers_in_list,
+    web_search,
+    # wikipedia_search,
+    arxiv_search,
+    check_commutativity,
+    extract_sales_data_from_excel,
+    extract_transcript_from_youtube
+]
+# --------------- Define the agent structure ---------------- #
+def build_agent(provider: str = "hf"):
+    """Build and compile the tool-calling agent graph.
+    The graph has two nodes: "assistant" (the chat model with the module-level
+    `tools` bound to it) and "tools" (a ToolNode running those tools). Execution
+    starts at "assistant"; `tools_condition` decides whether to route to "tools",
+    and "tools" always hands control back to "assistant".
+    Args:
+        provider (str): Which chat model backend to use: "hf", "google" or "openai".
+    Returns:
+        The compiled LangGraph graph.
+    Raises:
+        ValueError: If `provider` is not one of the supported values.
+    """
+    print(f"Building agent with provider: {provider}")
+    # Pick the chat model for the requested backend.
+    if provider == "openai":
+        chat_model = ChatOpenAI(
+            model="gpt-3.5-turbo",
+            temperature=0,
+            api_key=os.getenv("OPENAI_API_KEY"),
+            max_tokens=512
+        )
+    elif provider == "google":
+        chat_model = ChatGoogleGenerativeAI(
+            model="gemini-2.0-flash",
+            max_tokens=512,
+            max_retries=2,
+        )
+    elif provider == "hf":
+        endpoint = HuggingFaceEndpoint(
+            repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
+            task="text-generation",
+            temperature=0.0,
+            provider="hf-inference"
+        )
+        chat_model = ChatHuggingFace(llm=endpoint)
+    else:
+        raise ValueError(f"Unsupported provider: {provider}")
+    tool_aware_model = chat_model.bind_tools(tools)
+    # The system prompt is read once at build time and prepended on every model call.
+    with open("system_prompt.txt", "r", encoding="utf-8") as prompt_file:
+        system_message = SystemMessage(content=prompt_file.read())
+    def assistant(state: MessagesState):
+        """Call the tool-aware model on the system prompt plus the conversation so far."""
+        reply = tool_aware_model.invoke([system_message] + state["messages"])
+        return {"messages": [reply]}
+    # --------------- Build the state graph ---------------- #
+    workflow = StateGraph(MessagesState)
+    workflow.add_node("assistant", assistant)
+    workflow.add_node("tools", ToolNode(tools=tools))
+    workflow.add_conditional_edges("assistant", tools_condition)
+    workflow.add_edge("tools", "assistant")
+    workflow.add_edge(START, "assistant")
+    return workflow.compile()
+if __name__ == "__main__":
+    # Manual smoke test: build the agent, fetch the question list from the scoring
+    # API, and run the agent on the single question selected by `task_id` below.
+    print("\n" + "-"*30 + " Agent Starting " + "-"*30)
+    agent = build_agent(provider=PROVIDER)  # Set PROVIDER to "hf" for HuggingFace
+    print("Agent built successfully.")
+    print("-"*70)
+    DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+    questions_url = f"{DEFAULT_API_URL}/questions"
+    files_url = f"{DEFAULT_API_URL}/files/" # Needs task_id
+    # 1. Fetch questions
+    print(f"Fetching questions from: {questions_url}")
+    try:
+        response = requests.get(questions_url, timeout=15)
+        response.raise_for_status()
+        questions_data = response.json()
+    except (requests.RequestException, ValueError) as e:
+        # Nothing can be run without the question list. Stop here instead of
+        # continuing into a NameError on the undefined `questions_data`.
+        raise SystemExit(f"An unexpected error occurred fetching questions: {e}")
+    if not questions_data:
+        raise SystemExit("Fetched questions list is empty.")
+    print(f"Fetched {len(questions_data)} questions.")
+    # 2. Select a specific question by task_id
+    task_id = "cca530fc-4052-43b2-b130-b30968d8aa44" # Chess image
+    # task_id = "6f37996b-2ac7-44b0-8e68-6d28256631b4" # Commutativity check
+    # task_id = "2d83110e-a098-4ebb-9987-066c06fa42d0"  # Reverse text example
+    # task_id = "f918266a-b3e0-4914-865d-4faa564f1aef"  # Code example
+    # task_id = "7bd855d8-463d-4ed5-93ca-5fe35145f733" # Excel file (passed)
+    # task_id = "cabe07ed-9eca-40ea-8ead-410ef5e83f91" # Louvrier
+    # task_id = "305ac316-eef6-4446-960a-92d80d542f82" # Poland film (FAIL)
+    # task_id = "3f57289b-8c60-48be-bd80-01f8099ca449" # at bats (PASS)
+    # task_id = "bda648d7-d618-4883-88f4-3466eabd860e"  # Vietnamese (FAIL)
+    # task_id = "cf106601-ab4f-4af9-b045-5295fe67b37d" # Olympics
+    # task_id = "a0c07678-e491-4bbc-8f0b-07405144218f"
+    # task_id = "3cef3a44-215e-4aed-8e3b-b1e3f08063b7" # grocery list
+    # task_id = "8e867cd7-cff9-4e6c-867a-ff5ddc2550be" # Sosa albums
+    # task_id = "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8" # Dinosaur
+    # task_id = "840bfca7-4f7b-481a-8794-c560c340185d" # Carolyn Collins Petersen (FAIL)
+    # task_id = "5a0c1adf-205e-4841-a666-7c3ef95def9d" # Malko competition (PASS)
+    q_data = next((item for item in questions_data if item["task_id"] == task_id), None)
+    if q_data is None:
+        # Guard against a stale or mistyped id; indexing None would raise TypeError.
+        raise SystemExit(f"Task id {task_id} not found in the fetched questions.")
+    # 3. Build the multimodal message: the question text plus any attached file
+    content = [
+        {"type": "text", "text": q_data["question"]}
+    ]
+    file_name = q_data.get("file_name") or ""
+    if file_name:
+        file_url = f"{files_url}{task_id}"
+        if file_name.endswith((".png", ".jpg", ".jpeg")):
+            content.append({"type": "image_url", "image_url": {"url": file_url}})
+        elif file_name.endswith(".py"):
+            # For code files, send the file's text inline. Best effort: on a
+            # download failure the question is still asked without the code.
+            try:
+                response = requests.get(file_url, timeout=15)
+                response.raise_for_status()
+                content.append({"type": "text", "text": response.text})
+            except requests.RequestException as e:
+                print(f"Error fetching code file: {e}")
+        elif file_name.endswith((".xlsx", ".xls")):
+            # Excel files are passed by URL so the model can hand it to a tool.
+            content.append({"type": "text", "text": "Excel file url: " + file_url})
+    human_msg = HumanMessage(content=content)
+    human_msg.pretty_print()
+    # 4. Run the agent and print the full message trace
+    try:
+        result = agent.invoke(
+            {"messages": [human_msg]},
+            config={"callbacks": [langfuse_handler]}
+        )
+        for message in result["messages"]:
+            message.pretty_print()
+    except Exception as e:
+        print(f"Error: {e}")

app.py CHANGED Viewed

@@ -1,27 +1,99 @@
 import os
 import gradio as gr
 import requests
 import inspect
 import pandas as pd
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # --- Basic Agent Definition ---
-# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
-class BasicAgent:
     def __init__(self):
-        print("BasicAgent initialized.")
-    def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-        fixed_answer = "This is a default answer."
-        print(f"Agent returning fixed answer: {fixed_answer}")
-        return fixed_answer
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
-    Fetches all questions, runs the BasicAgent on them, submits all answers,
     and displays the results.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
@@ -34,13 +106,9 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         print("User not logged in.")
         return "Please Login to Hugging Face with the button.", None
-    api_url = DEFAULT_API_URL
-    questions_url = f"{api_url}/questions"
-    submit_url = f"{api_url}/submit"
     # 1. Instantiate Agent ( modify this part to create your agent)
     try:
-        agent = BasicAgent()
     except Exception as e:
         print(f"Error instantiating agent: {e}")
         return f"Error initializing agent: {e}", None
@@ -79,10 +147,22 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
              print(f"Error running agent on task {task_id}: {e}")
              results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})

+"""
+NOTE:
+    - The agent only runs on a subset of tasks defined in `subset_task_ids.txt` to avoid unnecessary token usage
+      for questions that the agent cannot handle right now.
+    - There is a 30 sec delay after each question is answered to avoid rate limiting issues.
+"""
 import os
 import gradio as gr
 import requests
 import inspect
 import pandas as pd
+import time
+from agent import build_agent
+from langchain_core.messages import HumanMessage
+from langfuse.langchain import CallbackHandler
+langfuse_handler = CallbackHandler()
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+questions_url = f"{DEFAULT_API_URL}/questions"
+submit_url = f"{DEFAULT_API_URL}/submit"
+files_url = f"{DEFAULT_API_URL}/files/" # Needs task_id
 # --- Basic Agent Definition ---
+class SuperAgent:
+    """Callable wrapper that runs the compiled agent graph on one question item."""
     def __init__(self):
+        print("SuperAgent initialized.")
+        self.agent = build_agent(provider="google")  # Change to "hf" for HuggingFace
+    def __call__(self, data: dict) -> str:
+        """
+        Answer a single question item.
+        Args:
+            data (dict): The question item.
+                Schema: {
+                    task_id: str,
+                    question: str,
+                    file_name: str,  # optional; empty when there is no attachment
+                }
+        Returns:
+            str: The content of the last message produced by the agent.
+        Raises:
+            ValueError: If 'question' or 'task_id' is missing or empty.
+            Exception: Any error raised while invoking the agent is logged and
+                re-raised so the caller can record it.
+        """
+        # Quick validation of input data (TODO: Use pydantic for schema).
+        # file_name is deliberately NOT required: questions without an attachment
+        # carry an empty file_name and must still be answered.
+        if not data.get("question") or not data.get("task_id"):
+            raise ValueError("Input data must contain 'question' and 'task_id'.")
+        task_id, question = data["task_id"], data["question"]
+        file_name = data.get("file_name") or ""
         print(f"Agent received question (first 50 chars): {question[:50]}...")
+        # Build HumanMessage: the question text plus any attached file
+        content = [
+            {"type": "text", "text": question}
+        ]
+        if file_name:
+            file_url = f"{files_url}{task_id}"
+            if file_name.endswith((".png", ".jpg", ".jpeg")):
+                content.append({"type": "image_url", "image_url": {"url": file_url}})
+            elif file_name.endswith(".py"):
+                # For code files, send the file's text inline. Best effort: on a
+                # download failure the question is still asked without the code.
+                try:
+                    response = requests.get(file_url, timeout=15)
+                    response.raise_for_status()
+                    content.append({"type": "text", "text": response.text})
+                except requests.RequestException as e:
+                    print(f"Error fetching code file: {e}")
+            elif file_name.endswith((".xlsx", ".xls")):
+                # Excel files are passed by URL so the model can hand it to a tool.
+                content.append({"type": "text", "text": "Excel file url: " + file_url})
+        human_msg = HumanMessage(content=content)
+        try:
+            answer = self.agent.invoke(
+                {"messages": [human_msg]},
+                config={"callbacks": [langfuse_handler]}
+            )
+        except Exception as e:
+            # Re-raise after logging: swallowing the error here would leave
+            # `answer` unbound and hide the real failure behind UnboundLocalError.
+            print(f"Error: {e}")
+            raise
+        return answer["messages"][-1].content
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
+    Fetches all questions, runs the SuperAgent on them, submits all answers,
     and displays the results.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
         print("User not logged in.")
         return "Please Login to Hugging Face with the button.", None
     # 1. Instantiate Agent ( modify this part to create your agent)
     try:
+        agent = SuperAgent()
     except Exception as e:
         print(f"Error instantiating agent: {e}")
         return f"Error initializing agent: {e}", None
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
+        # Only run on subset of tasks that is capable of being run so that
+        # token usage is not wasted on tasks that the agent cannot handle.
+        with open("subset_task_ids.txt", "r") as f:
+            subset_task_ids = [line.strip() for line in f if line.strip()]
+        if task_id not in subset_task_ids:
+            continue
         try:
+            submitted_answer = agent(item)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
+            time.sleep(30) # Sleep to avoid rate limiting issues
         except Exception as e:
              print(f"Error running agent on task {task_id}: {e}")
              results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ

subset_task_ids.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+8e867cd7-cff9-4e6c-867a-ff5ddc2550be
+2d83110e-a098-4ebb-9987-066c06fa42d0
+cca530fc-4052-43b2-b130-b30968d8aa44
+4fc2f1ae-8625-45b5-ab34-ad4433bc21f8
+6f37996b-2ac7-44b0-8e68-6d28256631b4
+9d191bce-651d-4746-be2d-7ef8ecadb9c2
+cabe07ed-9eca-40ea-8ead-410ef5e83f91
+f918266a-b3e0-4914-865d-4faa564f1aef
+3f57289b-8c60-48be-bd80-01f8099ca449
+7bd855d8-463d-4ed5-93ca-5fe35145f733
+5a0c1adf-205e-4841-a666-7c3ef95def9d

system_prompt.txt CHANGED Viewed

@@ -1,8 +1,59 @@
-You are a general AI assistant.
-I will ask you a question.
-Report your thoughts, and finish your answer with the following template: [YOUR_FINAL_ANSWER].
 For YOUR_FINAL_ANSWER follow strictly the instructions below:
   * YOUR_FINAL_ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
-  * If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
   * If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
-  * If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.

+You are a general AI assistant. I will ask you a question and I want an answer in the following template: YOUR_FINAL_ANSWER.
 For YOUR_FINAL_ANSWER follow strictly the instructions below:
   * YOUR_FINAL_ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
+  * If you are asked for a number, don't use comma to write your number neither use units such as: [$, meters (m), centimeters (cm), oz] or any other unit of measurement
+    or percent sign unless specified otherwise.
   * If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
+  * If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
+You are provided with tools that you can use to answer questions accurately. If you cannot answer the question directly, examine the list of available tools and
+choose the suitable tool for your case. You may need to use more than one tool to reach an answer.
+Below are some Question/Answer examples. "Q" is what you get from user, "[P]" is the internal planning and processing you make and "A" is the output to the user.
+Do not restate or explain the answer. Do not prefix the answer with "A:", "Answer:", or any other text. Only output the final value requested.
+Example 1:
+Q: What is the height of statue of liberty?
+[P]: I should use web_search tool.
+[P]: web_search("height of statue of liberty")
+[P]: The result of web_search is "The height of the statue of liberty is 93 m"
+A: 93
+Example 2:
+Q: What is the circumference of earth in miles?
+[P]: I should use web_search tool.
+[P]: web_search("circumference of earth in miles")
+[P]: The result of web_search is "The circumference of earth is 24,901 miles"
+A: 24901
+Example 3:
+Q: What is the capital of France?
+[P]: This is a factual question I know.
+A: Paris
+Example 4:
+Q: What is the total cost with two decimal places of the items in the table, excluding drinks?
+Table:
+| Burgers | Salads | Soda | Ice Cream |
+| 10.0    | 5.0    | 3.0  | 4.0       |
+[P]: Soda is a drink. The rest are food.
+[P]: I should use add_numbers_in_list([10.0, 5.0, 4.0])
+[P]: The result is 19.0
+A: 19.00
+Example 5:
+Q: What was the name of the director that won the Oscar in 2009?
+A: Boyle
+IMPORTANT: Never report to the user the strategy you followed to reach the answer. Always report the final answer as a string, number, or whatever is asked in the question.
+If the question involves summing or totaling numeric values from a list or data source, always use the add_numbers_in_list tool.
+Do not attempt to manually perform or display the addition; instead, pass the numeric list to the tool and use its output directly as the final answer.
+Never display intermediate math like “X + Y + Z = …” unless specifically requested. Only show the final answer after using the tool.

tools.py ADDED Viewed

	@@ -0,0 +1,267 @@

+import pandas as pd
+import requests
+from io import BytesIO
+from io import StringIO
+from langchain_core.tools import tool
+from langchain_community.retrievers import WikipediaRetriever
+from langchain_community.document_loaders import ArxivLoader
+from langchain_community.retrievers import BM25Retriever
+from langchain_core.documents import Document
+from duckduckgo_search import DDGS
+from markitdown import MarkItDown
+# --------------- Math Tools ---------------- #
+@tool
+def add_numbers(a: int, b: int) -> int:
+    """Add two numbers.
+    Args:
+        a (int): The first number.
+        b (int): The second number.
+    """
+    # NOTE: the docstring above doubles as the tool description given to the
+    # LLM by @tool, so its wording is intentionally left untouched.
+    total = a + b
+    return total
+@tool
+def add_numbers_in_list(numbers: list[float]) -> float:
+    """Add all numbers in a list.
+    Always use this tool for summing numerical values, instead of doing math directly in the response.
+    Args:
+        numbers (list[float]): A list of numbers to add.
+    """
+    # Keep the built-in sum(): a hand-written accumulation loop is not guaranteed
+    # to produce bit-identical float results.
+    total = sum(numbers)
+    return total
+# @tool
+# def web_search(query: str) -> str:
+#     """Perform a web search using DuckDuckGo.
+#     Args:
+#         query (str): The search query.
+#     Returns:
+#         str: The search results.
+#     """
+#     search_tool = DuckDuckGoSearchRun()
+#     return search_tool.invoke(query)
+@tool
+def web_search(query: str) -> str:
+    """
+    Perform a web search using DuckDuckGo. Visit the top ranked page,
+    split the page text into chunks, rank the chunks against the query, and
+    return the content of the best matching chunk.
+    Args:
+        query (str): The search query.
+    Returns:
+        str: The source URL and title of the visited page followed by the text
+             of the best matching chunk, or a message describing why nothing
+             could be returned.
+    """
+    def _chunk_text(text, chunk_size_words=1000, overlap_words=100):
+        """
+        Split text into chunks of words with overlap.
+        Args:
+            text (str): The text to be chunked.
+            chunk_size_words (int): The number of words in each chunk.
+            overlap_words (int): The number of words shared by consecutive chunks.
+        Returns:
+            list: A list of text chunks.
+        """
+        # A non-positive stride would make range() raise (or never advance).
+        stride = max(1, chunk_size_words - overlap_words)
+        words = text.split()
+        return [" ".join(words[i:i + chunk_size_words]) for i in range(0, len(words), stride)]
+    # STEP 1: Find the most relevant webpage
+    try:
+        results = DDGS().text(query, max_results=1)
+    except Exception as e:
+        # Return the failure as text, like the other tools in this module do.
+        return f"Web search failed: {str(e)}"
+    if not results:
+        return "No relevant results found for the query."
+    top_rank_page = results[0]
+    source = top_rank_page['href']
+    title = top_rank_page.get('title', "")
+    # STEP 2: Extract the content of the webpage
+    try:
+        md_result = MarkItDown(enable_plugins=True).convert(source)
+    except Exception as e:
+        return f"Failed to load the top search result ({source}): {str(e)}"
+    # STEP 3: Apply chunking
+    chunks = _chunk_text(md_result.text_content or "")
+    if not chunks:
+        # An empty page gives BM25 nothing to index.
+        return f"The top search result ({source}) had no readable text content."
+    # STEP 4: Apply ranking in chunks
+    list_of_docs = [
+        Document(page_content=chunk, metadata={"source": source, "title": title})
+        for chunk in chunks
+    ]
+    retriever = BM25Retriever.from_documents(list_of_docs)
+    matched = retriever.invoke(query)
+    # Fall back to the first chunk if the retriever returns nothing.
+    best_chunk = matched[0].page_content if matched else chunks[0]
+    # Return plain text (as the signature promises) rather than a Document object.
+    return f"Source: {source}\nTitle: {title}\n\n{best_chunk}"
+# TODO:
+# Maybe don't return the summary, but the full document?
+@tool
+def wikipedia_search(query: str) -> str:
+    """
+    Search Wikipedia for a given query and return a summary of the top result.
+    Args:
+        query (str): The search term.
+    Returns:
+        str: A summary of the most relevant Wikipedia entry.
+    """
+    wikipedia_retriever = WikipediaRetriever(load_max_docs=1)
+    # Use invoke(), matching web_search in this module; get_relevant_documents()
+    # is the deprecated spelling of the same call.
+    documents = wikipedia_retriever.invoke(query)
+    if not documents:
+        return "No relevant Wikipedia articles found."
+    # Wrap each result in a well-formed <Document ...>...</Document> element.
+    # Fall back to the page text if a result carries no "summary" metadata.
+    formatted_search_docs = "\n\n---\n\n".join(
+        [
+            f'<Document source="{doc.metadata.get("source", "")}" title="{doc.metadata.get("title", "")}">\n'
+            f'{doc.metadata.get("summary", doc.page_content)}\n</Document>'
+            for doc in documents
+        ])
+    return formatted_search_docs
+@tool
+def arxiv_search(query: str) -> str:
+    """
+    Search Arxiv for academic papers based on a query and return summaries of top results.
+    Args:
+        query (str): The search query for Arxiv.
+    Returns:
+        str: Summary of the top few relevant papers from Arxiv.
+    """
+    try:
+        # load_all_available_meta=True is needed for "entry_id" (the paper URL)
+        # to be present in the metadata; without it the URL is always missing.
+        loader = ArxivLoader(query=query, load_max_docs=2, load_all_available_meta=True)
+        documents = loader.load()
+        if not documents:
+            return "No relevant papers found on Arxiv."
+        # Format and return top paper summaries
+        results = []
+        for doc in documents:
+            title = doc.metadata.get("Title", "No Title")
+            published = doc.metadata.get("Published", "Unknown date")
+            url = doc.metadata.get("entry_id", "No URL")
+            # Prefer the paper's abstract; the first 500 characters of the full
+            # text (the previous behaviour) are kept only as a fallback.
+            summary = doc.metadata.get("Summary") or doc.page_content[:500]
+            results.append(f"Title: {title}\nPublished: {published}\nURL: {url}\nSummary: {summary}\n")
+        return "\n---\n".join(results)
+    except Exception as e:
+        return f"An error occurred while searching Arxiv: {str(e)}"
+@tool
+def check_commutativity(table_str: str) -> str:
+    """
+    Given a binary operation table (in markdown format), returns the subset of elements
+    involved in counter-examples to commutativity, sorted alphabetically.
+    Args:
+        table_str (str): Markdown table defining the operation * on a finite set.
+    Returns:
+        str: Comma-separated list of elements in the counter-example set, alphabetically sorted.
+    """
+    import re
+    # Parse the table by hand so that padded cells ("| a | b |") and the markdown
+    # delimiter row are handled; with pd.read_csv the padding stayed in the row
+    # labels and the lookups below failed with KeyError.
+    rows = []
+    for line in table_str.strip().splitlines():
+        text = line.strip()
+        # Remove only the single outer pipes so an empty corner cell is kept.
+        if text.startswith("|"):
+            text = text[1:]
+        if text.endswith("|"):
+            text = text[:-1]
+        cells = [cell.strip() for cell in text.split("|")]
+        if any(cells):
+            rows.append(cells)
+    # Drop the delimiter row (e.g. |---|:--:|) that follows a markdown header.
+    if len(rows) > 1 and all(re.fullmatch(r":?-+:?", cell) for cell in rows[1]):
+        del rows[1]
+    if len(rows) < 2:
+        return "Could not parse the operation table: expected a header row and at least one data row."
+    # Header: corner cell followed by the column elements. Each data row: the
+    # row element followed by the results of (row element * column element).
+    elements = rows[0][1:]
+    table = {row[0]: dict(zip(elements, row[1:])) for row in rows[1:]}
+    # Check commutativity: a*b == b*a. Each unordered pair is checked once.
+    counterexample_elements = set()
+    try:
+        for i, x in enumerate(elements):
+            for y in elements[i + 1:]:
+                if table[x][y] != table[y][x]:
+                    counterexample_elements.update((x, y))
+    except KeyError as e:
+        # Report malformed tables as text, like the other tools in this module.
+        return f"Could not parse the operation table: missing entry for element {e}."
+    return ", ".join(sorted(counterexample_elements))
+@tool
+def extract_sales_data_from_excel(url: str) -> str:
+    """
+    Downloads and extracts sales data from an Excel file at the given URL.
+    Returns the contents of the first sheet as a plain-text table.
+    Args:
+        url (str): The URL of the Excel file.
+    Returns:
+        str: The sheet rendered as text, or an error message if processing fails.
+    """
+    try:
+        # Same timeout as the other HTTP calls in this project, so a stalled
+        # download cannot hang the agent indefinitely.
+        response = requests.get(url, timeout=15)
+        response.raise_for_status()
+        excel_file = BytesIO(response.content)
+        df = pd.read_excel(excel_file)
+        # Optional: Remove unnamed columns often created by Excel. Labels are
+        # cast to str first because the .str accessor rejects non-string labels.
+        df = df.loc[:, ~df.columns.astype(str).str.startswith("Unnamed")]
+        # Convert all numeric columns to float
+        for col in df.select_dtypes(include=["number"]).columns:
+            df[col] = df[col].astype(float)
+        return df.to_string(index=False)
+    except Exception as e:
+        return f"Failed to process Excel file from URL: {str(e)}"
+@tool
+def extract_transcript_from_youtube(url: str) -> str:
+    """
+    Extracts the transcript from a YouTube video given its URL.
+    Args:
+        url (str): The YouTube video URL.
+    Returns:
+        str: The transcript of the video, or an error message if extraction fails.
+    """
+    # Heading that precedes the transcript in the converted text.
+    marker = "### Transcript"
+    converter = MarkItDown(enable_plugins=True)
+    try:
+        converted = converter.convert(url)
+    except Exception as e:
+        return f"Failed to extract transcript from YouTube video: {str(e)}"
+    full_text = converted.text_content
+    segments = full_text.split(marker)
+    if len(segments) >= 2:
+        # Keep the heading and the section that directly follows it.
+        return (marker + "\n" + segments[1]).strip()
+    # No transcript heading found: return the converted text unchanged.
+    return full_text
+# @tool
+# def extract_transcript_from_audio(url: str) -> str:
+#     """
+#     Extracts the transcript from an audio file given its URL.
+#     Supported formats: mp3, wav.
+#     Args:
+#         url (str): The URL of the audio file.
+#     Returns:
+#         str: The transcript of the audio file, or an error message if extraction fails.
+#     """
+#     md = MarkItDown(enable_plugins=True)
+#     try:
+#         result = md.convert(url)
+#     except Exception as e:
+#         return f"Failed to extract transcript from audio: {str(e)}"
+#     return result.text_content