super_agent

Sleeping

File size: 8,460 Bytes

448903c

import math
from io import BytesIO
from io import StringIO

import pandas as pd
import requests
from duckduckgo_search import DDGS
from langchain_community.document_loaders import ArxivLoader
from langchain_community.retrievers import BM25Retriever
from langchain_community.retrievers import WikipediaRetriever
from langchain_core.documents import Document
from langchain_core.tools import tool
from markitdown import MarkItDown

# --------------- Math Tools ---------------- #
@tool
def add_numbers(a: int, b: int) -> int:
    """Return the sum of two integers.

    Args:
        a (int): The left-hand operand.
        b (int): The right-hand operand.

    Returns:
        int: The value of a plus b.
    """
    total = a + b
    return total

@tool
def add_numbers_in_list(numbers: list[float]) -> float:
    """Add all numbers in a list.
    Always use this tool for summing numerical values, instead of doing math directly in the response.

    Args:
        numbers (list[float]): A list of numbers to add.

    Returns:
        float: The sum of the values; 0.0 when the list is empty.
    """
    # math.fsum keeps exact partial sums, so rounding error does not build up
    # across many terms (e.g. ten 0.1 values add up to exactly 1.0), and it
    # always yields a float, matching the declared return type even for an
    # empty list or a list of ints.
    return math.fsum(numbers)

# @tool
# def web_search(query: str) -> str:
#     """Perform a web search using DuckDuckGo.
    
#     Args:
#         query (str): The search query.
        
#     Returns:
#         str: The search results.
#     """
#     search_tool = DuckDuckGoSearchRun()
#     return search_tool.invoke(query)

@tool
def web_search(query: str) -> str:
    """
    Perform a web search using DuckDuckGo. Visit the top ranked page,
    split its text into overlapping word chunks, rank the chunks against the
    query with BM25 and return the best-matching chunk.

    Args:
        query (str): The search query.
    Returns:
        Document: The top-ranked chunk as a langchain_core.documents.Document
                  with fields 'page_content' (the chunk text) and 'metadata'
                  (the page 'source' URL and 'title'). A plain explanatory
                  string is returned instead when the search finds nothing,
                  the search or page download fails, or the page has no text.
    """
    def _chunk_text(text, chunk_size_words=1000, overlap_words=100):
        """
        Split text into word-based chunks of the given size with overlap.
        Args:
            text (str): The text to be chunked.
            chunk_size_words (int): Maximum number of words per chunk.
            overlap_words (int): Number of words shared by consecutive chunks;
                                 must be smaller than chunk_size_words.
        Returns:
            list: A list of text chunks (empty when the text has no words).
        """
        if overlap_words >= chunk_size_words:
            raise ValueError("overlap_words must be smaller than chunk_size_words")

        words = text.split()
        step = chunk_size_words - overlap_words
        chunks = []
        for start in range(0, len(words), step):
            chunks.append(" ".join(words[start:start + chunk_size_words]))
            # Once a window reaches the last word, any further window would
            # only repeat words already contained in this chunk.
            if start + chunk_size_words >= len(words):
                break
        return chunks

    # STEP 1: Find the most relevant webpage
    try:
        results = DDGS().text(query, max_results=1)
    except Exception as e:
        return f"An error occurred while searching the web: {str(e)}"
    if not results:
        return "No relevant results found for the query."
    top_rank_page = results[0]
    page_url = top_rank_page['href']

    # STEP 2: Extract the content of the webpage
    try:
        md_result = MarkItDown(enable_plugins=True).convert(page_url)
    except Exception as e:
        return f"Failed to extract content from {page_url}: {str(e)}"
    page_content = md_result.text_content or ""

    # STEP 3: Apply chunking
    chunks = _chunk_text(page_content)
    if not chunks:
        # BM25 cannot be built over an empty corpus, so stop here.
        return f"No readable content found at {page_url}."

    # STEP 4: Apply ranking in chunks
    page_metadata = {"source": page_url, "title": top_rank_page['title']}
    list_of_docs = [
        Document(page_content=chunk, metadata=dict(page_metadata))
        for chunk in chunks
    ]

    retriever = BM25Retriever.from_documents(list_of_docs)
    matched = retriever.invoke(query)

    # NOTE(review): the signature is annotated ``-> str`` but the success path
    # returns a Document; the return value is kept as-is so callers that read
    # ``page_content``/``metadata`` keep working.
    return matched[0] if matched else list_of_docs[0]

# TODO:
# Maybe don't return the summary, but the full document?
@tool
def wikipedia_search(query: str) -> str:
    """
    Search Wikipedia for a given query and return a summary of the top result.

    Args:
        query (str): The search term.

    Returns:
        str: Each retrieved article rendered as a
             <Document source="..." title="...">summary</Document> block
             (blocks separated by '---'), or an explanatory message when
             nothing is found or the lookup fails.
    """
    try:
        wikipedia_retriever = WikipediaRetriever(load_max_docs=1)
        # invoke() is the supported retriever entry point;
        # get_relevant_documents() is deprecated in LangChain.
        documents = wikipedia_retriever.invoke(query)
    except Exception as e:
        return f"An error occurred while searching Wikipedia: {str(e)}"

    if not documents:
        return "No relevant Wikipedia articles found."

    formatted_docs = []
    for doc in documents:
        source = doc.metadata.get("source", "")
        title = doc.metadata.get("title", "")
        # Fall back to the page text if the retriever supplied no summary.
        summary = doc.metadata.get("summary", doc.page_content)
        # The opening tag must not be self-closing, since a matching
        # </Document> follows the summary.
        formatted_docs.append(
            f'<Document source="{source}" title="{title}">\n{summary}\n</Document>'
        )

    return "\n\n---\n\n".join(formatted_docs)

@tool
def arxiv_search(query: str) -> str:
    """
    Search Arxiv for academic papers based on a query and return summaries of top results.

    Args:
        query (str): The search query for Arxiv.

    Returns:
        str: For each of up to two papers, its title, publication date, URL and
             the first 500 characters of the paper text, separated by '---';
             or an explanatory message when nothing is found or the search fails.
    """
    max_excerpt_chars = 500  # keep each paper's excerpt short

    try:
        # 'entry_id' (the paper URL) is only added to the metadata when all
        # available metadata is requested; without it the URL is always missing.
        loader = ArxivLoader(
            query=query,
            load_max_docs=2,
            load_all_available_meta=True,
        )
        documents = loader.load()

        if not documents:
            return "No relevant papers found on Arxiv."

        # Format and return top paper summaries
        results = []
        for doc in documents:
            meta = doc.metadata
            title = meta.get("Title", "No Title")
            published = meta.get("Published", "Unknown date")
            url = meta.get("entry_id", "No URL")
            summary = doc.page_content[:max_excerpt_chars]

            results.append(
                f"Title: {title}\nPublished: {published}\nURL: {url}\nSummary: {summary}\n"
            )

        return "\n---\n".join(results)

    except Exception as e:
        return f"An error occurred while searching Arxiv: {str(e)}"
    
@tool
def check_commutativity(table_str: str) -> str:
    """
    Given a binary operation table (in markdown format), returns the subset of elements
    involved in counter-examples to commutativity, sorted alphabetically.

    The first non-empty line is the header: its first cell is the corner label
    (e.g. "*") and the remaining cells name the elements. Each later row starts
    with the left operand followed by one result per header element. Cells may
    be padded with spaces. Rows whose label is not a header element (such as
    the markdown delimiter row "|---|---|") are ignored.

    Args:
        table_str (str): Markdown table defining the operation * on a finite set.

    Returns:
        str: Comma-separated list of elements in the counter-example set, alphabetically sorted.
             An empty string when the operation is commutative.

    Raises:
        ValueError: If the table is empty, an element row has the wrong number
            of cells, or an element has no row.
    """
    def _cells(line):
        # Remove only the single outer pipe on each side (so an empty corner
        # cell survives), then trim the padding around every cell.
        inner = line.strip().removeprefix("|").removesuffix("|")
        return [cell.strip() for cell in inner.split("|")]

    rows = [_cells(line) for line in table_str.splitlines() if line.strip()]
    if not rows:
        raise ValueError("The operation table is empty.")

    elements = rows[0][1:]
    known_elements = set(elements)

    # table[x][y] holds the result of x * y, always as a string so that
    # numeric-looking labels and values compare consistently.
    table = {}
    for label, *results in rows[1:]:
        if label not in known_elements:
            continue
        if len(results) != len(elements):
            raise ValueError(
                f"Row '{label}' has {len(results)} cells, expected {len(elements)}."
            )
        table[label] = dict(zip(elements, results))

    missing = [element for element in elements if element not in table]
    if missing:
        raise ValueError(f"No row found for element(s): {', '.join(missing)}")

    # Check commutativity: a*b == b*a. The relation is symmetric, so each
    # unordered pair only needs to be examined once.
    counterexample_elements = set()
    for position, x in enumerate(elements):
        for y in elements[position + 1:]:
            if table[x][y] != table[y][x]:
                counterexample_elements.update((x, y))

    return ", ".join(sorted(counterexample_elements))

@tool
def extract_sales_data_from_excel(url: str) -> str:
    """
    Downloads and extracts sales data from an Excel file at the given URL.
    Returns the contents of the first sheet as a plain-text table.

    Columns without a header (named "Unnamed..." by pandas) are dropped and
    numeric columns are converted to float before rendering.

    Args:
        url (str): The URL of the Excel file.
    Returns:
        str: The sheet rendered as whitespace-aligned text without the index
             column, or an error message if downloading or parsing fails.
    """
    try:
        # Bound the wait so an unresponsive server cannot hang the caller.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        excel_file = BytesIO(response.content)
        df = pd.read_excel(excel_file)

        # Optional: Remove unnamed columns often created by Excel.
        # Header labels may be non-strings (numbers, dates), so cast them
        # before matching on the name.
        is_unnamed = df.columns.astype(str).str.startswith("Unnamed")
        df = df.loc[:, ~is_unnamed]

        # Convert all numeric columns to float
        numeric_cols = df.select_dtypes(include=["number"]).columns
        df = df.astype(dict.fromkeys(numeric_cols, float))

        return df.to_string(index=False)

    except Exception as e:
        return f"Failed to process Excel file from URL: {str(e)}"

@tool
def extract_transcript_from_youtube(url: str) -> str:
    """
    Extracts the transcript from a YouTube video given its URL.

    The page is converted to text with MarkItDown and everything from the
    "### Transcript" heading onward is returned. If the converted text has no
    such heading, the whole converted text is returned unchanged.

    Args:
        url (str): The YouTube video URL.
    Returns:
        str: The transcript of the video, or an error message if extraction fails.
    """
    transcript_str = "### Transcript"
    md = MarkItDown(enable_plugins=True)

    try:
        result = md.convert(url)
    except Exception as e:
        return f"Failed to extract transcript from YouTube video: {str(e)}"

    full_text = result.text_content
    # partition() cuts at the first heading only, so a later occurrence of the
    # same text inside the transcript does not truncate it.
    _, heading, transcript_body = full_text.partition(transcript_str)
    if not heading:
        return full_text

    transcript = transcript_str + "\n" + transcript_body
    return transcript.strip()

# @tool
# def extract_transcript_from_audio(url: str) -> str:
#     """
#     Extracts the transcript from an audio file given its URL.
#     Supported formats: mp3, wav.

#     Args:
#         url (str): The URL of the audio file.
#     Returns:
#         str: The transcript of the audio file, or an error message if extraction fails.
#     """
#     md = MarkItDown(enable_plugins=True)

#     try:
#         result = md.convert(url)
#     except Exception as e:
#         return f"Failed to extract transcript from audio: {str(e)}"
    
#     return result.text_content