# super_agent/tools.py
import pandas as pd
import requests
from io import BytesIO
from io import StringIO
from langchain_core.tools import tool
from langchain_community.retrievers import WikipediaRetriever
from langchain_community.document_loaders import ArxivLoader
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
from duckduckgo_search import DDGS
from markitdown import MarkItDown
# --------------- Math Tools ---------------- #
@tool
def add_numbers(a: int, b: int) -> int:
"""Add two numbers.
Args:
a (int): The first number.
b (int): The second number.
"""
return a + b
@tool
def add_numbers_in_list(numbers: list[float]) -> float:
"""Add all numbers in a list.
Always use this tool for summing numerical values, instead of doing math directly in the response.
Args:
numbers (list[float]): A list of numbers to add.
"""
return sum(numbers)
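# A minimal sketch (not part of the agent graph) of how these @tool-wrapped
# helpers are called directly: tools created with @tool expose the Runnable
# interface, so .invoke() takes a dict keyed by the tool's argument names.
# Nothing here runs on import; call it manually when sanity-checking.
def _demo_math_tools():
    assert add_numbers.invoke({"a": 2, "b": 3}) == 5
    assert add_numbers_in_list.invoke({"numbers": [1.5, 2.5, 4.0]}) == 8.0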
# @tool
# def web_search(query: str) -> str:
# """Perform a web search using DuckDuckGo.
# Args:
# query (str): The search query.
# Returns:
# str: The search results.
# """
# search_tool = DuckDuckGoSearchRun()
# return search_tool.invoke(query)
@tool
def web_search(query: str) -> str:
"""
    Perform a web search using DuckDuckGo. Visit the top-ranked page,
    split its content into overlapping chunks, rank the chunks against
    the query with BM25, and return the best-matching chunk.
    Args:
        query (str): The search query.
    Returns:
        str: The content of the best-matching chunk from the top-ranked page,
        or an explanatory message if nothing relevant is found.
    """
    def _chunk_text(text, chunk_size_words=1000, overlap_words=100):
        """
        Split text into overlapping word-based chunks.
        Args:
            text (str): The text to be chunked.
            chunk_size_words (int): The number of words per chunk.
            overlap_words (int): The number of overlapping words between consecutive chunks.
        Returns:
            list: A list of text chunks.
        """
words = text.split()
chunks = []
for i in range(0, len(words), chunk_size_words - overlap_words):
chunk = " ".join(words[i:i + chunk_size_words])
chunks.append(chunk)
return chunks
# STEP 1: Find the most relevant webpage
results = DDGS().text(query, max_results=1)
top_rank_page = results[0] if results else None
if not top_rank_page:
return "No relevant results found for the query."
# STEP 2: Extract the content of the webpage
md = MarkItDown(enable_plugins=True)
md_result = md.convert(top_rank_page['href'])
page_content = md_result.text_content
# STEP 3: Apply chunking
chunks = _chunk_text(page_content)
# STEP 4: Apply ranking in chunks
list_of_docs = [
Document(page_content = chunk, metadata = {"source": top_rank_page['href'], "title": top_rank_page['title']})
for chunk in chunks
]
retriever = BM25Retriever.from_documents(list_of_docs)
    matched = retriever.invoke(query)
    if not matched:
        return "No relevant content found on the top-ranked page."
    # Return the best-matching chunk as plain text to match the declared return type
    return matched[0].page_content
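# A minimal, offline sketch of the chunk-and-rank step web_search performs:
# wrap some in-memory text chunks in Documents, build a BM25Retriever over
# them, and pull the chunk that best matches a query. The sample sentences
# and the query are made up for illustration; no network access is needed.
def _demo_chunk_ranking():
    sample_chunks = [
        "The Nile is a river flowing through northeastern Africa.",
        "BM25 is a bag-of-words ranking function used in information retrieval.",
        "Pandas is a Python library for tabular data analysis.",
    ]
    docs = [Document(page_content=c, metadata={"source": "demo"}) for c in sample_chunks]
    retriever = BM25Retriever.from_documents(docs)
    # Expected to surface the BM25 sentence for this query
    return retriever.invoke("BM25 ranking function")[0].page_content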
# TODO:
# Maybe don't return the summary, but the full document?
@tool
def wikipedia_search(query: str) -> str:
"""
Search Wikipedia for a given query and return a summary of the top result.
Args:
query (str): The search term.
Returns:
str: A summary of the most relevant Wikipedia entry.
"""
wikipedia_retriever = WikipediaRetriever(load_max_docs=1)
    documents = wikipedia_retriever.invoke(query)
if not documents:
return "No relevant Wikipedia articles found."
formatted_search_docs = "\n\n---\n\n".join(
[
            f'<Document source="{doc.metadata["source"]}" title="{doc.metadata.get("title", "")}">\n{doc.metadata["summary"]}\n</Document>'
for doc in documents
])
# Return the content of the top document
return formatted_search_docs
@tool
def arxiv_search(query: str) -> str:
"""
Search Arxiv for academic papers based on a query and return summaries of top results.
Args:
query (str): The search query for Arxiv.
Returns:
str: Summary of the top few relevant papers from Arxiv.
"""
try:
loader = ArxivLoader(query=query, load_max_docs=2)
documents = loader.load()
if not documents:
return "No relevant papers found on Arxiv."
# Format and return top paper summaries
results = []
for doc in documents:
title = doc.metadata.get("Title", "No Title")
published = doc.metadata.get("Published", "Unknown date")
url = doc.metadata.get("entry_id", "No URL")
summary = doc.page_content[:500] # limit summary length
results.append(f"Title: {title}\nPublished: {published}\nURL: {url}\nSummary: {summary}\n")
return "\n---\n".join(results)
except Exception as e:
return f"An error occurred while searching Arxiv: {str(e)}"
@tool
def check_commutativity(table_str: str) -> str:
"""
Given a binary operation table (in markdown format), returns the subset of elements
involved in counter-examples to commutativity, sorted alphabetically.
Args:
table_str (str): Markdown table defining the operation * on a finite set.
Returns:
str: Comma-separated list of elements in the counter-example set, alphabetically sorted.
"""
    # Read the table using pandas
    df = pd.read_csv(StringIO(table_str), sep="|", skipinitialspace=True, engine='python')
    # Drop empty columns due to leading/trailing pipes
    df = df.dropna(axis=1, how="all")
    df.columns = [c.strip() for c in df.columns]
    df = df.dropna(axis=0, how="all")
    # Strip surrounding whitespace from cells and drop the markdown separator
    # row (e.g. |---|---|...|), which read_csv otherwise keeps as a data row
    df = df.apply(lambda col: col.map(lambda v: v.strip() if isinstance(v, str) else v))
    df = df[~df[df.columns[0]].astype(str).str.match(r"^:?-{2,}:?$")]
    # Extract header and values
    elements = df.columns[1:]
    df.index = df[df.columns[0]]
    df = df.drop(df.columns[0], axis=1)
# Check commutativity: a*b == b*a
counterexample_elements = set()
for x in elements:
for y in elements:
if df.loc[x, y] != df.loc[y, x]:
counterexample_elements.add(x)
counterexample_elements.add(y)
return ", ".join(sorted(counterexample_elements))
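# A worked example for check_commutativity, with a made-up 3-element table
# (not taken from any assignment data): here a*b = b while b*a = c, so the
# counter-example set is {a, b} and the expected return value is "a, b".
def _demo_check_commutativity():
    example_table = (
        "|*|a|b|c|\n"
        "|---|---|---|---|\n"
        "|a|a|b|c|\n"
        "|b|c|c|a|\n"
        "|c|c|a|b|"
    )
    return check_commutativity.invoke({"table_str": example_table})  # -> "a, b"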
@tool
def extract_sales_data_from_excel(url: str) -> str:
"""
    Downloads and extracts sales data from an Excel file at the given URL.
    Args:
        url (str): URL of the Excel file to download.
    Returns:
        str: The contents of the first sheet rendered as a plain-text table.
"""
try:
        response = requests.get(url, timeout=30)
response.raise_for_status()
excel_file = BytesIO(response.content)
df = pd.read_excel(excel_file)
# Optional: Remove unnamed columns often created by Excel
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
# Convert all numeric columns to float
for col in df.select_dtypes(include=["number"]).columns:
df[col] = df[col].astype(float)
return df.to_string(index=False)
except Exception as e:
return f"Failed to process Excel file from URL: {str(e)}"
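# A minimal, offline sketch of the parsing done by extract_sales_data_from_excel:
# write a tiny made-up workbook to an in-memory buffer (assumes openpyxl, which
# read_excel already needs for .xlsx), then read it back the same way the tool
# does: read_excel, drop Unnamed columns, cast numeric columns to float.
# The column names and figures are invented for illustration only.
def _demo_excel_roundtrip():
    buffer = BytesIO()
    pd.DataFrame({"item": ["burger", "fries"], "sales": [120, 80]}).to_excel(buffer, index=False)
    buffer.seek(0)
    df = pd.read_excel(buffer)
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    for col in df.select_dtypes(include=["number"]).columns:
        df[col] = df[col].astype(float)
    return df.to_string(index=False)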
@tool
def extract_transcript_from_youtube(url: str) -> str:
"""
Extracts the transcript from a YouTube video given its URL.
Args:
url (str): The YouTube video URL.
Returns:
str: The transcript of the video, or an error message if extraction fails.
"""
transcript_str = "### Transcript"
md = MarkItDown(enable_plugins=True)
try:
result = md.convert(url)
except Exception as e:
return f"Failed to extract transcript from YouTube video: {str(e)}"
parts = result.text_content.split(transcript_str)
if len(parts) < 2:
return result.text_content
transcript = transcript_str + "\n" + parts[1]
return transcript.strip()
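# A small offline sketch of the split used above: MarkItDown's YouTube output
# is assumed to contain a "### Transcript" heading, and everything after that
# heading is treated as the transcript. The sample text is made up.
def _demo_transcript_split():
    sample = "# Some Video\n\n### Description\nA short clip.\n\n### Transcript\nhello world"
    parts = sample.split("### Transcript")
    return ("### Transcript\n" + parts[1]).strip() if len(parts) >= 2 else sample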
# @tool
# def extract_transcript_from_audio(url: str) -> str:
# """
# Extracts the transcript from an audio file given its URL.
# Supported formats: mp3, wav.
# Args:
# url (str): The URL of the audio file.
# Returns:
# str: The transcript of the audio file, or an error message if extraction fails.
# """
# md = MarkItDown(enable_plugins=True)
# try:
# result = md.convert(url)
# except Exception as e:
# return f"Failed to extract transcript from audio: {str(e)}"
# return result.text_content
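# Optional manual smoke test for the offline sketches above. The network-backed
# tools (web_search, wikipedia_search, arxiv_search and the URL-based
# extractors) are deliberately left out so this runs without connectivity.
if __name__ == "__main__":
    _demo_math_tools()
    print(_demo_chunk_ranking())
    print(_demo_check_commutativity())
    print(_demo_excel_roundtrip())
    print(_demo_transcript_split())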