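# Tool functions for a LangChain-based agent: math helpers, DuckDuckGo / Wikipedia /
# Arxiv search, and extraction utilities for Excel files and YouTube transcripts.
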
import pandas as pd
import requests
from io import BytesIO, StringIO

from langchain_core.tools import tool
from langchain_community.retrievers import WikipediaRetriever
from langchain_community.document_loaders import ArxivLoader
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document

from duckduckgo_search import DDGS
from markitdown import MarkItDown

# --------------- Math Tools ---------------- #

def add_numbers(a: int, b: int) -> int:
    """Add two numbers.
    Args:
        a (int): The first number.
        b (int): The second number.
    """
    return a + b


def add_numbers_in_list(numbers: list[float]) -> float:
    """Add all numbers in a list.
    Always use this tool for summing numerical values, instead of doing math directly in the response.
    Args:
        numbers (list[float]): A list of numbers to add.
    """
    return sum(numbers)
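
# Example usage: add_numbers(2, 3) -> 5; add_numbers_in_list([1.5, 2.5, 3.0]) -> 7.0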


# @tool
# def web_search(query: str) -> str:
#     """Perform a web search using DuckDuckGo.
#     Args:
#         query (str): The search query.
#     Returns:
#         str: The search results.
#     """
#     search_tool = DuckDuckGoSearchRun()
#     return search_tool.invoke(query)


def web_search(query: str) -> Document | str:
    """
    Perform a web search using DuckDuckGo. Visit the top-ranked page, split its
    content into chunks, rank the chunks against the query with BM25, and return
    the best-matching chunk.
    Args:
        query (str): The search query.
    Returns:
        Document: The best-matching chunk as a langchain_core.documents.Document,
        with 'page_content' holding the chunk text and 'metadata' holding the
        source URL and title, or a plain string message if nothing was found.
    """
    def _chunk_text(text, chunk_size_words=1000, overlap_words=100):
        """
        Split text into word-based chunks of a given size with overlap.
        Args:
            text (str): The text to be chunked.
            chunk_size_words (int): The number of words in each chunk.
            overlap_words (int): The number of words shared between consecutive chunks.
        Returns:
            list: A list of text chunks.
        """
        words = text.split()
        chunks = []
        for i in range(0, len(words), chunk_size_words - overlap_words):
            chunk = " ".join(words[i:i + chunk_size_words])
            chunks.append(chunk)
        return chunks
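
    # Note: with the defaults (1000-word chunks, 100-word overlap) the chunk
    # start positions advance by 900 words: 0, 900, 1800, ..., so consecutive
    # chunks share 100 words of context.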

    # STEP 1: Find the most relevant webpage
    results = DDGS().text(query, max_results=1)
    top_rank_page = results[0] if results else None
    if not top_rank_page:
        return "No relevant results found for the query."

    # STEP 2: Extract the content of the webpage
    md = MarkItDown(enable_plugins=True)
    md_result = md.convert(top_rank_page['href'])
    page_content = md_result.text_content

    # STEP 3: Apply chunking
    chunks = _chunk_text(page_content)
    if not chunks:
        return "Could not extract any content from the top-ranked page."

    # STEP 4: Rank the chunks against the query and return the best match
    list_of_docs = [
        Document(page_content=chunk, metadata={"source": top_rank_page['href'], "title": top_rank_page['title']})
        for chunk in chunks
    ]
    retriever = BM25Retriever.from_documents(list_of_docs)
    matched = retriever.invoke(query)
    return matched[0]
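
# Illustrative usage (hypothetical query; the actual result depends on live
# DuckDuckGo rankings and on the page content at call time):
#   doc = web_search("largest moon of Saturn")
#   doc.page_content  # best-matching chunk of the top-ranked page
#   doc.metadata      # {"source": <page URL>, "title": <page title>}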


# TODO:
# Maybe don't return the summary, but the full document?
def wikipedia_search(query: str) -> str:
    """
    Search Wikipedia for a given query and return a summary of the top result.
    Args:
        query (str): The search term.
    Returns:
        str: A summary of the most relevant Wikipedia entry.
    """
    wikipedia_retriever = WikipediaRetriever(load_max_docs=1)
    documents = wikipedia_retriever.invoke(query)
    if not documents:
        return "No relevant Wikipedia articles found."
    formatted_search_docs = "\n\n---\n\n".join(
        [
            f'<Document source="{doc.metadata["source"]}" title="{doc.metadata.get("title", "")}">\n{doc.metadata["summary"]}\n</Document>'
            for doc in documents
        ])
    # Return the summary of the top document, wrapped in a <Document> tag
    return formatted_search_docs
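
# Illustrative usage (hypothetical query; the exact output depends on Wikipedia metadata):
#   wikipedia_search("Alan Turing")
#   -> '<Document source="<article URL>" title="Alan Turing">\n<article summary>\n</Document>'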


def arxiv_search(query: str) -> str:
    """
    Search Arxiv for academic papers based on a query and return summaries of the top results.
    Args:
        query (str): The search query for Arxiv.
    Returns:
        str: Summaries of the top few relevant papers from Arxiv.
    """
    try:
        loader = ArxivLoader(query=query, load_max_docs=2)
        documents = loader.load()
        if not documents:
            return "No relevant papers found on Arxiv."
        # Format and return the top paper summaries
        results = []
        for doc in documents:
            title = doc.metadata.get("Title", "No Title")
            published = doc.metadata.get("Published", "Unknown date")
            url = doc.metadata.get("entry_id", "No URL")
            summary = doc.page_content[:500]  # limit summary length
            results.append(f"Title: {title}\nPublished: {published}\nURL: {url}\nSummary: {summary}\n")
        return "\n---\n".join(results)
    except Exception as e:
        return f"An error occurred while searching Arxiv: {str(e)}"


def check_commutativity(table_str: str) -> str:
    """
    Given a binary operation table (in markdown format), returns the subset of elements
    involved in counter-examples to commutativity, sorted alphabetically.
    Args:
        table_str (str): Markdown table defining the operation * on a finite set.
    Returns:
        str: Comma-separated list of elements in the counter-example set, alphabetically sorted.
    """
    # Read the table using pandas
    df = pd.read_csv(StringIO(table_str), sep="|", skipinitialspace=True, engine='python')
    # Drop empty columns due to leading/trailing pipes
    df = df.dropna(axis=1, how="all")
    df.columns = [c.strip() for c in df.columns]
    df = df.dropna(axis=0, how="all")
    # Extract the header elements and index the rows by the first column
    elements = df.columns[1:]
    df.index = df[df.columns[0]].astype(str).str.strip()
    df = df.drop(df.columns[0], axis=1)
    # Drop the markdown separator row (e.g. "---") and strip whitespace from cells
    df = df[~df.index.str.match(r"^:?-+:?$")]
    df = df.apply(lambda col: col.astype(str).str.strip())
    # Check commutativity: a*b == b*a for every pair of elements
    counterexample_elements = set()
    for x in elements:
        for y in elements:
            if df.loc[x, y] != df.loc[y, x]:
                counterexample_elements.add(x)
                counterexample_elements.add(y)
    return ", ".join(sorted(counterexample_elements))


def extract_sales_data_from_excel(url: str) -> str:
    """
    Downloads and extracts sales data from an Excel file at the given URL.
    Returns the contents of the first sheet as a plain-text table.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        excel_file = BytesIO(response.content)
        df = pd.read_excel(excel_file)
        # Optional: Remove unnamed columns often created by Excel
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        # Convert all numeric columns to float
        for col in df.select_dtypes(include=["number"]).columns:
            df[col] = df[col].astype(float)
        return df.to_string(index=False)
    except Exception as e:
        return f"Failed to process Excel file from URL: {str(e)}"


def extract_transcript_from_youtube(url: str) -> str:
    """
    Extracts the transcript from a YouTube video given its URL.
    Args:
        url (str): The YouTube video URL.
    Returns:
        str: The transcript of the video, or an error message if extraction fails.
    """
    transcript_str = "### Transcript"
    md = MarkItDown(enable_plugins=True)
    try:
        result = md.convert(url)
    except Exception as e:
        return f"Failed to extract transcript from YouTube video: {str(e)}"
    # Keep only the transcript section of the converted page, if one is present
    parts = result.text_content.split(transcript_str)
    if len(parts) < 2:
        return result.text_content
    transcript = transcript_str + "\n" + parts[1]
    return transcript.strip()
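
# Illustrative usage (hypothetical URL; the splitting above assumes MarkItDown's
# YouTube conversion includes a "### Transcript" section):
#   extract_transcript_from_youtube("https://www.youtube.com/watch?v=<video_id>")
#   -> "### Transcript\n..." or the full converted page text if no transcript is found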


# @tool
# def extract_transcript_from_audio(url: str) -> str:
#     """
#     Extracts the transcript from an audio file given its URL.
#     Supported formats: mp3, wav.
#     Args:
#         url (str): The URL of the audio file.
#     Returns:
#         str: The transcript of the audio file, or an error message if extraction fails.
#     """
#     md = MarkItDown(enable_plugins=True)
#     try:
#         result = md.convert(url)
#     except Exception as e:
#         return f"Failed to extract transcript from audio: {str(e)}"
#     return result.text_content