import pandas as pd
import requests
from io import BytesIO
from io import StringIO
from langchain_core.tools import tool
from langchain_community.retrievers import WikipediaRetriever
from langchain_community.document_loaders import ArxivLoader
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
from duckduckgo_search import DDGS
from markitdown import MarkItDown


# --------------- Math Tools ---------------- #

@tool
def add_numbers(a: int, b: int) -> int:
    """Add two numbers.

    Args:
        a (int): The first number.
        b (int): The second number.
    """
    return a + b


@tool
def add_numbers_in_list(numbers: list[float]) -> float:
    """Add all numbers in a list.

    Always use this tool for summing numerical values, instead of doing math
    directly in the response.

    Args:
        numbers (list[float]): A list of numbers to add.
    """
    return sum(numbers)


# @tool
# def web_search(query: str) -> str:
#     """Perform a web search using DuckDuckGo.
#
#     Args:
#         query (str): The search query.
#
#     Returns:
#         str: The search results.
#     """
#     search_tool = DuckDuckGoSearchRun()
#     return search_tool.invoke(query)


@tool
def web_search(query: str) -> str:
    """Perform a web search using DuckDuckGo.

    Visits the top-ranked page, splits its content into overlapping chunks,
    ranks the chunks against the query with BM25, and returns the
    best-matching chunk.

    Args:
        query (str): The search query.

    Returns:
        Document: The best-matching chunk as a langchain_core.documents.Document
            with 'page_content' (the chunk text) and 'metadata' (source URL and
            title), or an error message string if no results were found.
    """

    def _chunk_text(text, chunk_size_words=1000, overlap_words=100):
        """Split text into word-based chunks of a given size with overlap.

        Args:
            text (str): The text to be chunked.
            chunk_size_words (int): The number of words per chunk.
            overlap_words (int): The number of overlapping words between chunks.

        Returns:
            list: A list of text chunks.
        """
        words = text.split()
        chunks = []
        for i in range(0, len(words), chunk_size_words - overlap_words):
            chunk = " ".join(words[i:i + chunk_size_words])
            chunks.append(chunk)
        return chunks

    # STEP 1: Find the most relevant webpage
    results = DDGS().text(query, max_results=1)
    top_rank_page = results[0] if results else None
    if not top_rank_page:
        return "No relevant results found for the query."

    # STEP 2: Extract the content of the webpage as text
    md = MarkItDown(enable_plugins=True)
    md_result = md.convert(top_rank_page['href'])
    page_content = md_result.text_content

    # STEP 3: Split the page content into overlapping chunks
    chunks = _chunk_text(page_content)

    # STEP 4: Rank the chunks against the query and keep the best match
    list_of_docs = [
        Document(
            page_content=chunk,
            metadata={"source": top_rank_page['href'], "title": top_rank_page['title']},
        )
        for chunk in chunks
    ]
    retriever = BM25Retriever.from_documents(list_of_docs)
    matched = retriever.invoke(query)
    return matched[0]


# TODO: Maybe don't return the summary, but the full document?
@tool
def wikipedia_search(query: str) -> str:
    """Search Wikipedia for a given query and return a summary of the top result.

    Args:
        query (str): The search term.

    Returns:
        str: A summary of the most relevant Wikipedia entry.
    """
    wikipedia_retriever = WikipediaRetriever(load_max_docs=1)
    documents = wikipedia_retriever.invoke(query)
    if not documents:
        return "No relevant Wikipedia articles found."

    # Join the article summaries, separated by horizontal rules
    formatted_search_docs = "\n\n---\n\n".join(
        [f'\n{doc.metadata["summary"]}\n' for doc in documents]
    )
    return formatted_search_docs


@tool
def arxiv_search(query: str) -> str:
    """Search Arxiv for academic papers based on a query and return summaries of top results.

    Args:
        query (str): The search query for Arxiv.

    Returns:
        str: Summary of the top few relevant papers from Arxiv.
    """
    try:
        loader = ArxivLoader(query=query, load_max_docs=2)
        documents = loader.load()
        if not documents:
            return "No relevant papers found on Arxiv."

        # Format and return top paper summaries
        results = []
        for doc in documents:
            title = doc.metadata.get("Title", "No Title")
            published = doc.metadata.get("Published", "Unknown date")
            url = doc.metadata.get("entry_id", "No URL")
            summary = doc.page_content[:500]  # limit summary length
            results.append(
                f"Title: {title}\nPublished: {published}\nURL: {url}\nSummary: {summary}\n"
            )
        return "\n---\n".join(results)
    except Exception as e:
        return f"An error occurred while searching Arxiv: {str(e)}"


@tool
def check_commutativity(table_str: str) -> str:
    """Given a binary operation table (in markdown format), return the subset of
    elements involved in counter-examples to commutativity, sorted alphabetically.

    Args:
        table_str (str): Markdown table defining the operation * on a finite set.

    Returns:
        str: Comma-separated list of elements in the counter-example set,
            alphabetically sorted.
    """
    # Read the table using pandas
    df = pd.read_csv(StringIO(table_str), sep="|", skipinitialspace=True, engine='python')

    # Drop empty columns due to leading/trailing pipes
    df = df.dropna(axis=1, how="all")
    df.columns = [c.strip() for c in df.columns]
    df = df.dropna(axis=0, how="all")

    # Strip stray whitespace from cell values and drop the markdown separator row (|---|---|)
    for col in df.columns:
        df[col] = df[col].astype(str).str.strip()
    df = df[~df[df.columns[0]].str.match(r"^:?-{2,}")]

    # Extract header and values: both the index and the columns are the set elements
    elements = df.columns[1:]
    df.index = df[df.columns[0]]
    df = df.drop(df.columns[0], axis=1)

    # Check commutativity: a*b == b*a for every pair (a, b)
    counterexample_elements = set()
    for x in elements:
        for y in elements:
            if df.loc[x, y] != df.loc[y, x]:
                counterexample_elements.add(x)
                counterexample_elements.add(y)

    return ", ".join(sorted(counterexample_elements))
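
# Illustrative input for check_commutativity (hypothetical table, not taken from
# any dataset): a three-element operation table in markdown where b*c != c*b.
#
#   | * | a | b | c |
#   |---|---|---|---|
#   | a | a | b | c |
#   | b | b | c | a |
#   | c | c | b | b |
#
# For this table the tool returns "b, c", the elements involved in the
# commutativity counter-example.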

@tool
def extract_sales_data_from_excel(url: str) -> str:
    """Downloads and extracts sales data from an Excel file at the given URL.

    Returns the contents of the first sheet as a plain-text table.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        excel_file = BytesIO(response.content)
        df = pd.read_excel(excel_file)

        # Optional: remove unnamed columns often created by Excel
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

        # Convert all numeric columns to float
        for col in df.select_dtypes(include=["number"]).columns:
            df[col] = df[col].astype(float)

        return df.to_string(index=False)
    except Exception as e:
        return f"Failed to process Excel file from URL: {str(e)}"


@tool
def extract_transcript_from_youtube(url: str) -> str:
    """Extracts the transcript from a YouTube video given its URL.

    Args:
        url (str): The YouTube video URL.

    Returns:
        str: The transcript of the video, or an error message if extraction fails.
    """
    transcript_str = "### Transcript"
    md = MarkItDown(enable_plugins=True)
    try:
        result = md.convert(url)
    except Exception as e:
        return f"Failed to extract transcript from YouTube video: {str(e)}"

    # The converted page is expected to contain the transcript under a
    # "### Transcript" heading; keep everything from that heading onwards.
    parts = result.text_content.split(transcript_str)
    if len(parts) < 2:
        # No transcript section found: fall back to the full converted page
        return result.text_content

    transcript = transcript_str + "\n" + parts[1]
    return transcript.strip()
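
# Hypothetical usage of the YouTube tool (placeholder URL, not a real video):
#
#   extract_transcript_from_youtube.invoke(
#       {"url": "https://www.youtube.com/watch?v=..."}
#   )
#
# LangChain @tool-decorated functions are invoked with a dict of their
# arguments; the call returns the transcript text or an error message.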
# """ # md = MarkItDown(enable_plugins=True) # try: # result = md.convert(url) # except Exception as e: # return f"Failed to extract transcript from audio: {str(e)}" # return result.text_content