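# Tool functions for a LangChain-based agent: math helpers, DuckDuckGo / Wikipedia /
# Arxiv search, and extraction utilities for Excel files and YouTube transcripts.
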
import pandas as pd
import requests
from io import BytesIO, StringIO

from langchain_core.tools import tool
from langchain_community.retrievers import WikipediaRetriever
from langchain_community.document_loaders import ArxivLoader
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document

from duckduckgo_search import DDGS
from markitdown import MarkItDown

# --------------- Math Tools ---------------- #

def add_numbers(a: int, b: int) -> int:
    """Add two numbers.
    Args:
        a (int): The first number.
        b (int): The second number.
    """
    return a + b


def add_numbers_in_list(numbers: list[float]) -> float:
    """Add all numbers in a list.
    Always use this tool for summing numerical values, instead of doing math directly in the response.
    Args:
        numbers (list[float]): A list of numbers to add.
    """
    return sum(numbers)
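
# Example usage: add_numbers(2, 3) -> 5; add_numbers_in_list([1.5, 2.5, 3.0]) -> 7.0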


# @tool
# def web_search(query: str) -> str:
#     """Perform a web search using DuckDuckGo.
#     Args:
#         query (str): The search query.
#     Returns:
#         str: The search results.
#     """
#     search_tool = DuckDuckGoSearchRun()
#     return search_tool.invoke(query)


def web_search(query: str) -> Document | str:
    """
    Perform a web search using DuckDuckGo. Visit the top-ranked page, split its
    content into chunks, rank the chunks against the query with BM25, and return
    the best-matching chunk.
    Args:
        query (str): The search query.
    Returns:
        Document: The best-matching chunk as a langchain_core.documents.Document,
        with 'page_content' holding the chunk text and 'metadata' holding the
        source URL and title, or a plain string message if nothing was found.
    """
    def _chunk_text(text, chunk_size_words=1000, overlap_words=100):
        """
        Split text into word-based chunks of a given size with overlap.
        Args:
            text (str): The text to be chunked.
            chunk_size_words (int): The number of words in each chunk.
            overlap_words (int): The number of words shared between consecutive chunks.
        Returns:
            list: A list of text chunks.
        """
        words = text.split()
        chunks = []
        for i in range(0, len(words), chunk_size_words - overlap_words):
            chunk = " ".join(words[i:i + chunk_size_words])
            chunks.append(chunk)
        return chunks
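
    # Note: with the defaults (1000-word chunks, 100-word overlap) the chunk
    # start positions advance by 900 words: 0, 900, 1800, ..., so consecutive
    # chunks share 100 words of context.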

    # STEP 1: Find the most relevant webpage
    results = DDGS().text(query, max_results=1)
    top_rank_page = results[0] if results else None
    if not top_rank_page:
        return "No relevant results found for the query."

    # STEP 2: Extract the content of the webpage
    md = MarkItDown(enable_plugins=True)
    md_result = md.convert(top_rank_page['href'])
    page_content = md_result.text_content

    # STEP 3: Apply chunking
    chunks = _chunk_text(page_content)
    if not chunks:
        return "Could not extract any content from the top-ranked page."

    # STEP 4: Rank the chunks against the query and return the best match
    list_of_docs = [
        Document(page_content=chunk, metadata={"source": top_rank_page['href'], "title": top_rank_page['title']})
        for chunk in chunks
    ]
    retriever = BM25Retriever.from_documents(list_of_docs)
    matched = retriever.invoke(query)
    return matched[0]
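
# Illustrative usage (hypothetical query; the actual result depends on live
# DuckDuckGo rankings and on the page content at call time):
#   doc = web_search("largest moon of Saturn")
#   doc.page_content  # best-matching chunk of the top-ranked page
#   doc.metadata      # {"source": <page URL>, "title": <page title>}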


# TODO:
# Maybe don't return the summary, but the full document?
def wikipedia_search(query: str) -> str:
    """
    Search Wikipedia for a given query and return a summary of the top result.
    Args:
        query (str): The search term.
    Returns:
        str: A summary of the most relevant Wikipedia entry.
    """
    wikipedia_retriever = WikipediaRetriever(load_max_docs=1)
    documents = wikipedia_retriever.invoke(query)
    if not documents:
        return "No relevant Wikipedia articles found."
    formatted_search_docs = "\n\n---\n\n".join(
        [
            f'<Document source="{doc.metadata["source"]}" title="{doc.metadata.get("title", "")}">\n{doc.metadata["summary"]}\n</Document>'
            for doc in documents
        ])
    # Return the summary of the top document, wrapped in a <Document> tag
    return formatted_search_docs
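
# Illustrative usage (hypothetical query; the exact output depends on Wikipedia metadata):
#   wikipedia_search("Alan Turing")
#   -> '<Document source="<article URL>" title="Alan Turing">\n<article summary>\n</Document>'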


def arxiv_search(query: str) -> str:
    """
    Search Arxiv for academic papers based on a query and return summaries of the top results.
    Args:
        query (str): The search query for Arxiv.
    Returns:
        str: Summaries of the top few relevant papers from Arxiv.
    """
    try:
        loader = ArxivLoader(query=query, load_max_docs=2)
        documents = loader.load()
        if not documents:
            return "No relevant papers found on Arxiv."
        # Format and return the top paper summaries
        results = []
        for doc in documents:
            title = doc.metadata.get("Title", "No Title")
            published = doc.metadata.get("Published", "Unknown date")
            url = doc.metadata.get("entry_id", "No URL")
            summary = doc.page_content[:500]  # limit summary length
            results.append(f"Title: {title}\nPublished: {published}\nURL: {url}\nSummary: {summary}\n")
        return "\n---\n".join(results)
    except Exception as e:
        return f"An error occurred while searching Arxiv: {str(e)}"


def check_commutativity(table_str: str) -> str:
    """
    Given a binary operation table (in markdown format), returns the subset of elements
    involved in counter-examples to commutativity, sorted alphabetically.
    Args:
        table_str (str): Markdown table defining the operation * on a finite set.
    Returns:
        str: Comma-separated list of elements in the counter-example set, alphabetically sorted.
    """
    # Read the table using pandas
    df = pd.read_csv(StringIO(table_str), sep="|", skipinitialspace=True, engine='python')
    # Drop empty columns due to leading/trailing pipes
    df = df.dropna(axis=1, how="all")
    df.columns = [c.strip() for c in df.columns]
    df = df.dropna(axis=0, how="all")
    # Extract the header elements and index the rows by the first column
    elements = df.columns[1:]
    df.index = df[df.columns[0]].astype(str).str.strip()
    df = df.drop(df.columns[0], axis=1)
    # Drop the markdown separator row (e.g. "---") and strip whitespace from cells
    df = df[~df.index.str.match(r"^:?-+:?$")]
    df = df.apply(lambda col: col.astype(str).str.strip())
    # Check commutativity: a*b == b*a for every pair of elements
    counterexample_elements = set()
    for x in elements:
        for y in elements:
            if df.loc[x, y] != df.loc[y, x]:
                counterexample_elements.add(x)
                counterexample_elements.add(y)
    return ", ".join(sorted(counterexample_elements))


def extract_sales_data_from_excel(url: str) -> str:
    """
    Downloads and extracts sales data from an Excel file at the given URL.
    Returns the contents of the first sheet as a plain-text table.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        excel_file = BytesIO(response.content)
        df = pd.read_excel(excel_file)
        # Optional: Remove unnamed columns often created by Excel
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        # Convert all numeric columns to float
        for col in df.select_dtypes(include=["number"]).columns:
            df[col] = df[col].astype(float)
        return df.to_string(index=False)
    except Exception as e:
        return f"Failed to process Excel file from URL: {str(e)}"


def extract_transcript_from_youtube(url: str) -> str:
    """
    Extracts the transcript from a YouTube video given its URL.
    Args:
        url (str): The YouTube video URL.
    Returns:
        str: The transcript of the video, or an error message if extraction fails.
    """
    transcript_str = "### Transcript"
    md = MarkItDown(enable_plugins=True)
    try:
        result = md.convert(url)
    except Exception as e:
        return f"Failed to extract transcript from YouTube video: {str(e)}"
    # Keep only the transcript section of the converted page, if one is present
    parts = result.text_content.split(transcript_str)
    if len(parts) < 2:
        return result.text_content
    transcript = transcript_str + "\n" + parts[1]
    return transcript.strip()
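
# Illustrative usage (hypothetical URL; the splitting above assumes MarkItDown's
# YouTube conversion includes a "### Transcript" section):
#   extract_transcript_from_youtube("https://www.youtube.com/watch?v=<video_id>")
#   -> "### Transcript\n..." or the full converted page text if no transcript is found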


# @tool
# def extract_transcript_from_audio(url: str) -> str:
#     """
#     Extracts the transcript from an audio file given its URL.
#     Supported formats: mp3, wav.
#     Args:
#         url (str): The URL of the audio file.
#     Returns:
#         str: The transcript of the audio file, or an error message if extraction fails.
#     """
#     md = MarkItDown(enable_plugins=True)
#     try:
#         result = md.convert(url)
#     except Exception as e:
#         return f"Failed to extract transcript from audio: {str(e)}"
#     return result.text_content