Pycrolis committed
Commit e049457
2 Parent(s): d8b8674 508a421

Merge branch 'feat/add-tools'
README.md CHANGED
@@ -15,10 +15,21 @@ short_description: Agent for the final hands-on assignment of the Agents course
 
 ## Requirements
 
-To run this project, you need to have an OpenAI API key. Set your `OPENAI_API_KEY` as an environment variable:
+### Prerequisites
+- Python 3.x
+- Virtual environment (recommended)
 
+### API Keys
+This project requires the following API keys:
+- OpenAI API key
+- Tavily API key
+
+Set them as environment variables:
 ```bash
 export OPENAI_API_KEY='your-api-key'
+export TAVILY_API_KEY='your-api-key'
 ```
 
-You can get your OpenAI API key from [here](https://platform.openai.com/account/api-keys).
+You can get the required API keys here:
+- OpenAI API key: [OpenAI Platform](https://platform.openai.com/account/api-keys)
+- Tavily API key: [Tavily](https://tavily.com)
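
A quick sanity check (not part of the commit) that both keys are actually visible to the Python process before running the agent:

```python
# Hypothetical helper, not in the repo: fail fast on missing keys.
import os

for key in ("OPENAI_API_KEY", "TAVILY_API_KEY"):
    if not os.environ.get(key):
        raise SystemExit(f"{key} is not set")
print("All required API keys are set.")
```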
ShrewdAgent.py CHANGED
@@ -4,6 +4,7 @@ from typing import TypedDict, Annotated, Optional, Any, Callable, Sequence, Unio
 from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
 from langchain_core.tools import BaseTool
 from langchain_openai import ChatOpenAI
+from langchain_tavily import TavilySearch
 from langgraph.constants import START
 from langgraph.errors import GraphRecursionError
 from langgraph.graph import add_messages, StateGraph
@@ -12,6 +13,11 @@ from langgraph.pregel import PregelProtocol
 from loguru import logger
 from pydantic import SecretStr
 
+from tools.produce_classifier import produce_classifier
+from tools.web_page_information_extractor import web_page_information_extractor
+from tools.wikipedia_search import wikipedia_search
+from tools.youtube_transcript import youtube_transcript
+
 
 class AgentState(TypedDict):
     messages: Annotated[list[AnyMessage], add_messages]
@@ -32,7 +38,13 @@ class ShrewdAgent:
     Important: Your final output must be only a number or a short phrase, with no additional text or explanation."""
 
     def __init__(self):
-        self.tools = []
+        self.tools = [
+            TavilySearch(),
+            wikipedia_search,
+            web_page_information_extractor,
+            youtube_transcript,
+            produce_classifier,
+        ]
         self.llm = ChatOpenAI(
             model="gpt-4o-mini",
             temperature=0,
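
The hunk above registers the tool list but does not show how it is wired into the graph. Below is a minimal sketch of the usual LangGraph pattern, using the standard `bind_tools`/`ToolNode` APIs; this wiring is an assumption, not shown in the commit:

```python
# Sketch (assumption, not from the commit): typical LangGraph tool wiring.
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import ToolNode

from tools.wikipedia_search import wikipedia_search
from tools.youtube_transcript import youtube_transcript

tools = [wikipedia_search, youtube_transcript]
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

llm_with_tools = llm.bind_tools(tools)  # lets the model emit tool calls
tool_node = ToolNode(tools)             # graph node that executes those calls
```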
requirements.txt CHANGED
@@ -5,4 +5,9 @@ langchain-core~=0.3.60
 langchain-openai~=0.3.17
 langgraph~=0.4.5
 loguru~=0.7.3
-pydantic~=2.11.4
+pydantic~=2.11.4
+html2text~=2025.4.15
+beautifulsoup4~=4.13.4
+readability-lxml~=0.8.4.1
+youtube-transcript-api~=1.0.3
+wikipedia~=1.4.0
tools/produce_classifier.py ADDED
@@ -0,0 +1,44 @@
+import os
+
+from langchain_core.tools import tool
+from langchain_openai import ChatOpenAI
+from loguru import logger
+from pydantic import SecretStr
+
+
+@tool("produce_classifier_tool", parse_docstring=True)
+def produce_classifier(food_name: str) -> str:
+    """
+    Classifies a food item as either 'fruit' or 'vegetable' from a botanical perspective.
+
+    Args:
+        food_name (str): The name of the food item to classify.
+
+    Returns:
+        str: The classification of the food item, either 'fruit' or 'vegetable'.
+    """
+    logger.info(f"use produce_classifier_tool with param: {food_name}")
+    chat = ChatOpenAI(
+        model="gpt-4o-mini",
+        temperature=0,
+        api_key=SecretStr(os.environ['OPENAI_API_KEY'])
+    )
+
+    prompt = (f"From a botanical perspective, classify {food_name} as either 'fruit' or 'vegetable'. "
+              # f"If it's not a produce name, classify as 'neither'. "
+              f"Respond only with a JSON in this exact format: "
+              f"{{\"name\": \"{food_name}\", \"kind\": \"[classification]\"}}, "
+              f"where [classification] should be replaced with just 'fruit' or 'vegetable'. "
+              f"No other text or explanation.")
+    return chat.invoke(prompt).content
+
+def _print_produce_kind(food_name: str):
+    print(f'{food_name}: {produce_classifier.invoke(food_name)}')
+
+if __name__ == "__main__":
+    # _print_produce_kind("orange")
+    # _print_produce_kind("sweet potatoes")
+    # _print_produce_kind("bell pepper")
+    # _print_produce_kind("egg")
+    # _print_produce_kind("table")
+    _print_produce_kind("zucchini")
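
For reference, the output this prompt asks for should look like the JSON below (illustrative only; the model's actual response can vary):

```python
# Illustrative usage; the exact model output is not guaranteed.
from tools.produce_classifier import produce_classifier

print(produce_classifier.invoke("zucchini"))
# e.g. {"name": "zucchini", "kind": "fruit"}  (botanically, zucchini is a fruit)
```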
tools/web_page_information_extractor.py ADDED
@@ -0,0 +1,106 @@
+import os
+from io import StringIO
+
+import html2text
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+from langchain_core.messages import SystemMessage, HumanMessage
+from langchain_core.tools import tool
+from langchain_openai import ChatOpenAI
+from loguru import logger
+from pydantic import SecretStr
+from readability import Document
+
+
+@tool("web_page_information_extractor_tool", parse_docstring=True)
+def web_page_information_extractor(url: str, request: str) -> str:
+    """
+    Extracts specific information from a web page based on the user's request.
+
+    This function uses a language model to extract information from the content
+    of a web page specified by the URL. The user's request specifies the type of
+    information to be extracted. The function returns the extracted information as
+    a JSON string.
+
+    Args:
+        url (str): The URL of the web page to extract information from.
+        request (str): The user's request describing the information to extract.
+
+    Returns:
+        str: The extracted information in JSON format.
+    """
+    logger.info(f"use web_page_information_extractor with param: url:{url}, request:{request}")
+    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
+    response.raise_for_status()  # raises HTTPError for bad responses
+    html = response.text
+    doc = Document(html)
+    cleaned_html = doc.summary()
+
+    soup = BeautifulSoup(cleaned_html, "html.parser")
+
+    # Extract wiki-style tables, which html2text handles poorly
+    tables = soup.find_all('table', class_='wikitable')
+    tables_text = ""
+
+    for i, table in enumerate(tables, 1):
+        # Find the nearest preceding h2 or h3 header
+        header = table.find_previous(['h2', 'h3'])
+        section_title = header.get_text().strip() if header else "Untitled Section"
+        try:
+            # Convert the table to a pandas DataFrame via StringIO
+            table_html = str(table).replace('\n', '')  # remove newlines for better parsing
+            df = pd.read_html(StringIO(table_html))[0]
+
+            # Format the table with section title, context, and clean layout
+            tables_text += f"\nSection: {section_title}\n"
+            tables_text += "=" * 40 + "\n"
+            tables_text += df.to_string(index=False) + "\n\n"
+        except Exception as e:
+            tables_text += f"\nError processing table in section {section_title}: {str(e)}\n"
+            continue
+
+    # Convert the cleaned HTML to Markdown
+    markdown_converter = html2text.HTML2Text()
+    markdown_converter.ignore_links = False
+    markdown_converter.bypass_tables = False
+    markdown_converter.ignore_images = True  # optional
+    markdown_converter.body_width = 0  # don't wrap lines
+
+    text = markdown_converter.handle(cleaned_html)
+    if tables_text:
+        text += f'Tables:\n{tables_text}'
+
+    logger.debug(f"web_page_information_extractor text: {text}")
+
+    chat = ChatOpenAI(
+        model="gpt-4o-mini",
+        temperature=0,
+        api_key=SecretStr(os.environ['OPENAI_API_KEY'])
+    )
+
+    system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
+    extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""
+
+    extracted_information = chat.invoke([
+        SystemMessage(system_message),
+        HumanMessage(extraction_user_prompt)
+    ])
+    return extracted_information.content
+
+
+if __name__ == "__main__":
+    # result = web_page_information_extractor.invoke(
+    #     {"url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
+    #      "request": "What are the changes introduced in Python 3.11"})
+    # print(result)
+
+    result = web_page_information_extractor.invoke(
+        {"url": "https://en.wikipedia.org/wiki/1928_Summer_Olympics",
+         "request": "List of countries and number of athletes at the 1928 Summer Olympics"})
+    print(result)
+
+    # result = web_page_information_extractor.invoke(
+    #     {"url": "https://chem.libretexts.org/Courses/Chabot_College/Introduction_to_General_Organic_and_Biochemistry/01%3A_Chemistry_in_our_Lives/1.E%3A_Exercises",
+    #      "request": "What is the surname of the equine veterinarian mentioned"})
+    # print(result)
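
The heart of the tool is the readability-then-html2text cleaning step; here is a minimal self-contained sketch of just that pipeline (the sample HTML is invented for illustration):

```python
# Minimal sketch of the cleaning pipeline: readability isolates the main
# content, html2text flattens it to Markdown. The sample HTML is made up.
import html2text
from readability import Document

html = """
<html><head><title>Demo</title></head><body>
  <nav>site chrome that readability should drop</nav>
  <article><h1>Demo</h1><p>The <b>main</b> content.</p></article>
</body></html>
"""

cleaned = Document(html).summary()  # keeps only the article body
converter = html2text.HTML2Text()
converter.body_width = 0            # don't hard-wrap lines
print(converter.handle(cleaned))    # Markdown-ish plain text
```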
tools/wikipedia_search.py ADDED
@@ -0,0 +1,41 @@
+import wikipedia
+from langchain_core.tools import tool
+from loguru import logger
+
+
+@tool("wikipedia_search_tool", parse_docstring=True)
+def wikipedia_search(query: str) -> str:
+    """
+    Searches Wikipedia for the given query.
+
+    Args:
+        query (str): The search query to look up on Wikipedia.
+
+    Returns:
+        str: A formatted string listing each result's page title and URL.
+    """
+    logger.info(f"use wikipedia_search_tool with param: {query}")
+
+    search_results = wikipedia.search(query, results=5)
+
+    if not search_results:
+        return "No results found for the query."
+
+    result_text = ""
+    try:
+        for i, title in enumerate(search_results, 1):
+            page = wikipedia.page(title, auto_suggest=False)
+            result_text += f"{i}. [{title}]({page.url})\n"
+
+        return result_text
+
+    except wikipedia.DisambiguationError as e:
+        return "Disambiguation page found. Possible matches:\n" + "\n".join(e.options)
+    except wikipedia.PageError:
+        return "Page not found. Try another search term."
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
+
+
+if __name__ == "__main__":
+    print(wikipedia_search.invoke("Mercedes Sosa discography"))
tools/youtube_transcript.py ADDED
@@ -0,0 +1,32 @@
+from langchain_core.tools import tool
+from loguru import logger
+from youtube_transcript_api import YouTubeTranscriptApi
+
+
+@tool("youtube_transcript_tool", parse_docstring=True)
+def youtube_transcript(video_id: str) -> str:
+    """
+    Fetches the transcript of a YouTube video using its video ID.
+
+    The video ID must be provided to successfully fetch the transcript.
+
+    Args:
+        video_id (str): The unique identifier of a YouTube video. You can retrieve the video_id from the URL of the video. For example, with the URL https://www.youtube.com/watch?v=12345 the video_id is 12345.
+
+    Returns:
+        str: The transcript of the video, formatted as one bullet point per caption entry.
+
+    Raises:
+        Any exception raised by YouTubeTranscriptApi when fetching
+        the transcript fails.
+    """
+    logger.info(f"use youtube_transcript with param: {video_id}")
+    transcript = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
+
+    bullet_points = '\n'.join(f"- {entry['text']}" for entry in transcript)
+
+    return bullet_points
+
+if __name__ == "__main__":
+    transcript = youtube_transcript.invoke("1htKBjuUWec")
+    print(transcript)
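
`fetch()` raises when a video has no captions; below is a small defensive-usage sketch, assuming the exception classes that the youtube-transcript-api package exports (an assumption worth verifying against the installed version):

```python
# Sketch (not in the commit): handle the common "no transcript" failure modes.
from youtube_transcript_api import (
    NoTranscriptFound,
    TranscriptsDisabled,
    YouTubeTranscriptApi,
)

try:
    transcript = YouTubeTranscriptApi().fetch("1htKBjuUWec")
except (TranscriptsDisabled, NoTranscriptFound) as err:
    print(f"No transcript available: {err}")
```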