Pycrolis committed · Commit fe989a0 · Parent(s): 5e2ccaa

feat(tool): add web page information extraction tool

Files changed:
- ShrewdAgent.py (+6, -1)
- requirements.txt (+4, -1)
- tools/web_page_information_extractor.py (+106, -0)
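In short: the new web_page_information_extractor tool fetches a URL, strips boilerplate with readability-lxml, pulls wikitable tables out with BeautifulSoup and pandas, converts the cleaned page to Markdown with html2text, and then asks gpt-4o-mini to return just the requested information as JSON. ShrewdAgent registers it alongside TavilySearch, and requirements.txt pins the three new HTML-processing dependencies.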
ShrewdAgent.py CHANGED

@@ -13,6 +13,8 @@ from langgraph.pregel import PregelProtocol
 from loguru import logger
 from pydantic import SecretStr
 
+from tools.web_page_information_extractor import web_page_information_extractor
+
 
 class AgentState(TypedDict):
     messages: Annotated[list[AnyMessage], add_messages]
@@ -33,7 +35,10 @@ class ShrewdAgent:
     Important: Your final output must be only a number or a short phrase, with no additional text or explanation."""
 
     def __init__(self):
-        self.tools = [
+        self.tools = [
+            TavilySearch(),
+            web_page_information_extractor,
+        ]
         self.llm = ChatOpenAI(
             model="gpt-4o-mini",
             temperature=0,
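For context, registering a function in self.tools only matters once the tools are bound to the chat model, and that wiring lives outside this hunk. The following is a minimal hypothetical sketch of the usual LangChain pattern, not ShrewdAgent's actual code:

# Hypothetical sketch, not part of this commit: the usual way a tool list
# like self.tools is exposed to the model in a LangChain/LangGraph agent.
from langchain_openai import ChatOpenAI

from tools.web_page_information_extractor import web_page_information_extractor

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# bind_tools advertises each tool's name, docstring, and argument schema,
# so the model can respond with structured tool calls instead of plain text.
llm_with_tools = llm.bind_tools([web_page_information_extractor])

ai_msg = llm_with_tools.invoke(
    "How many athletes did France send to the 1928 Summer Olympics?"
)
print(ai_msg.tool_calls)  # expected: a call to web_page_information_extractor_tool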
requirements.txt CHANGED

@@ -5,4 +5,7 @@ langchain-core~=0.3.60
 langchain-openai~=0.3.17
 langgraph~=0.4.5
 loguru~=0.7.3
-pydantic~=2.11.4
+pydantic~=2.11.4
+html2text~=2025.4.15
+beautifulsoup4~=4.13.4
+readability-lxml~=0.8.4.1
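The three new pins map directly onto the new tool's imports: readability-lxml supplies the Document boilerplate stripper, beautifulsoup4 the table parsing, and html2text the HTML-to-Markdown conversion. The tool also imports pandas and requests; presumably those are already covered earlier in requirements.txt (lines 1-4 are outside this hunk).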
tools/web_page_information_extractor.py ADDED

@@ -0,0 +1,106 @@
+import os
+from io import StringIO
+
+import html2text
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+from langchain_core.messages import SystemMessage, HumanMessage
+from langchain_core.tools import tool
+from langchain_openai import ChatOpenAI
+from loguru import logger
+from pydantic import SecretStr
+from readability import Document
+
+
+@tool("web_page_information_extractor_tool", parse_docstring=True)
+def web_page_information_extractor(url: str, request: str) -> str:
+    """
+    Extracts specific information from a web page based on the user's request.
+
+    This function uses a language model to extract information from the content
+    of a web page specified by the URL. The user's request specifies the type of
+    information to be extracted. The function returns the extracted information
+    as a JSON string.
+
+    Args:
+        url (str): The URL of the web page to extract information from.
+        request (str): The user's request describing the information to extract.
+
+    Returns:
+        str: The extracted information in JSON format.
+    """
+    logger.info(f"use web_page_information_extractor with params: url={url}, request={request}")
+    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
+    response.raise_for_status()  # Raises HTTPError for bad responses
+    html = response.text
+    doc = Document(html)
+    cleaned_html = doc.summary()
+
+    soup = BeautifulSoup(cleaned_html, "html.parser")
+
+    # Get tables
+    tables = soup.find_all('table', class_='wikitable')
+    tables_text = ""
+
+    for i, table in enumerate(tables, 1):
+        # Find the nearest preceding h2 or h3 header
+        header = table.find_previous(['h2', 'h3'])
+        section_title = header.get_text().strip() if header else "Untitled Section"
+        try:
+            # Convert table to pandas DataFrame using StringIO
+            table_html = str(table).replace('\n', '')  # Remove newlines for better parsing
+            df = pd.read_html(StringIO(table_html))[0]
+
+            # Format the table with section title, context, and clean layout
+            tables_text += f"\nSection: {section_title}\n"
+            tables_text += "=" * 40 + "\n"
+            tables_text += df.to_string(index=False) + "\n\n"
+        except Exception as e:
+            tables_text += f"\nError processing table in section {section_title}: {str(e)}\n"
+            continue
+
+    # Convert the cleaned HTML to Markdown
+    markdown_converter = html2text.HTML2Text()
+    markdown_converter.ignore_links = False
+    markdown_converter.bypass_tables = False
+    markdown_converter.ignore_images = True  # optional
+    markdown_converter.body_width = 0  # don't wrap lines
+
+    text = markdown_converter.handle(cleaned_html)
+    if tables_text:
+        text += f'Tables:\n{tables_text}'
+
+    logger.debug(f"web_page_information_extractor text: {text}")
+
+    chat = ChatOpenAI(
+        model="gpt-4o-mini",
+        temperature=0,
+        api_key=SecretStr(os.environ['OPENAI_API_KEY'])
+    )
+
+    system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
+    extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""
+
+    extracted_information = chat.invoke([
+        SystemMessage(system_message),
+        HumanMessage(extraction_user_prompt)
+    ])
+    return extracted_information.content
+
+
+if __name__ == "__main__":
+    # result = web_page_information_extractor.invoke(
+    #     {"url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
+    #      "request": "What are changes introduced in Python 3.11"})
+    # print(result)
+
+    result = web_page_information_extractor.invoke(
+        {"url": "https://en.wikipedia.org/wiki/1928_Summer_Olympics",
+         "request": "List of countries and number of athletes at the 1928 Summer Olympics"})
+    print(result)
+
+    # result = web_page_information_extractor.invoke(
+    #     {"url": "https://chem.libretexts.org/Courses/Chabot_College/Introduction_to_General_Organic_and_Biochemistry/01%3A_Chemistry_in_our_Lives/1.E%3A_Exercises",
+    #      "request": "What is the surname of the equine veterinarian mentioned"})
+    # print(result)
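For a quick sanity check of the table-handling path without a network call or an OpenAI key, the same BeautifulSoup + pandas steps can be run on inline HTML. This is a minimal sketch with an invented HTML sample; the readability cleaning step is skipped because it adds little on a hand-written snippet:

# Offline sketch (hypothetical sample HTML, not from the commit): exercises
# the same BeautifulSoup + pandas table path used by the tool above.
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup

html = """
<h2>Participating nations</h2>
<table class="wikitable">
  <tr><th>Nation</th><th>Athletes</th></tr>
  <tr><td>Netherlands</td><td>264</td></tr>
  <tr><td>France</td><td>255</td></tr>
</table>
"""

soup = BeautifulSoup(html, "html.parser")
for table in soup.find_all("table", class_="wikitable"):
    # Same heuristic as the tool: title each table with the nearest h2/h3 above it
    header = table.find_previous(["h2", "h3"])
    section_title = header.get_text().strip() if header else "Untitled Section"
    df = pd.read_html(StringIO(str(table)))[0]
    print(f"Section: {section_title}")
    print(df.to_string(index=False))

Running this prints the section title followed by a two-column Nation/Athletes table, which is the plain-text form the tool appends under "Tables:" before handing everything to the model.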