# shrewd-agent/tools/web_page_information_extractor.py
# feat(tool): add web page information extraction tool (author: Pycrolis, commit fe989a0)
import os
from io import StringIO
import html2text
import pandas as pd
import requests
from bs4 import BeautifulSoup
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from loguru import logger
from pydantic import SecretStr
from readability import Document
@tool("web_page_information_extractor_tool", parse_docstring=True)
def web_page_information_extractor(url: str, request: str) -> str:
    """
    Extracts specific information from a web page based on the user's request.
    This function uses a language model to extract information from the content
    of a web page specified by the URL. The user's request specifies the type of
    information to be extracted. The function returns the extracted information as
    a JSON string.
    Args:
        url (str): The URL of the web page to extract information from.
        request (str): The user's request describing the information to extract.
    Returns:
        str: The extracted information in JSON format.
    Raises:
        requests.HTTPError: If the page request returns a 4xx/5xx status.
        requests.Timeout: If the page does not respond within the timeout.
        KeyError: If the OPENAI_API_KEY environment variable is not set.
    """
    logger.info(f"use web_page_information_extractor with param: url:{url}, request:{request}")
    html = _fetch_html(url)
    # Readability boils the page down to its main content block, stripping
    # navigation/boilerplate before further processing.
    cleaned_html = Document(html).summary()
    tables_text = _render_wikitables(cleaned_html)
    text = _html_to_markdown(cleaned_html)
    if tables_text:
        text += f'Tables:\n{tables_text}'
    logger.debug(f"web_page_information_extractor text: {text}")
    return _llm_extract(text, request)


def _fetch_html(url: str) -> str:
    """Download *url* and return the response body as text."""
    # A browser-like User-Agent avoids trivial bot blocking. The explicit
    # timeout is a fix: requests.get without one blocks indefinitely on an
    # unresponsive host, hanging the whole agent.
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
    response.raise_for_status()  # Raises HTTPError for bad responses
    return response.text


def _render_wikitables(cleaned_html: str) -> str:
    """Render each MediaWiki-style table in *cleaned_html* as plain text.

    Returns one concatenated string of "Section: ..." headed table dumps,
    or an empty string when no tables were found.
    """
    soup = BeautifulSoup(cleaned_html, "html.parser")
    # NOTE(review): class_='wikitable' is Wikipedia-specific markup; tables
    # on other sites are not rendered here (they still reach the LLM via
    # the markdown conversion of the page body).
    tables_text = ""
    for table in soup.find_all('table', class_='wikitable'):
        # Use the nearest preceding h2/h3 heading as the table's context.
        header = table.find_previous(['h2', 'h3'])
        section_title = header.get_text().strip() if header else "Untitled Section"
        try:
            # Convert the table to a pandas DataFrame via StringIO; newlines
            # are stripped first for better parsing.
            table_html = str(table).replace('\n', '')
            df = pd.read_html(StringIO(table_html))[0]
            # Format the table with section title, separator, and clean layout.
            tables_text += f"\nSection: {section_title}\n"
            tables_text += "=" * 40 + "\n"
            tables_text += df.to_string(index=False) + "\n\n"
        except Exception as e:
            # Best-effort: record the failure inline so the LLM and logs can
            # see a table was skipped, then keep processing the remaining ones.
            tables_text += f"\nError processing table in section {section_title}: {str(e)}\n"
    return tables_text


def _html_to_markdown(cleaned_html: str) -> str:
    """Convert *cleaned_html* to markdown text, keeping links, dropping images."""
    markdown_converter = html2text.HTML2Text()
    markdown_converter.ignore_links = False
    markdown_converter.bypass_tables = False
    markdown_converter.ignore_images = True  # images contribute no extractable text
    markdown_converter.body_width = 0  # don't wrap lines
    return markdown_converter.handle(cleaned_html)


def _llm_extract(text: str, request: str) -> str:
    """Ask the LLM to extract *request* from *text*; returns the raw JSON reply."""
    chat = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,  # deterministic extraction output
        api_key=SecretStr(os.environ['OPENAI_API_KEY'])
    )
    system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
    extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""
    extracted_information = chat.invoke([
        SystemMessage(system_message),
        HumanMessage(extraction_user_prompt)
    ])
    return extracted_information.content
if __name__ == "__main__":
    # Ad-hoc smoke tests; swap the comment markers to try a different page.
    # result = web_page_information_extractor.invoke(
    #     {"url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
    #      "request": "What are changes introduced in Python 3.11"})
    # print(result)
    demo_args = {
        "url": "https://en.wikipedia.org/wiki/1928_Summer_Olympics",
        "request": "List of countries and number of athletes at the 1928 Summer Olympics",
    }
    print(web_page_information_extractor.invoke(demo_args))
    # result = web_page_information_extractor.invoke(
    #     {"url": "https://chem.libretexts.org/Courses/Chabot_College/Introduction_to_General_Organic_and_Biochemistry/01%3A_Chemistry_in_our_Lives/1.E%3A_Exercises",
    #      "request": "What is the surname of the equine veterinarian mentioned"})
    # print(result)