Pycrolis committed · Commit fe989a0 · Parent(s): 5e2ccaa

feat(tool): add web page information extraction tool

Files changed:
- ShrewdAgent.py (+6, -1)
- requirements.txt (+4, -1)
- tools/web_page_information_extractor.py (+106, -0)
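In short: the new web_page_information_extractor tool fetches a URL, strips boilerplate with readability-lxml, pulls wikitable tables out with BeautifulSoup and pandas, converts the cleaned page to Markdown with html2text, and then asks gpt-4o-mini to return just the requested information as JSON. ShrewdAgent registers it alongside TavilySearch, and requirements.txt pins the three new HTML-processing dependencies.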
ShrewdAgent.py CHANGED

@@ -13,6 +13,8 @@ from langgraph.pregel import PregelProtocol
 from loguru import logger
 from pydantic import SecretStr
 
+from tools.web_page_information_extractor import web_page_information_extractor
+
 
 class AgentState(TypedDict):
     messages: Annotated[list[AnyMessage], add_messages]
@@ -33,7 +35,10 @@ class ShrewdAgent:
     Important: Your final output must be only a number or a short phrase, with no additional text or explanation."""
 
     def __init__(self):
-        self.tools = [
+        self.tools = [
+            TavilySearch(),
+            web_page_information_extractor,
+        ]
         self.llm = ChatOpenAI(
             model="gpt-4o-mini",
             temperature=0,
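For context, registering a function in self.tools only matters once the tools are bound to the chat model, and that wiring lives outside this hunk. The following is a minimal hypothetical sketch of the usual LangChain pattern, not ShrewdAgent's actual code:

# Hypothetical sketch, not part of this commit: the usual way a tool list
# like self.tools is exposed to the model in a LangChain/LangGraph agent.
from langchain_openai import ChatOpenAI

from tools.web_page_information_extractor import web_page_information_extractor

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# bind_tools advertises each tool's name, docstring, and argument schema,
# so the model can respond with structured tool calls instead of plain text.
llm_with_tools = llm.bind_tools([web_page_information_extractor])

ai_msg = llm_with_tools.invoke(
    "How many athletes did France send to the 1928 Summer Olympics?"
)
print(ai_msg.tool_calls)  # expected: a call to web_page_information_extractor_tool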
requirements.txt CHANGED

@@ -5,4 +5,7 @@ langchain-core~=0.3.60
 langchain-openai~=0.3.17
 langgraph~=0.4.5
 loguru~=0.7.3
-pydantic~=2.11.4
+pydantic~=2.11.4
+html2text~=2025.4.15
+beautifulsoup4~=4.13.4
+readability-lxml~=0.8.4.1
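The three new pins map directly onto the new tool's imports: readability-lxml supplies the Document boilerplate stripper, beautifulsoup4 the table parsing, and html2text the HTML-to-Markdown conversion. The tool also imports pandas and requests; presumably those are already covered earlier in requirements.txt (lines 1-4 are outside this hunk).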
tools/web_page_information_extractor.py ADDED

@@ -0,0 +1,106 @@
+import os
+from io import StringIO
+
+import html2text
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+from langchain_core.messages import SystemMessage, HumanMessage
+from langchain_core.tools import tool
+from langchain_openai import ChatOpenAI
+from loguru import logger
+from pydantic import SecretStr
+from readability import Document
+
+
+@tool("web_page_information_extractor_tool", parse_docstring=True)
+def web_page_information_extractor(url: str, request: str) -> str:
+    """
+    Extracts specific information from a web page based on the user's request.
+
+    This function uses a language model to extract information from the content
+    of a web page specified by the URL. The user's request specifies the type of
+    information to be extracted. The function returns the extracted information
+    as a JSON string.
+
+    Args:
+        url (str): The URL of the web page to extract information from.
+        request (str): The user's request describing the information to extract.
+
+    Returns:
+        str: The extracted information in JSON format.
+    """
+    logger.info(f"use web_page_information_extractor with params: url={url}, request={request}")
+    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
+    response.raise_for_status()  # Raises HTTPError for bad responses
+    html = response.text
+    doc = Document(html)
+    cleaned_html = doc.summary()
+
+    soup = BeautifulSoup(cleaned_html, "html.parser")
+
+    # Get tables
+    tables = soup.find_all('table', class_='wikitable')
+    tables_text = ""
+
+    for i, table in enumerate(tables, 1):
+        # Find the nearest preceding h2 or h3 header
+        header = table.find_previous(['h2', 'h3'])
+        section_title = header.get_text().strip() if header else "Untitled Section"
+        try:
+            # Convert table to pandas DataFrame using StringIO
+            table_html = str(table).replace('\n', '')  # Remove newlines for better parsing
+            df = pd.read_html(StringIO(table_html))[0]
+
+            # Format the table with section title, context, and clean layout
+            tables_text += f"\nSection: {section_title}\n"
+            tables_text += "=" * 40 + "\n"
+            tables_text += df.to_string(index=False) + "\n\n"
+        except Exception as e:
+            tables_text += f"\nError processing table in section {section_title}: {str(e)}\n"
+            continue
+
+    # Convert the cleaned HTML to Markdown
+    markdown_converter = html2text.HTML2Text()
+    markdown_converter.ignore_links = False
+    markdown_converter.bypass_tables = False
+    markdown_converter.ignore_images = True  # optional
+    markdown_converter.body_width = 0  # don't wrap lines
+
+    text = markdown_converter.handle(cleaned_html)
+    if tables_text:
+        text += f'Tables:\n{tables_text}'
+
+    logger.debug(f"web_page_information_extractor text: {text}")
+
+    chat = ChatOpenAI(
+        model="gpt-4o-mini",
+        temperature=0,
+        api_key=SecretStr(os.environ['OPENAI_API_KEY'])
+    )
+
+    system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
+    extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""
+
+    extracted_information = chat.invoke([
+        SystemMessage(system_message),
+        HumanMessage(extraction_user_prompt)
+    ])
+    return extracted_information.content
+
+
+if __name__ == "__main__":
+    # result = web_page_information_extractor.invoke(
+    #     {"url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
+    #      "request": "What are changes introduced in Python 3.11"})
+    # print(result)
+
+    result = web_page_information_extractor.invoke(
+        {"url": "https://en.wikipedia.org/wiki/1928_Summer_Olympics",
+         "request": "List of countries and number of athletes at the 1928 Summer Olympics"})
+    print(result)
+
+    # result = web_page_information_extractor.invoke(
+    #     {"url": "https://chem.libretexts.org/Courses/Chabot_College/Introduction_to_General_Organic_and_Biochemistry/01%3A_Chemistry_in_our_Lives/1.E%3A_Exercises",
+    #      "request": "What is the surname of the equine veterinarian mentioned"})
+    # print(result)
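For a quick sanity check of the table-handling path without a network call or an OpenAI key, the same BeautifulSoup + pandas steps can be run on inline HTML. This is a minimal sketch with an invented HTML sample; the readability cleaning step is skipped because it adds little on a hand-written snippet:

# Offline sketch (hypothetical sample HTML, not from the commit): exercises
# the same BeautifulSoup + pandas table path used by the tool above.
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup

html = """
<h2>Participating nations</h2>
<table class="wikitable">
  <tr><th>Nation</th><th>Athletes</th></tr>
  <tr><td>Netherlands</td><td>264</td></tr>
  <tr><td>France</td><td>255</td></tr>
</table>
"""

soup = BeautifulSoup(html, "html.parser")
for table in soup.find_all("table", class_="wikitable"):
    # Same heuristic as the tool: title each table with the nearest h2/h3 above it
    header = table.find_previous(["h2", "h3"])
    section_title = header.get_text().strip() if header else "Untitled Section"
    df = pd.read_html(StringIO(str(table)))[0]
    print(f"Section: {section_title}")
    print(df.to_string(index=False))

Running this prints the section title followed by a two-column Nation/Athletes table, which is the plain-text form the tool appends under "Tables:" before handing everything to the model.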