Pycrolis committed on
Commit
fe989a0
·
1 Parent(s): 5e2ccaa

feat(tool): add web page information extraction tool

Browse files
ShrewdAgent.py CHANGED
@@ -13,6 +13,8 @@ from langgraph.pregel import PregelProtocol
13
  from loguru import logger
14
  from pydantic import SecretStr
15
 
 
 
16
 
17
  class AgentState(TypedDict):
18
  messages: Annotated[list[AnyMessage], add_messages]
@@ -33,7 +35,10 @@ class ShrewdAgent:
33
  Important: Your final output must be only a number or a short phrase, with no additional text or explanation."""
34
 
35
  def __init__(self):
36
- self.tools = [TavilySearch()]
 
 
 
37
  self.llm = ChatOpenAI(
38
  model="gpt-4o-mini",
39
  temperature=0,
 
13
  from loguru import logger
14
  from pydantic import SecretStr
15
 
16
+ from tools.web_page_information_extractor import web_page_information_extractor
17
+
18
 
19
  class AgentState(TypedDict):
20
  messages: Annotated[list[AnyMessage], add_messages]
 
35
  Important: Your final output must be only a number or a short phrase, with no additional text or explanation."""
36
 
37
  def __init__(self):
38
+ self.tools = [
39
+ TavilySearch(),
40
+ web_page_information_extractor,
41
+ ]
42
  self.llm = ChatOpenAI(
43
  model="gpt-4o-mini",
44
  temperature=0,
requirements.txt CHANGED
@@ -5,4 +5,7 @@ langchain-core~=0.3.60
5
  langchain-openai~=0.3.17
6
  langgraph~=0.4.5
7
  loguru~=0.7.3
8
- pydantic~=2.11.4
 
 
 
 
5
  langchain-openai~=0.3.17
6
  langgraph~=0.4.5
7
  loguru~=0.7.3
8
+ pydantic~=2.11.4
9
+ html2text~=2025.4.15
10
+ beautifulsoup4~=4.13.4
11
+ readability-lxml~=0.8.4.1
tools/web_page_information_extractor.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from io import StringIO
3
+
4
+ import html2text
5
+ import pandas as pd
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ from langchain_core.messages import SystemMessage, HumanMessage
9
+ from langchain_core.tools import tool
10
+ from langchain_openai import ChatOpenAI
11
+ from loguru import logger
12
+ from pydantic import SecretStr
13
+ from readability import Document
14
+
15
+
16
@tool("web_page_information_extractor_tool", parse_docstring=True)
def web_page_information_extractor(url: str, request: str) -> str:
    """
    Extracts specific information from a web page based on the user's request.

    This function uses a language model to extract information from the content
    of a web page specified by the URL. The user's request specifies the type of
    information to be extracted. The function returns the extracted information as
    a JSON string.

    Args:
        url (str): The URL of the web page to extract information from.
        request (str): The user's request describing the information to extract.

    Returns:
        str: The extracted information in JSON format.
    """
    logger.info(f"use web_page_information_extractor with param: url:{url}, request:{request}")
    # Fetch the page. A timeout is mandatory: without one, requests can block
    # forever and stall the whole agent run on an unresponsive host.
    response = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=30,
    )
    response.raise_for_status()  # Raises HTTPError for bad responses
    html = response.text

    # Strip boilerplate (navigation, ads, footers) with readability-lxml,
    # keeping only the main article content.
    doc = Document(html)
    cleaned_html = doc.summary()

    soup = BeautifulSoup(cleaned_html, "html.parser")

    # Extract data tables separately: html2text flattens tables poorly, so we
    # render them via pandas for a clean, LLM-friendly layout.
    # NOTE(review): the 'wikitable' class is Wikipedia-specific — tables on
    # other sites are silently ignored by this branch.
    tables = soup.find_all('table', class_='wikitable')
    tables_text = ""

    for i, table in enumerate(tables, 1):
        # Find the nearest preceding h2 or h3 header to label the table's section
        header = table.find_previous(['h2', 'h3'])
        section_title = header.get_text().strip() if header else "Untitled Section"
        try:
            # Convert table to pandas DataFrame using StringIO
            table_html = str(table).replace('\n', '')  # Remove newlines for better parsing
            df = pd.read_html(StringIO(table_html))[0]

            # Format the table with section title, context, and clean layout
            tables_text += f"\nSection: {section_title}\n"
            tables_text += "=" * 40 + "\n"
            tables_text += df.to_string(index=False) + "\n\n"
        except Exception as e:
            # Best-effort: a single malformed table must not abort the whole
            # extraction, so record the failure inline and keep going.
            tables_text += f"\nError processing table in section {section_title}: {str(e)}\n"
            continue

    # Step 3: Convert HTML to Markdown
    markdown_converter = html2text.HTML2Text()
    markdown_converter.ignore_links = False
    markdown_converter.bypass_tables = False
    markdown_converter.ignore_images = True  # optional
    markdown_converter.body_width = 0  # don't wrap lines

    text = markdown_converter.handle(cleaned_html)
    if tables_text:
        text += f'Tables:\n{tables_text}'

    logger.debug(f"web_page_information_extractor text: {text}")

    # Dedicated low-temperature model call for deterministic extraction;
    # raises KeyError early if OPENAI_API_KEY is unset.
    chat = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,
        api_key=SecretStr(os.environ['OPENAI_API_KEY'])
    )

    system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
    extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""

    extracted_information = chat.invoke([
        SystemMessage(system_message),
        HumanMessage(extraction_user_prompt)
    ])
    return extracted_information.content
90
+
91
+
92
if __name__ == "__main__":
    # Manual smoke tests — each example drives the tool against a live page.
    # Uncomment alternatives as needed.

    # result = web_page_information_extractor.invoke(
    #     {"url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
    #      "request": "What are changes introduced in Python 3.11"})
    # print(result)

    demo_payload = {
        "url": "https://en.wikipedia.org/wiki/1928_Summer_Olympics",
        "request": "List of countries and number of athletes at the 1928 Summer Olympics",
    }
    print(web_page_information_extractor.invoke(demo_payload))

    # result = web_page_information_extractor.invoke(
    #     {"url": "https://chem.libretexts.org/Courses/Chabot_College/Introduction_to_General_Organic_and_Biochemistry/01%3A_Chemistry_in_our_Lives/1.E%3A_Exercises",
    #      "request": "What is the surname of the equine veterinarian mentioned"})
    # print(result)