Pycrolis committed on
Commit
dc5bfe3
·
2 Parent(s): 76d7a0c 181c63d

Merge branch 'feat/improve-information_extractor_tool'

Browse files
requirements.txt CHANGED
@@ -8,9 +8,9 @@ loguru~=0.7.3
8
  pydantic~=2.11.4
9
  html2text~=2025.4.15
10
  beautifulsoup4~=4.13.4
11
- readability-lxml~=0.8.4.1
12
  youtube-transcript-api~=1.0.3
13
  wikipedia~=1.4.0
14
  langchain_tavily~=0.1.6
15
  rizaio~=0.11.0
16
- openai-whisper==20240930
 
 
8
  pydantic~=2.11.4
9
  html2text~=2025.4.15
10
  beautifulsoup4~=4.13.4
 
11
  youtube-transcript-api~=1.0.3
12
  wikipedia~=1.4.0
13
  langchain_tavily~=0.1.6
14
  rizaio~=0.11.0
15
+ openai-whisper==20240930
16
+ openpyxl~=3.1.5
tools/web_page_information_extractor.py CHANGED
@@ -10,7 +10,6 @@ from langchain_core.tools import tool
10
  from langchain_openai import ChatOpenAI
11
  from loguru import logger
12
  from pydantic import SecretStr
13
- from readability import Document
14
 
15
 
16
  @tool("web_page_information_extractor_tool", parse_docstring=True)
@@ -31,13 +30,31 @@ def web_page_information_extractor(url: str, request: str) -> str:
31
  str: The extracted information in JSON format.
32
  """
33
  logger.info(f"use web_page_information_extractor with param: url:{url}, request:{request}")
34
- response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  response.raise_for_status() # Raises HTTPError for bad responses
36
  html = response.text
37
- doc = Document(html)
38
- cleaned_html = doc.summary()
39
 
40
- soup = BeautifulSoup(cleaned_html, "html.parser")
41
 
42
  # Get tables
43
  tables = soup.find_all('table', class_='wikitable')
@@ -67,27 +84,11 @@ def web_page_information_extractor(url: str, request: str) -> str:
67
  markdown_converter.ignore_images = True # optional
68
  markdown_converter.body_width = 0 # don't wrap lines
69
 
70
- text = markdown_converter.handle(cleaned_html)
71
  if tables_text:
72
- text += f'Tables:\n{tables_text}'
73
-
74
- logger.debug(f"web_page_information_extractor text: {text}")
75
-
76
- chat = ChatOpenAI(
77
- model="gpt-4o-mini",
78
- temperature=0,
79
- api_key=SecretStr(os.environ['OPENAI_API_KEY'])
80
- )
81
-
82
- system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
83
- extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""
84
-
85
- extracted_information = chat.invoke([
86
- SystemMessage(system_message),
87
- HumanMessage(extraction_user_prompt)
88
- ])
89
- return extracted_information.content
90
 
 
91
 
92
  if __name__ == "__main__":
93
  # result = web_page_information_extractor.invoke(
@@ -104,3 +105,5 @@ if __name__ == "__main__":
104
  # {"url": "https://chem.libretexts.org/Courses/Chabot_College/Introduction_to_General_Organic_and_Biochemistry/01%3A_Chemistry_in_our_Lives/1.E%3A_Exercises",
105
  # "request": "What is the surname of the equine veterinarian mentioned"})
106
  # print(result)
 
 
 
10
  from langchain_openai import ChatOpenAI
11
  from loguru import logger
12
  from pydantic import SecretStr
 
13
 
14
 
15
  @tool("web_page_information_extractor_tool", parse_docstring=True)
 
30
  str: The extracted information in JSON format.
31
  """
32
  logger.info(f"use web_page_information_extractor with param: url:{url}, request:{request}")
33
+
34
+ text = _get_text_from_url(url)
35
+ logger.debug(f"web_page_information_extractor text: {text}")
36
+
37
+ chat = ChatOpenAI(
38
+ model="gpt-4o-mini",
39
+ temperature=0,
40
+ api_key=SecretStr(os.environ['OPENAI_API_KEY'])
41
+ )
42
+
43
+ system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
44
+ extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""
45
+
46
+ extracted_information = chat.invoke([
47
+ SystemMessage(system_message),
48
+ HumanMessage(extraction_user_prompt)
49
+ ])
50
+ return extracted_information.content
51
+
52
+ def _get_text_from_url(url: str) -> str:
53
+ response = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
54
  response.raise_for_status() # Raises HTTPError for bad responses
55
  html = response.text
 
 
56
 
57
+ soup = BeautifulSoup(html, "html.parser")
58
 
59
  # Get tables
60
  tables = soup.find_all('table', class_='wikitable')
 
84
  markdown_converter.ignore_images = True # optional
85
  markdown_converter.body_width = 0 # don't wrap lines
86
 
87
+ text = markdown_converter.handle(html)
88
  if tables_text:
89
+ text = f'Tables:\n{tables_text}\n\nContent\n{text}'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
+ return text
92
 
93
  if __name__ == "__main__":
94
  # result = web_page_information_extractor.invoke(
 
105
  # {"url": "https://chem.libretexts.org/Courses/Chabot_College/Introduction_to_General_Organic_and_Biochemistry/01%3A_Chemistry_in_our_Lives/1.E%3A_Exercises",
106
  # "request": "What is the surname of the equine veterinarian mentioned"})
107
  # print(result)
108
+
109
+ print(_get_text_from_url("https://en.wikipedia.org/wiki/Malko_Competition"))