Pycrolis committed on
Commit
8576a1a
·
1 Parent(s): 5077a8d

fix(web_page_extractor_tool): remove readability-lxml processing because it is too aggressive

Browse files
requirements.txt CHANGED
@@ -8,7 +8,6 @@ loguru~=0.7.3
8
  pydantic~=2.11.4
9
  html2text~=2025.4.15
10
  beautifulsoup4~=4.13.4
11
- readability-lxml~=0.8.4.1
12
  youtube-transcript-api~=1.0.3
13
  wikipedia~=1.4.0
14
  langchain_tavily~=0.1.6
 
8
  pydantic~=2.11.4
9
  html2text~=2025.4.15
10
  beautifulsoup4~=4.13.4
 
11
  youtube-transcript-api~=1.0.3
12
  wikipedia~=1.4.0
13
  langchain_tavily~=0.1.6
tools/web_page_information_extractor.py CHANGED
@@ -10,7 +10,6 @@ from langchain_core.tools import tool
10
  from langchain_openai import ChatOpenAI
11
  from loguru import logger
12
  from pydantic import SecretStr
13
- from readability import Document
14
 
15
 
16
  @tool("web_page_information_extractor_tool", parse_docstring=True)
@@ -54,10 +53,8 @@ def _get_text_from_url(url: str) -> str:
54
  response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
55
  response.raise_for_status() # Raises HTTPError for bad responses
56
  html = response.text
57
- doc = Document(html)
58
- cleaned_html = doc.summary()
59
 
60
- soup = BeautifulSoup(cleaned_html, "html.parser")
61
 
62
  # Get tables
63
  tables = soup.find_all('table', class_='wikitable')
@@ -87,7 +84,7 @@ def _get_text_from_url(url: str) -> str:
87
  markdown_converter.ignore_images = True # optional
88
  markdown_converter.body_width = 0 # don't wrap lines
89
 
90
- text = markdown_converter.handle(cleaned_html)
91
  if tables_text:
92
  text += f'Tables:\n{tables_text}'
93
 
 
10
  from langchain_openai import ChatOpenAI
11
  from loguru import logger
12
  from pydantic import SecretStr
 
13
 
14
 
15
  @tool("web_page_information_extractor_tool", parse_docstring=True)
 
53
  response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
54
  response.raise_for_status() # Raises HTTPError for bad responses
55
  html = response.text
 
 
56
 
57
+ soup = BeautifulSoup(html, "html.parser")
58
 
59
  # Get tables
60
  tables = soup.find_all('table', class_='wikitable')
 
84
  markdown_converter.ignore_images = True # optional
85
  markdown_converter.body_width = 0 # don't wrap lines
86
 
87
+ text = markdown_converter.handle(html)
88
  if tables_text:
89
  text += f'Tables:\n{tables_text}'
90