Spaces:

Pycrolis
/

shrewd-agent

Sleeping

Pycrolis committed on May 25

Commit

8576a1a

1 Parent(s): 5077a8d

fix(web_page_extractor_tool): remove readability-lxml processing because it is too aggressive

Files changed (2) hide show

requirements.txt CHANGED Viewed

@@ -8,7 +8,6 @@ loguru~=0.7.3
 pydantic~=2.11.4
 html2text~=2025.4.15
 beautifulsoup4~=4.13.4
-readability-lxml~=0.8.4.1
 youtube-transcript-api~=1.0.3
 wikipedia~=1.4.0
 langchain_tavily~=0.1.6

 pydantic~=2.11.4
 html2text~=2025.4.15
 beautifulsoup4~=4.13.4
 youtube-transcript-api~=1.0.3
 wikipedia~=1.4.0
 langchain_tavily~=0.1.6

tools/web_page_information_extractor.py CHANGED Viewed

@@ -10,7 +10,6 @@ from langchain_core.tools import tool
 from langchain_openai import ChatOpenAI
 from loguru import logger
 from pydantic import SecretStr
-from readability import Document
 @tool("web_page_information_extractor_tool", parse_docstring=True)
@@ -54,10 +53,8 @@ def _get_text_from_url(url: str) -> str:
     response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
     response.raise_for_status()  # Raises HTTPError for bad responses
     html = response.text
-    doc = Document(html)
-    cleaned_html = doc.summary()
-    soup = BeautifulSoup(cleaned_html, "html.parser")
     # Get tables
     tables = soup.find_all('table', class_='wikitable')
@@ -87,7 +84,7 @@ def _get_text_from_url(url: str) -> str:
     markdown_converter.ignore_images = True  # optional
     markdown_converter.body_width = 0  # don't wrap lines
-    text = markdown_converter.handle(cleaned_html)
     if tables_text:
         text += f'Tables:\n{tables_text}'

 from langchain_openai import ChatOpenAI
 from loguru import logger
 from pydantic import SecretStr
 @tool("web_page_information_extractor_tool", parse_docstring=True)
     response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
     response.raise_for_status()  # Raises HTTPError for bad responses
     html = response.text
+    soup = BeautifulSoup(html, "html.parser")
     # Get tables
     tables = soup.find_all('table', class_='wikitable')
     markdown_converter.ignore_images = True  # optional
     markdown_converter.body_width = 0  # don't wrap lines
+    text = markdown_converter.handle(html)
     if tables_text:
         text += f'Tables:\n{tables_text}'