Spaces:
Sleeping
Sleeping
Pycrolis
committed on
Commit
·
8576a1a
1
Parent(s):
5077a8d
fix(web_page_extractor_tool): remove readability-lxml processing because it is too strong
Browse files
requirements.txt
CHANGED
@@ -8,7 +8,6 @@ loguru~=0.7.3
|
|
8 |
pydantic~=2.11.4
|
9 |
html2text~=2025.4.15
|
10 |
beautifulsoup4~=4.13.4
|
11 |
-
readability-lxml~=0.8.4.1
|
12 |
youtube-transcript-api~=1.0.3
|
13 |
wikipedia~=1.4.0
|
14 |
langchain_tavily~=0.1.6
|
|
|
8 |
pydantic~=2.11.4
|
9 |
html2text~=2025.4.15
|
10 |
beautifulsoup4~=4.13.4
|
|
|
11 |
youtube-transcript-api~=1.0.3
|
12 |
wikipedia~=1.4.0
|
13 |
langchain_tavily~=0.1.6
|
tools/web_page_information_extractor.py
CHANGED
@@ -10,7 +10,6 @@ from langchain_core.tools import tool
|
|
10 |
from langchain_openai import ChatOpenAI
|
11 |
from loguru import logger
|
12 |
from pydantic import SecretStr
|
13 |
-
from readability import Document
|
14 |
|
15 |
|
16 |
@tool("web_page_information_extractor_tool", parse_docstring=True)
|
@@ -54,10 +53,8 @@ def _get_text_from_url(url: str) -> str:
|
|
54 |
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
|
55 |
response.raise_for_status() # Raises HTTPError for bad responses
|
56 |
html = response.text
|
57 |
-
doc = Document(html)
|
58 |
-
cleaned_html = doc.summary()
|
59 |
|
60 |
-
soup = BeautifulSoup(
|
61 |
|
62 |
# Get tables
|
63 |
tables = soup.find_all('table', class_='wikitable')
|
@@ -87,7 +84,7 @@ def _get_text_from_url(url: str) -> str:
|
|
87 |
markdown_converter.ignore_images = True # optional
|
88 |
markdown_converter.body_width = 0 # don't wrap lines
|
89 |
|
90 |
-
text = markdown_converter.handle(
|
91 |
if tables_text:
|
92 |
text += f'Tables:\n{tables_text}'
|
93 |
|
|
|
10 |
from langchain_openai import ChatOpenAI
|
11 |
from loguru import logger
|
12 |
from pydantic import SecretStr
|
|
|
13 |
|
14 |
|
15 |
@tool("web_page_information_extractor_tool", parse_docstring=True)
|
|
|
53 |
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
|
54 |
response.raise_for_status() # Raises HTTPError for bad responses
|
55 |
html = response.text
|
|
|
|
|
56 |
|
57 |
+
soup = BeautifulSoup(html, "html.parser")
|
58 |
|
59 |
# Get tables
|
60 |
tables = soup.find_all('table', class_='wikitable')
|
|
|
84 |
markdown_converter.ignore_images = True # optional
|
85 |
markdown_converter.body_width = 0 # don't wrap lines
|
86 |
|
87 |
+
text = markdown_converter.handle(html)
|
88 |
if tables_text:
|
89 |
text += f'Tables:\n{tables_text}'
|
90 |
|