Spaces:
Sleeping
Sleeping
Merge branch 'feat/improve-information_extractor_tool'
Browse files- requirements.txt +2 -2
- tools/web_page_information_extractor.py +27 -24
requirements.txt
CHANGED
@@ -8,9 +8,9 @@ loguru~=0.7.3
|
|
8 |
pydantic~=2.11.4
|
9 |
html2text~=2025.4.15
|
10 |
beautifulsoup4~=4.13.4
|
11 |
-
readability-lxml~=0.8.4.1
|
12 |
youtube-transcript-api~=1.0.3
|
13 |
wikipedia~=1.4.0
|
14 |
langchain_tavily~=0.1.6
|
15 |
rizaio~=0.11.0
|
16 |
-
openai-whisper==20240930
|
|
|
|
8 |
pydantic~=2.11.4
|
9 |
html2text~=2025.4.15
|
10 |
beautifulsoup4~=4.13.4
|
|
|
11 |
youtube-transcript-api~=1.0.3
|
12 |
wikipedia~=1.4.0
|
13 |
langchain_tavily~=0.1.6
|
14 |
rizaio~=0.11.0
|
15 |
+
openai-whisper==20240930
|
16 |
+
openpyxl~=3.1.5
|
tools/web_page_information_extractor.py
CHANGED
@@ -10,7 +10,6 @@ from langchain_core.tools import tool
|
|
10 |
from langchain_openai import ChatOpenAI
|
11 |
from loguru import logger
|
12 |
from pydantic import SecretStr
|
13 |
-
from readability import Document
|
14 |
|
15 |
|
16 |
@tool("web_page_information_extractor_tool", parse_docstring=True)
|
@@ -31,13 +30,31 @@ def web_page_information_extractor(url: str, request: str) -> str:
|
|
31 |
str: The extracted information in JSON format.
|
32 |
"""
|
33 |
logger.info(f"use web_page_information_extractor with param: url:{url}, request:{request}")
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
response.raise_for_status() # Raises HTTPError for bad responses
|
36 |
html = response.text
|
37 |
-
doc = Document(html)
|
38 |
-
cleaned_html = doc.summary()
|
39 |
|
40 |
-
soup = BeautifulSoup(
|
41 |
|
42 |
# Get tables
|
43 |
tables = soup.find_all('table', class_='wikitable')
|
@@ -67,27 +84,11 @@ def web_page_information_extractor(url: str, request: str) -> str:
|
|
67 |
markdown_converter.ignore_images = True # optional
|
68 |
markdown_converter.body_width = 0 # don't wrap lines
|
69 |
|
70 |
-
text = markdown_converter.handle(
|
71 |
if tables_text:
|
72 |
-
text
|
73 |
-
|
74 |
-
logger.debug(f"web_page_information_extractor text: {text}")
|
75 |
-
|
76 |
-
chat = ChatOpenAI(
|
77 |
-
model="gpt-4o-mini",
|
78 |
-
temperature=0,
|
79 |
-
api_key=SecretStr(os.environ['OPENAI_API_KEY'])
|
80 |
-
)
|
81 |
-
|
82 |
-
system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
|
83 |
-
extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""
|
84 |
-
|
85 |
-
extracted_information = chat.invoke([
|
86 |
-
SystemMessage(system_message),
|
87 |
-
HumanMessage(extraction_user_prompt)
|
88 |
-
])
|
89 |
-
return extracted_information.content
|
90 |
|
|
|
91 |
|
92 |
if __name__ == "__main__":
|
93 |
# result = web_page_information_extractor.invoke(
|
@@ -104,3 +105,5 @@ if __name__ == "__main__":
|
|
104 |
# {"url": "https://chem.libretexts.org/Courses/Chabot_College/Introduction_to_General_Organic_and_Biochemistry/01%3A_Chemistry_in_our_Lives/1.E%3A_Exercises",
|
105 |
# "request": "What is the surname of the equine veterinarian mentioned"})
|
106 |
# print(result)
|
|
|
|
|
|
10 |
from langchain_openai import ChatOpenAI
|
11 |
from loguru import logger
|
12 |
from pydantic import SecretStr
|
|
|
13 |
|
14 |
|
15 |
@tool("web_page_information_extractor_tool", parse_docstring=True)
|
|
|
30 |
str: The extracted information in JSON format.
|
31 |
"""
|
32 |
logger.info(f"use web_page_information_extractor with param: url:{url}, request:{request}")
|
33 |
+
|
34 |
+
text = _get_text_from_url(url)
|
35 |
+
logger.debug(f"web_page_information_extractor text: {text}")
|
36 |
+
|
37 |
+
chat = ChatOpenAI(
|
38 |
+
model="gpt-4o-mini",
|
39 |
+
temperature=0,
|
40 |
+
api_key=SecretStr(os.environ['OPENAI_API_KEY'])
|
41 |
+
)
|
42 |
+
|
43 |
+
system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
|
44 |
+
extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""
|
45 |
+
|
46 |
+
extracted_information = chat.invoke([
|
47 |
+
SystemMessage(system_message),
|
48 |
+
HumanMessage(extraction_user_prompt)
|
49 |
+
])
|
50 |
+
return extracted_information.content
|
51 |
+
|
52 |
+
def _get_text_from_url(url: str) -> str:
|
53 |
+
response = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
|
54 |
response.raise_for_status() # Raises HTTPError for bad responses
|
55 |
html = response.text
|
|
|
|
|
56 |
|
57 |
+
soup = BeautifulSoup(html, "html.parser")
|
58 |
|
59 |
# Get tables
|
60 |
tables = soup.find_all('table', class_='wikitable')
|
|
|
84 |
markdown_converter.ignore_images = True # optional
|
85 |
markdown_converter.body_width = 0 # don't wrap lines
|
86 |
|
87 |
+
text = markdown_converter.handle(html)
|
88 |
if tables_text:
|
89 |
+
text = f'Tables:\n{tables_text}\n\nContent\n{text}'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
+
return text
|
92 |
|
93 |
if __name__ == "__main__":
|
94 |
# result = web_page_information_extractor.invoke(
|
|
|
105 |
# {"url": "https://chem.libretexts.org/Courses/Chabot_College/Introduction_to_General_Organic_and_Biochemistry/01%3A_Chemistry_in_our_Lives/1.E%3A_Exercises",
|
106 |
# "request": "What is the surname of the equine veterinarian mentioned"})
|
107 |
# print(result)
|
108 |
+
|
109 |
+
print(_get_text_from_url("https://en.wikipedia.org/wiki/Malko_Competition"))
|