Spaces:

Pycrolis
/

shrewd-agent

Sleeping

App Files Community

Pycrolis committed on May 25

Commit

dc5bfe3

2 Parent(s): 76d7a0c 181c63d

Merge branch 'feat/improve-information_extractor_tool'

Browse files

Files changed (2) hide show

requirements.txt +2 -2
tools/web_page_information_extractor.py +27 -24

requirements.txt CHANGED Viewed

@@ -8,9 +8,9 @@ loguru~=0.7.3
 pydantic~=2.11.4
 html2text~=2025.4.15
 beautifulsoup4~=4.13.4
-readability-lxml~=0.8.4.1
 youtube-transcript-api~=1.0.3
 wikipedia~=1.4.0
 langchain_tavily~=0.1.6
 rizaio~=0.11.0
-openai-whisper==20240930

 pydantic~=2.11.4
 html2text~=2025.4.15
 beautifulsoup4~=4.13.4
 youtube-transcript-api~=1.0.3
 wikipedia~=1.4.0
 langchain_tavily~=0.1.6
 rizaio~=0.11.0
+openai-whisper==20240930
+openpyxl~=3.1.5

tools/web_page_information_extractor.py CHANGED Viewed

@@ -10,7 +10,6 @@ from langchain_core.tools import tool
 from langchain_openai import ChatOpenAI
 from loguru import logger
 from pydantic import SecretStr
-from readability import Document
 @tool("web_page_information_extractor_tool", parse_docstring=True)
@@ -31,13 +30,31 @@ def web_page_information_extractor(url: str, request: str) -> str:
         str: The extracted information in JSON format.
     """
     logger.info(f"use web_page_information_extractor with param: url:{url}, request:{request}")
-    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
     response.raise_for_status()  # Raises HTTPError for bad responses
     html = response.text
-    doc = Document(html)
-    cleaned_html = doc.summary()
-    soup = BeautifulSoup(cleaned_html, "html.parser")
     # Get tables
     tables = soup.find_all('table', class_='wikitable')
@@ -67,27 +84,11 @@ def web_page_information_extractor(url: str, request: str) -> str:
     markdown_converter.ignore_images = True  # optional
     markdown_converter.body_width = 0  # don't wrap lines
-    text = markdown_converter.handle(cleaned_html)
     if tables_text:
-        text += f'Tables:\n{tables_text}'
-    logger.debug(f"web_page_information_extractor text: {text}")
-    chat = ChatOpenAI(
-        model="gpt-4o-mini",
-        temperature=0,
-        api_key=SecretStr(os.environ['OPENAI_API_KEY'])
-    )
-    system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
-    extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""
-    extracted_information = chat.invoke([
-        SystemMessage(system_message),
-        HumanMessage(extraction_user_prompt)
-    ])
-    return extracted_information.content
 if __name__ == "__main__":
     # result = web_page_information_extractor.invoke(
@@ -104,3 +105,5 @@ if __name__ == "__main__":
     #     {"url": "https://chem.libretexts.org/Courses/Chabot_College/Introduction_to_General_Organic_and_Biochemistry/01%3A_Chemistry_in_our_Lives/1.E%3A_Exercises",
     # "request": "What is the surname of the equine veterinarian mentioned"})
     # print(result)

 from langchain_openai import ChatOpenAI
 from loguru import logger
 from pydantic import SecretStr
 @tool("web_page_information_extractor_tool", parse_docstring=True)
         str: The extracted information in JSON format.
     """
     logger.info(f"use web_page_information_extractor with param: url:{url}, request:{request}")
+    text = _get_text_from_url(url)
+    logger.debug(f"web_page_information_extractor text: {text}")
+    chat = ChatOpenAI(
+        model="gpt-4o-mini",
+        temperature=0,
+        api_key=SecretStr(os.environ['OPENAI_API_KEY'])
+    )
+    system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
+    extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""
+    extracted_information = chat.invoke([
+        SystemMessage(system_message),
+        HumanMessage(extraction_user_prompt)
+    ])
+    return extracted_information.content
+def _get_text_from_url(url: str) -> str:
+    response = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
     response.raise_for_status()  # Raises HTTPError for bad responses
     html = response.text
+    soup = BeautifulSoup(html, "html.parser")
     # Get tables
     tables = soup.find_all('table', class_='wikitable')
     markdown_converter.ignore_images = True  # optional
     markdown_converter.body_width = 0  # don't wrap lines
+    text = markdown_converter.handle(html)
     if tables_text:
+        text = f'Tables:\n{tables_text}\n\nContent\n{text}'
+    return text
 if __name__ == "__main__":
     # result = web_page_information_extractor.invoke(
     #     {"url": "https://chem.libretexts.org/Courses/Chabot_College/Introduction_to_General_Organic_and_Biochemistry/01%3A_Chemistry_in_our_Lives/1.E%3A_Exercises",
     # "request": "What is the surname of the equine veterinarian mentioned"})
     # print(result)
+    print(_get_text_from_url("https://en.wikipedia.org/wiki/Malko_Competition"))