"""Web-page information extraction tool built on LangChain, BeautifulSoup and pandas."""
import os
from io import StringIO

import html2text
import pandas as pd
import requests
from bs4 import BeautifulSoup
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from loguru import logger
from pydantic import SecretStr
@tool
def web_page_information_extractor(url: str, request: str) -> str:
    """
    Extracts specific information from a web page based on the user's request.

    This function uses a language model to extract information from the content
    of a web page specified by the URL. The user's request specifies the type of
    information to be extracted. The function returns the extracted information as
    a JSON string.

    Args:
        url (str): The URL of the web page to extract information from.
        request (str): The user's request describing the information to extract.

    Returns:
        str: The extracted information in JSON format.

    Raises:
        requests.HTTPError: If fetching the page fails (propagated from
            ``_get_text_from_url``).
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.
    """
    # NOTE(fix): the @tool decorator was missing even though `tool` is imported
    # and callers use `web_page_information_extractor.invoke({...})`, which only
    # exists on the decorated tool object.
    logger.info(f"use web_page_information_extractor with param: url:{url}, request:{request}")
    text = _get_text_from_url(url)
    logger.debug(f"web_page_information_extractor text: {text}")

    # temperature=0 for deterministic extraction; key wrapped in SecretStr so
    # it is not exposed in reprs/logs.
    chat = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,
        api_key=SecretStr(os.environ['OPENAI_API_KEY'])
    )
    system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
    extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""
    extracted_information = chat.invoke([
        SystemMessage(system_message),
        HumanMessage(extraction_user_prompt)
    ])
    return extracted_information.content
def _get_text_from_url(url: str) -> str:
    """Fetch *url* and return its content as plain text.

    Wikipedia-style tables (class ``wikitable``) are rendered first via
    pandas, each prefixed with the title of the nearest preceding h2/h3
    heading; the full page is then appended as Markdown produced by
    html2text.
    """
    resp = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()  # surface HTTP errors to the caller
    page_html = resp.text

    soup = BeautifulSoup(page_html, "html.parser")

    # Render every wikitable as a plain-text pandas table, grouped under the
    # title of the section it appears in.
    rendered_tables = ""
    for tbl in soup.find_all('table', class_='wikitable'):
        heading = tbl.find_previous(['h2', 'h3'])
        section = heading.get_text().strip() if heading else "Untitled Section"
        try:
            # Newlines inside the markup hurt parsing, so drop them before
            # handing the HTML to pandas via StringIO.
            frame = pd.read_html(StringIO(str(tbl).replace('\n', '')))[0]
            rendered_tables += (
                f"\nSection: {section}\n"
                + "=" * 40 + "\n"
                + frame.to_string(index=False) + "\n\n"
            )
        except Exception as exc:
            # Best effort: record the failure and move on to the next table.
            rendered_tables += f"\nError processing table in section {section}: {str(exc)}\n"

    # Convert the whole page to Markdown.
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.bypass_tables = False
    converter.ignore_images = True  # optional
    converter.body_width = 0        # don't wrap lines
    markdown = converter.handle(page_html)

    if rendered_tables:
        markdown = f'Tables:\n{rendered_tables}\n\nContent\n{markdown}'
    return markdown
if __name__ == "__main__":
    # Ad-hoc manual checks: both calls hit the live network, and the tool
    # invocation additionally requires a valid OPENAI_API_KEY.
    answer = web_page_information_extractor.invoke({
        "url": "https://en.wikipedia.org/wiki/1928_Summer_Olympics",
        "request": "List of countries and number of athletes at the 1928 Summer Olympics",
    })
    print(answer)

    print(_get_text_from_url("https://en.wikipedia.org/wiki/Malko_Competition"))