import os
from io import StringIO

import html2text
import pandas as pd
import requests
from bs4 import BeautifulSoup
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from loguru import logger
from pydantic import SecretStr


@tool("web_page_information_extractor_tool", parse_docstring=True)
def web_page_information_extractor(url: str, request: str) -> str:
"""
Extracts specific information from a web page based on the user's request.
This function uses a language model to extract information from the content
of a web page specified by the URL. The user's request specifies the type of
information to be extracted. The function returns the extracted information as
a JSON string.
Args:
url (str): The URL of the web page to extract information from.
request (str): The user's request describing the information to extract.
Returns:
str: The extracted information in JSON format.
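
    Example:
        Illustrative call only (it mirrors the check under ``__main__``); the
        exact JSON returned depends on the page content and the model:

        >>> result = web_page_information_extractor.invoke(
        ...     {"url": "https://en.wikipedia.org/wiki/1928_Summer_Olympics",
        ...      "request": "List of countries and number of athletes"})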
"""
logger.info(f"use web_page_information_extractor with param: url:{url}, request:{request}")
text = _get_text_from_url(url)
logger.debug(f"web_page_information_extractor text: {text}")
chat = ChatOpenAI(
model="gpt-4o-mini",
temperature=0,
api_key=SecretStr(os.environ['OPENAI_API_KEY'])
)
system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""
extracted_information = chat.invoke([
SystemMessage(system_message),
HumanMessage(extraction_user_prompt)
])
return extracted_information.content
def _get_text_from_url(url: str) -> str:
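    """Fetch the URL and return its text: 'wikitable' tables rendered via pandas first,
    followed by the full page converted to Markdown."""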
    response = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()  # Raises HTTPError for bad responses
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    # Get tables
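    # Wikipedia-style tables are extracted separately with pandas so their row/column
    # structure is preserved; html2text alone tends to flatten complex tables.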
    tables = soup.find_all('table', class_='wikitable')
    tables_text = ""
    for i, table in enumerate(tables, 1):
        # Find the nearest preceding h2 or h3 header
        header = table.find_previous(['h2', 'h3'])
        section_title = header.get_text().strip() if header else "Untitled Section"
        try:
            # Convert table to pandas DataFrame using StringIO
            table_html = str(table).replace('\n', '')  # Remove newlines for better parsing
            df = pd.read_html(StringIO(table_html))[0]
            # Format the table with section title, context, and clean layout
            tables_text += f"\nSection: {section_title}\n"
            tables_text += "=" * 40 + "\n"
            tables_text += df.to_string(index=False) + "\n\n"
        except Exception as e:
            tables_text += f"\nError processing table in section {section_title}: {str(e)}\n"
            continue
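    # Note: the Markdown conversion below also renders the tables, so table content can
    # appear twice; the pandas rendering above preserves the structure more reliably.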
    # Convert the full HTML page to Markdown
    markdown_converter = html2text.HTML2Text()
    markdown_converter.ignore_links = False
    markdown_converter.bypass_tables = False
    markdown_converter.ignore_images = True  # optional
    markdown_converter.body_width = 0  # don't wrap lines
    text = markdown_converter.handle(html)

    if tables_text:
        text = f'Tables:\n{tables_text}\n\nContent\n{text}'
    return text


if __name__ == "__main__":
    # result = web_page_information_extractor.invoke(
    #     {"url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
    #      "request": "What are changes introduced in Python 3.11"})
    # print(result)
    result = web_page_information_extractor.invoke(
        {"url": "https://en.wikipedia.org/wiki/1928_Summer_Olympics",
         "request": "List of countries and number of athletes at the 1928 Summer Olympics"})
    print(result)
    # result = web_page_information_extractor.invoke(
    #     {"url": "https://chem.libretexts.org/Courses/Chabot_College/Introduction_to_General_Organic_and_Biochemistry/01%3A_Chemistry_in_our_Lives/1.E%3A_Exercises",
    #      "request": "What is the surname of the equine veterinarian mentioned"})
    # print(result)
    print(_get_text_from_url("https://en.wikipedia.org/wiki/Malko_Competition"))