"""Web-page information extraction tool built on LangChain, BeautifulSoup and pandas."""
import os
from io import StringIO

import html2text
import pandas as pd
import requests
from bs4 import BeautifulSoup
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from loguru import logger
from pydantic import SecretStr
@tool
def web_page_information_extractor(url: str, request: str) -> str:
    """
    Extracts specific information from a web page based on the user's request.

    This function uses a language model to extract information from the content
    of a web page specified by the URL. The user's request specifies the type of
    information to be extracted. The function returns the extracted information as
    a JSON string.

    Args:
        url (str): The URL of the web page to extract information from.
        request (str): The user's request describing the information to extract.

    Returns:
        str: The extracted information in JSON format.

    Raises:
        requests.HTTPError: If fetching the page fails (propagated from
            ``_get_text_from_url``).
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.
    """
    # NOTE(fix): the @tool decorator was missing even though `tool` is imported
    # and callers use `web_page_information_extractor.invoke({...})`, which only
    # exists on the decorated tool object.
    logger.info(f"use web_page_information_extractor with param: url:{url}, request:{request}")
    text = _get_text_from_url(url)
    logger.debug(f"web_page_information_extractor text: {text}")

    # temperature=0 for deterministic extraction; key wrapped in SecretStr so
    # it is not exposed in reprs/logs.
    chat = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,
        api_key=SecretStr(os.environ['OPENAI_API_KEY'])
    )
    system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
    extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""
    extracted_information = chat.invoke([
        SystemMessage(system_message),
        HumanMessage(extraction_user_prompt)
    ])
    return extracted_information.content
def _get_text_from_url(url: str) -> str:
    """Fetch *url* and return its content as plain text.

    Wikipedia-style tables (class ``wikitable``) are rendered first via
    pandas, each prefixed with the title of the nearest preceding h2/h3
    heading; the full page is then appended as Markdown produced by
    html2text.
    """
    resp = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()  # surface HTTP errors to the caller
    page_html = resp.text

    soup = BeautifulSoup(page_html, "html.parser")

    # Render every wikitable as a plain-text pandas table, grouped under the
    # title of the section it appears in.
    rendered_tables = ""
    for tbl in soup.find_all('table', class_='wikitable'):
        heading = tbl.find_previous(['h2', 'h3'])
        section = heading.get_text().strip() if heading else "Untitled Section"
        try:
            # Newlines inside the markup hurt parsing, so drop them before
            # handing the HTML to pandas via StringIO.
            frame = pd.read_html(StringIO(str(tbl).replace('\n', '')))[0]
            rendered_tables += (
                f"\nSection: {section}\n"
                + "=" * 40 + "\n"
                + frame.to_string(index=False) + "\n\n"
            )
        except Exception as exc:
            # Best effort: record the failure and move on to the next table.
            rendered_tables += f"\nError processing table in section {section}: {str(exc)}\n"

    # Convert the whole page to Markdown.
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.bypass_tables = False
    converter.ignore_images = True  # optional
    converter.body_width = 0        # don't wrap lines
    markdown = converter.handle(page_html)

    if rendered_tables:
        markdown = f'Tables:\n{rendered_tables}\n\nContent\n{markdown}'
    return markdown
if __name__ == "__main__":
    # Ad-hoc manual checks: both calls hit the live network, and the tool
    # invocation additionally requires a valid OPENAI_API_KEY.
    answer = web_page_information_extractor.invoke({
        "url": "https://en.wikipedia.org/wiki/1928_Summer_Olympics",
        "request": "List of countries and number of athletes at the 1928 Summer Olympics",
    })
    print(answer)

    print(_get_text_from_url("https://en.wikipedia.org/wiki/Malko_Competition"))