import os
from io import StringIO

import html2text
import pandas as pd
import requests
from bs4 import BeautifulSoup
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from loguru import logger
from pydantic import SecretStr


@tool("web_page_information_extractor_tool", parse_docstring=True)
def web_page_information_extractor(url: str, request: str) -> str:
    """
    Extracts specific information from a web page based on the user's request.

    This function uses a language model to extract information from the content
    of a web page specified by the URL. The user's request specifies the type of
    information to be extracted. The function returns the extracted information as
    a JSON string.

    Args:
        url (str): The URL of the web page to extract information from.
        request (str): The user's request describing the information to extract.

    Returns:
        str: The extracted information in JSON format.
    """
    logger.info(f"use web_page_information_extractor with param: url:{url}, request:{request}")

    text = _get_text_from_url(url)
    logger.debug(f"web_page_information_extractor text: {text}")

    chat = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,  # deterministic output for extraction
        api_key=SecretStr(os.environ['OPENAI_API_KEY'])
    )

    system_message = "You are an expert information extraction system. Respond ONLY with valid JSON based on the user's request."
    extraction_user_prompt = f"""From the text below:\n\"\"\"\n{text}\n\"\"\"\n\nExtract the following: "{request}"."""

    extracted_information = chat.invoke([
        SystemMessage(system_message),
        HumanMessage(extraction_user_prompt)
    ])

    # `.content` is typed str | list by langchain; coerce so the declared
    # `-> str` contract always holds.
    content = extracted_information.content
    if not isinstance(content, str):
        content = str(content)

    # Models frequently wrap JSON in markdown fences despite the system
    # instructions; strip them so callers receive directly parseable JSON.
    content = content.strip()
    if content.startswith("```"):
        content = content.removeprefix("```json").removeprefix("```").strip()
        content = content.removesuffix("```").strip()
    return content

def _get_text_from_url(url: str) -> str:
    """Download *url* and return its content as markdown text.

    Any ``wikitable``-classed tables are additionally rendered as plain-text
    pandas DataFrames (labelled by their nearest preceding h2/h3 section
    heading) and prepended ahead of the markdown body.
    """
    resp = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()  # propagate HTTP errors to the caller
    page_html = resp.text

    soup = BeautifulSoup(page_html, "html.parser")

    # Render each wikitable separately — html2text's table output is often
    # hard for an LLM to read, while DataFrame.to_string keeps columns aligned.
    rendered = []
    for table in soup.find_all('table', class_='wikitable'):
        # The nearest preceding h2/h3 gives the table its section context.
        heading = table.find_previous(['h2', 'h3'])
        section_title = heading.get_text().strip() if heading else "Untitled Section"
        try:
            compact_html = str(table).replace('\n', '')  # newlines hinder parsing
            frame = pd.read_html(StringIO(compact_html))[0]
            rendered.append(
                f"\nSection: {section_title}\n"
                + "=" * 40 + "\n"
                + frame.to_string(index=False) + "\n\n"
            )
        except Exception as e:
            # Record the failure inline rather than losing the whole page.
            rendered.append(f"\nError processing table in section {section_title}: {str(e)}\n")

    # Convert the full page to markdown.
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.bypass_tables = False
    converter.ignore_images = True  # optional
    converter.body_width = 0  # don't wrap lines

    text = converter.handle(page_html)
    tables_text = "".join(rendered)
    if tables_text:
        text = f'Tables:\n{tables_text}\n\nContent\n{text}'

    return text

if __name__ == "__main__":
    # Demo runs — each hits the live network and the OpenAI API.

    # result = web_page_information_extractor.invoke(
    #     {"url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
    #      "request": "What are changes introduced in Python 3.11"})
    # print(result)

    demo_query = {
        "url": "https://en.wikipedia.org/wiki/1928_Summer_Olympics",
        "request": "List of countries and number of athletes at the 1928 Summer Olympics",
    }
    print(web_page_information_extractor.invoke(demo_query))

    # result = web_page_information_extractor.invoke(
    #     {"url": "https://chem.libretexts.org/Courses/Chabot_College/Introduction_to_General_Organic_and_Biochemistry/01%3A_Chemistry_in_our_Lives/1.E%3A_Exercises",
    # "request": "What is the surname of the equine veterinarian mentioned"})
    # print(result)

    # Raw-extraction check without the LLM step.
    print(_get_text_from_url("https://en.wikipedia.org/wiki/Malko_Competition"))