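"""Web-browsing helper tools for smolagents agents: cached web and Wikipedia
search tools, a Python interpreter preloaded with bs4 imports, and tools for
analyzing webpage structure, summarizing page text, and extracting tables."""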
import requests
from bs4 import BeautifulSoup
from typing import Dict, List
from functools import lru_cache
from smolagents import Tool, WebSearchTool, WikipediaSearchTool, PythonInterpreterTool


class CachedWebSearchTool(WebSearchTool):
    @lru_cache(maxsize=128)
    def run(self, query: str):
        # Identical queries return instantly from the in-memory cache.
        return super().run(query)


class CachedWikiTool(WikipediaSearchTool):
    @lru_cache(maxsize=128)
    def run(self, page: str):
        return super().run(page)
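
# Note: lru_cache on an instance method also hashes `self`, so each tool
# instance gets its own cache entries and stays alive while cached results
# exist; with maxsize=128 this is an acceptable trade-off here. Depending on
# the smolagents version, tool execution may be dispatched through `forward()`
# rather than `run()`, so verify this override sits on the actual call path.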


class PreloadedPythonTool(PythonInterpreterTool):
    """
    A PythonInterpreterTool that automatically prepends the necessary imports
    (bs4, BeautifulSoup, regex) so you never hit a NameError inside your code
    blocks.
    """
    def run(self, code: str) -> str:
        preamble = (
            "import bs4\n"
            "from bs4 import BeautifulSoup\n"
            "import regex\n"
        )
        return super().run(preamble + code)
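
# Usage sketch (assumes execution is routed through this `run` override):
#     py_tool = PreloadedPythonTool()
#     py_tool.run("soup = BeautifulSoup('<p>hi</p>', 'html.parser')\n"
#                 "print(soup.p.text)")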


# --------------------- Webpage structure analyzer -------------------------------
class WebpageStructureAnalyzerTool(Tool):
    """
    A tool to fetch a webpage and analyze its basic HTML structure.
    It helps in understanding the page layout before attempting detailed parsing.
    """
    name: str = "analyze_webpage_structure"
    description: str = (
        "Fetches a webpage and returns a summary of its HTML structure "
        "(title, meta description, headings H1-H4, links, images, tables "
        "with their headers, and counts of lists and forms). Use this tool "
        "*first* to understand a webpage's layout *before* trying to write "
        "specific 'bs4' code to extract detailed information."
    )
    # smolagents expects inputs as {argument_name: {"type": ..., "description": ...}},
    # where "type" is a JSON-schema-style type name such as "string".
    inputs: Dict[str, Dict[str, str]] = {
        "url": {
            "type": "string",
            "description": "The URL of the webpage to analyze."
        }
    }
    output_type: str = "string"
    def forward(self, url: str) -> str:
        """
        Executes the webpage structure analysis.

        Args:
            url: The URL of the webpage to analyze.

        Returns:
            A string containing the structure summary or an error message.
        """
        # Delegate to the module-level helper below.
        return analyze_webpage_structure(url)


def analyze_webpage_structure(url: str) -> str:
    """
    Fetches a webpage and returns a text summary of its key HTML structure.

    Args:
        url: The URL of the webpage to analyze.

    Returns:
        A string containing a summary of the HTML structure, or an error message.
    """
    summary_lines: List[str] = []
    # Define a User-Agent to mimic a browser, reducing chances of being blocked
    headers: Dict[str, str] = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Fetch the webpage content
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        summary_lines.append(f"--- Structure Summary for: {url} ---")

        # 1. Title (soup.title.string can be None, so guard both lookups)
        title = soup.title.string.strip() if soup.title and soup.title.string else "N/A"
        summary_lines.append(f"\n[Title]: {title}")

        # 2. Meta Description
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc['content'].strip() if meta_desc and meta_desc.has_attr('content') else "N/A"
        summary_lines.append(f"[Meta Description]: {description}")

        # 3. Headings (H1-H4)
        summary_lines.append("\n[Headings]:")
        for i in range(1, 5):
            headings = soup.find_all(f'h{i}')
            summary_lines.append(f"  - H{i} Tags Found: {len(headings)}")
            # Show the first 5 headings for brevity
            for h in headings[:5]:
                summary_lines.append(f"    - {h.get_text(strip=True)[:100]}")  # Limit length

        # 4. Links
        links = soup.find_all('a')
        summary_lines.append("\n[Links]:")
        summary_lines.append(f"  - Total Links Found: {len(links)}")
        # Show the first 5 links
        for link in links[:5]:
            href = link.get('href', 'N/A')
            text = link.get_text(strip=True)[:80]  # Limit length
            summary_lines.append(f"  - [{text}] -> {href}")

        # 5. Images
        images = soup.find_all('img')
        summary_lines.append("\n[Images]:")
        summary_lines.append(f"  - Total Images Found: {len(images)}")
        # Show the first 5 image alts/srcs
        for img in images[:5]:
            alt = img.get('alt', 'No alt text')[:80]  # Limit length
            src = img.get('src', 'N/A')
            summary_lines.append(f"  - [Alt: {alt}] -> {src}")

        # 6. Tables
        tables = soup.find_all('table')
        summary_lines.append("\n[Tables]:")
        summary_lines.append(f"  - Total Tables Found: {len(tables)}")
        for i, table in enumerate(tables[:3]):  # Show info for first 3 tables
            # Distinct name so we don't shadow the request headers above
            header_cells = [th.get_text(strip=True) for th in table.find_all('th', limit=10)]
            rows = table.find_all('tr')
            if header_cells:
                summary_lines.append(f"  - Table {i+1} (Rows: {len(rows)}): Headers = {header_cells}")
            else:
                summary_lines.append(f"  - Table {i+1} (Rows: {len(rows)}): No <th> headers found.")

        # 7. Lists
        ul_lists = soup.find_all('ul')
        ol_lists = soup.find_all('ol')
        summary_lines.append("\n[Lists]:")
        summary_lines.append(f"  - Unordered Lists (ul) Found: {len(ul_lists)}")
        summary_lines.append(f"  - Ordered Lists (ol) Found: {len(ol_lists)}")

        # 8. Forms
        forms = soup.find_all('form')
        summary_lines.append("\n[Forms]:")
        summary_lines.append(f"  - Total Forms Found: {len(forms)}")
        for i, form in enumerate(forms[:3]):  # Show info for first 3 forms
            action = form.get('action', 'N/A')
            inputs = form.find_all('input')
            input_names = [inp.get('name', 'No name') for inp in inputs if inp.get('type') != 'hidden']
            summary_lines.append(f"  - Form {i+1} (Action: {action}): Inputs = {input_names[:5]}")

        summary_lines.append("\n------------------------------------")
        return "\n".join(summary_lines)

    except requests.exceptions.HTTPError as http_err:
        return f"HTTP Error fetching webpage {url}: {http_err}"
    except requests.exceptions.ConnectionError as conn_err:
        return f"Connection Error fetching webpage {url}: {conn_err}"
    except requests.exceptions.Timeout as timeout_err:
        return f"Timeout Error fetching webpage {url}: {timeout_err}"
    except requests.exceptions.RequestException as req_err:
        return f"Error fetching webpage {url}: {req_err}"
    except Exception as e:
        return f"An error occurred while analyzing {url}: {e}"
# --------------- Summarize webpage content ------------------------#
class SummarizeWebpageContentTool(Tool):
    name: str = "summarize_webpage_content"
    description: str = (
        "Fetches a webpage and returns a concise summary of its main textual content. "
        "Use this instead of 'visit_webpage' when you need an overview of the text, not its full structure or HTML."
    )
    inputs: Dict[str, Dict[str, str]] = {
        "url": {
            "type": "string",
            "description": "The URL of the webpage to summarize."
        }
    }
    output_type: str = "string"

    def forward(self, url: str) -> str:
        return summarize_webpage_content(url)


def summarize_webpage_content(url: str, max_length: int = 1500) -> str:
    """
    Fetches the main textual content of a webpage and returns a concise,
    truncated summary of it.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Strip out script, style, nav, and footer tags to get cleaner text
        for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
            element.decompose()

        # Extract text from the main body, focusing on paragraphs
        main_content = soup.find('main') or soup.find('article') or soup.find('body')
        if main_content is None:
            return "Error: Could not locate a main content area on the webpage."
        text_chunks = [p.get_text(strip=True) for p in main_content.find_all('p')]
        full_text = " ".join(text_chunks)

        if not full_text:
            return "Error: Could not extract meaningful text content from the webpage."

        # Return a truncated version as a simple summary
        summary = full_text[:max_length]
        if len(full_text) > max_length:
            # Try to cut at a word boundary
            last_space = summary.rfind(' ')
            if last_space != -1:
                summary = summary[:last_space]
            summary += "..."
        return f"Summary of content from {url}:\n{summary}"

    except requests.exceptions.RequestException as e:
        return f"Error fetching webpage {url}: {e}"
    except Exception as e:
        return f"An error occurred while summarizing {url}: {e}"
# --------------- Extract table from webpage ------------------------#
class ExtractTableFromWebpageTool(Tool):
    name: str = "extract_table_from_webpage"
    description: str = (
        "Extracts a specific table from a webpage and returns it in a clean Markdown format. "
        "Use the 'table_identifier' to specify which table you want."
    )
    inputs: Dict[str, Dict[str, str]] = {
        "url": {
            "type": "string",
            "description": "The URL of the webpage containing the table."
        },
        "table_identifier": {
            "type": "string",
            "description": "The index (e.g., '0' for the first table, '1' for the second) or a text keyword from the table's caption to identify which table to extract. Defaults to '0'."
        }
    }
    output_type: str = "string"  # Matches the return type of the helper function

    def forward(self, url: str, table_identifier: str = "0") -> str:
        return extract_table_from_webpage(url, table_identifier)


def extract_table_from_webpage(url: str, table_identifier: str = "0") -> str:
    """
    Fetches a webpage, finds a specific table, and returns it in Markdown format.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')
        if not tables:
            return "Error: No tables found on the webpage."

        # Resolve the identifier: a numeric string selects by index,
        # anything else is matched against table captions.
        target_table = None
        if table_identifier.isdigit():
            table_index = int(table_identifier)
            if table_index < len(tables):
                target_table = tables[table_index]
            else:
                return f"Error: Table index {table_index} is out of bounds. Only {len(tables)} tables found."
        else:
            for table in tables:
                caption = table.find('caption')
                if caption and table_identifier.lower() in caption.get_text().lower():
                    target_table = table
                    break

        if not target_table:
            return f"Error: Could not find a table matching the identifier '{table_identifier}'."

        # Convert table to Markdown format
        markdown_table = ""
        # Distinct name so we don't shadow the request headers above
        header_cells = [th.get_text(strip=True) for th in target_table.find_all('th')]
        if header_cells:
            markdown_table += "| " + " | ".join(header_cells) + " |\n"
            markdown_table += "| " + " | ".join(["---"] * len(header_cells)) + " |\n"
        for row in target_table.find_all('tr'):
            # Rows containing only <th> cells (e.g., the header row) yield no
            # <td> cells and are skipped here.
            cells = [td.get_text(strip=True).replace('\n', ' ') for td in row.find_all('td')]
            if cells:
                markdown_table += "| " + " | ".join(cells) + " |\n"
        return markdown_table if markdown_table else "Error: Found the table but could not parse its content."

    except requests.exceptions.RequestException as e:
        return f"Error fetching webpage {url}: {e}"
    except Exception as e:
        return f"An error occurred while extracting the table from {url}: {e}"

# --- Example Usage ---
if __name__ == "__main__":
    test_url_wiki = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    test_url_news = "https://www.bbc.com/news"
    test_url_fail = "https://thissitedoesnotexist12345.com"

    print("--- Analyzing Wikipedia ---")
    summary_wiki = analyze_webpage_structure(test_url_wiki)
    print(summary_wiki)

    print("\n--- Analyzing BBC News ---")
    summary_news = analyze_webpage_structure(test_url_news)
    print(summary_news)

    print("\n--- Analyzing Failing URL ---")
    summary_fail = analyze_webpage_structure(test_url_fail)
    print(summary_fail)

    # --- Optional: Testing other tools (requires smolagents & potentially setup) ---
    # print("\n" + "="*30)
    # print(" Testing Cached Wiki Tool")
    # print("="*30)
    #
    # try:
    #     wiki_tool_instance = CachedWikiTool()
    #     wiki_result = wiki_tool_instance.run("Artificial intelligence")
    #     print(f"Wikipedia Result (first 200 chars): {wiki_result[:200]}...")
    #
    #     # Test caching (this *should* be instant)
    #     print("\n--- Testing Wiki Cache ---")
    #     wiki_result_cached = wiki_tool_instance.run("Artificial intelligence")
    #     print(f"Cached Result (first 200 chars): {wiki_result_cached[:200]}...")
    #
    # except Exception as e:
    #     print(f"Could not test CachedWikiTool, likely due to missing dependencies or setup: {e}")