import io
import os
import urllib.parse
from functools import lru_cache
from typing import List, Dict, Any, Union

import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from pydantic import BaseModel, Field
from smolagents import Tool, WebSearchTool, WikipediaSearchTool, PythonInterpreterTool
from transformers import pipeline  # You'll need: pip install transformers torch accelerate

# ------------------ Simple wrapper tools to save loading time ------------------------
class CachedWebSearchTool(WebSearchTool):
    """WebSearchTool variant whose `run` results are memoized per instance.

    The cache holds up to 128 distinct queries and is created lazily on the
    first call. It lives on the instance rather than on the class, so it is
    not shared between instances and does not keep instances alive after
    they are otherwise unreferenced (which `lru_cache` applied directly to a
    method would do).
    """

    def run(self, query: str):
        """Return the parent's `run(query)` result, reusing it for repeated queries."""
        # NOTE(review): this overrides `run`; confirm the installed smolagents
        # base class really exposes `run` (tools are usually invoked through
        # `forward` / `__call__`), otherwise this cache is never exercised.
        cached_run = getattr(self, "_cached_run", None)
        if cached_run is None:
            # Wrap the parent's bound method once; identical queries then return instantly.
            cached_run = lru_cache(maxsize=128)(super().run)
            self._cached_run = cached_run
        return cached_run(query)

class CachedWikiTool(WikipediaSearchTool):
    """WikipediaSearchTool variant whose `run` results are memoized per instance.

    The cache holds up to 128 distinct page arguments and is created lazily
    on the first call. It is stored on the instance, so it is not shared
    between instances and does not pin them in memory the way `lru_cache`
    applied directly to a method would.
    """

    def run(self, page: str):
        """Return the parent's `run(page)` result, reusing it for repeated pages."""
        # NOTE(review): this overrides `run`; confirm the installed smolagents
        # base class really exposes `run` (tools are usually invoked through
        # `forward` / `__call__`), otherwise this cache is never exercised.
        cached_run = getattr(self, "_cached_run", None)
        if cached_run is None:
            cached_run = lru_cache(maxsize=128)(super().run)
            self._cached_run = cached_run
        return cached_run(page)

class PreloadedPythonTool(PythonInterpreterTool):
    """
    PythonInterpreterTool variant that puts a fixed set of import statements
    (bs4, BeautifulSoup, regex) in front of every snippet before handing it
    to the parent's `run`, so snippets can use those names without importing them.
    """
    def run(self, code: str) -> str:
        # One import statement per entry; each gets its own trailing newline.
        import_statements = (
            "import bs4",
            "from bs4 import BeautifulSoup",
            "import regex",
        )
        preamble = "".join(f"{statement}\n" for statement in import_statements)
        return super().run(preamble + code)


# --------------------- Describe image file with text --------------------------- #
class ImageContentDescriberTool(Tool):
    """
    Tool wrapper that captions an image given its URL.

    The class attributes declare the tool's metadata; the actual work is
    delegated to the module-level `describe_image_from_url` helper.
    """
    name: str = "describe_image_content"
    description: str = "Downloads an image from a URL and provides a textual description of its main content. It CANNOT solve complex puzzles like chess positions but can identify objects and scenes."

    # Single string argument: the image location.
    inputs: Dict[str, Dict[str, Union[str, Any]]] = {
        "image_url": {"type": "string", "description": "The URL of the image to describe."},
    }
    output_type: str = "string"

    def forward(self, image_url: str) -> str:
        """Return the helper's description (or error message) for `image_url`."""
        caption_or_error = describe_image_from_url(image_url)
        return caption_or_error

# Lazy-load the vision model
image_captioner = None
def describe_image_from_url(image_url: str) -> str:
    """
    Download an image from a URL and generate a text description of it.

    The captioning pipeline is created on first use and cached in the
    module-level `image_captioner` global, so later calls reuse it.

    Args:
        image_url: The URL of the image to download and describe.

    Returns:
        "Image description: <caption>" on success, otherwise a human-readable
        error string. Failures are reported in the return value, not raised.
    """
    global image_captioner
    if image_captioner is None:
        try:
            print("Initializing Image Captioning model for the first time...")
            # Using a smaller, faster BLIP model.
            image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
            print("Image Captioning model initialized.")
        except Exception as e:
            return f"Error: Could not initialize the image captioning model. Details: {e}"

    try:
        print(f"Downloading image from {image_url}...")
        response = requests.get(image_url, timeout=15)
        # Fail on 4xx/5xx instead of handing an HTML error page to the image decoder.
        response.raise_for_status()
        # `response.content` is the fully downloaded, content-decoded body; the raw
        # socket stream is not decoded (gzip/deflate) and leaves the connection open.
        image = Image.open(io.BytesIO(response.content))
        print("Generating image description...")
        description = image_captioner(image)[0]['generated_text']
        return f"Image description: {description}"
    except Exception as e:
        return f"An error occurred while processing the image file: {e}"


# --------------------- Transcribe audio file to text ---------------------------- #
class TranscribeAudioTool(Tool):
    """
    Tool wrapper that turns a local audio file into text.

    The class attributes declare the tool's metadata; transcription itself is
    delegated to the module-level `transcribe_local_audio` helper.
    """
    name: str = "transcribe_audio_file"
    description: str = "Transcribes a local audio file (e.g., .mp3, .wav, .flac) from a file path into text."

    # Single string argument: where the audio file lives on disk.
    inputs: Dict[str, Dict[str, Union[str, Any]]] = {
        "file_path": {"type": "string", "description": "The local path to the audio file to transcribe."},
    }
    output_type: str = "string"

    def forward(self, file_path: str) -> str:
        """Return the helper's transcription (or error message) for `file_path`."""
        transcript_or_error = transcribe_local_audio(file_path)
        return transcript_or_error

# --- Helper function for TranscribeAudioTool ---
# Note: This requires ffmpeg to be installed on your system: sudo apt-get install ffmpeg
# The first time this pipeline is created, it will download the model (e.g., Whisper).
# We lazy-load it to avoid loading it if the tool is never used.
audio_transcriber = None
def transcribe_local_audio(file_path: str) -> str:
    """
    Transcribe a local audio file to text.

    The speech-to-text pipeline is created on first use and cached in the
    module-level `audio_transcriber` global, so later calls reuse it.

    Args:
        file_path: The local path of the audio file to transcribe.

    Returns:
        The transcribed text on success, otherwise a human-readable error
        string. Failures are reported in the return value, not raised.
    """
    global audio_transcriber

    # Validate the path first: a wrong path should be reported immediately and
    # must not trigger the (slow) model download/initialization.
    if not os.path.exists(file_path):
        return f"Error: The audio file at path '{file_path}' was not found."

    if audio_transcriber is None:
        try:
            # Using a smaller, faster Whisper model. Larger models can be used for higher accuracy.
            print("Initializing Speech-to-Text model...")
            audio_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base")
            print("Speech-to-Text model initialized.")
        except Exception as e:
            return f"Error: Could not initialize the speech-to-text model. Details: {e}"

    try:
        print(f"Transcribing audio from {file_path}...")
        transcription = audio_transcriber(file_path)
        # The pipeline output is a dictionary, we want the text.
        return transcription['text'] if 'text' in transcription else "Transcription complete, but no text was found."
    except Exception as e:
        return f"An error occurred while processing the audio file '{file_path}': {e}"


# --------------------- Read attachment file for general purpose -----------------
class ReadFileTool(Tool):
    """
    Tool that returns the text of a local file, decoded as UTF-8.

    Problems (missing path, unreadable or undecodable file) are reported as
    an error string rather than raised.
    """
    name: str = "read_file_content"
    description: str = (
        "Reads the raw text content from a local file path and returns it as a string. "
        "Use this for .txt, .py, .md, .csv, and other plain text files."
    )
    # Single string argument: the path to read.
    inputs: Dict[str, Dict[str, Any]] = {
        "file_path": {"type": "string", "description": "The local path to the file (e.g., 'data/my_document.txt')."},
    }
    output_type: str = "string"

    def forward(self, file_path: str) -> str:
        """Return the file's text, or an error message if it cannot be read."""
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r', encoding='utf-8') as handle:
                    return handle.read()
            except Exception as e:
                return f"An error occurred while reading the file '{file_path}': {e}"
        return f"Error: The file at path '{file_path}' was not found."


# --------------------- Read attached Excel file ---------------------------------
class ReadExcelTool(Tool):
    """
    Tool that loads a local Excel workbook with pandas and renders it as a
    Markdown table.

    `pd.read_excel` is called with its defaults, and the resulting DataFrame
    is rendered without its index column. Problems are reported as an error
    string rather than raised.
    """
    name: str = "read_excel_file"
    description: str = (
        "Reads data from a local Excel file (.xlsx, .xls) and returns its content "
        "as a Markdown-formatted table. This is the primary tool for analyzing spreadsheet data."
    )
    # Single string argument: the workbook path.
    inputs: Dict[str, Dict[str, Any]] = {
        "file_path": {"type": "string", "description": "The local path to the Excel file (e.g., 'data/sales_report.xlsx')."},
    }
    output_type: str = "string"

    def forward(self, file_path: str) -> str:
        """Return the workbook as a Markdown table, or an error message."""
        if os.path.exists(file_path):
            try:
                # Load into a DataFrame, then render without the index column.
                return pd.read_excel(file_path).to_markdown(index=False)
            except Exception as e:
                return f"An error occurred while reading the Excel file '{file_path}': {e}"
        return f"Error: The file at path '{file_path}' was not found."

# --------------------- Read code file from URL ----------------------------------
class ReadContentFromURLTool(Tool):
    """
    Tool wrapper that fetches the text behind a URL.

    The class attributes declare the tool's metadata; fetching is delegated
    to the module-level `read_content_from_url` helper.
    """
    name: str = "read_code_from_url"
    description: str = (
        "Reads the content of a code file from a given URL and returns it as a string. "
        "Use this to analyze Python scripts or other text files available on the web."
    )

    # Single string argument: the address to download.
    inputs: Dict[str, Dict[str, Union[str, Any]]] = {
        "url": {"type": "string", "description": "The URL of the code file to read."},
    }

    output_type: str = "string"

    def forward(self, url: str) -> str:
        """Return the helper's result (file text or error message) for `url`."""
        text_or_error = read_content_from_url(url)
        return text_or_error

def read_content_from_url(url: str) -> str:
    """
    Download `url` and hand back the response body as text.

    Args:
        url: The URL of the file to read.

    Returns:
        The response text on success; otherwise a message naming the kind of
        failure (HTTP status, connection, timeout, other request error, or
        anything unexpected). Nothing is raised to the caller.
    """
    # Custom User-Agent sent with the request.
    request_headers = {
        'User-Agent': 'MyAgent/1.0 (https://example.com; myemail@example.com)'
    }

    try:
        reply = requests.get(url, headers=request_headers, timeout=15)
        reply.raise_for_status()  # 4xx / 5xx become HTTPError
        return reply.text
    except Exception as exc:
        # Checked top to bottom, most specific first; the first match decides the message.
        labelled_failures = (
            (requests.exceptions.HTTPError, "HTTP Error"),
            (requests.exceptions.ConnectionError, "Connection Error"),
            (requests.exceptions.Timeout, "Timeout Error"),
            (requests.exceptions.RequestException, "Error"),
        )
        for failure_type, label in labelled_failures:
            if isinstance(exc, failure_type):
                return f"{label} fetching content from {url}: {exc}"
        return f"An unexpected error occurred while reading from URL: {exc}"


# --------------------- Webpage structure analyzer -------------------------------
class WebpageStructureAnalyzerTool(Tool):
    """
    A tool to fetch a webpage and analyze its basic HTML structure.
    It helps in understanding the page layout before attempting detailed parsing.

    The class attributes declare the tool's metadata; the analysis itself is
    delegated to the module-level `analyze_webpage_structure` helper.
    """
    name: str = "analyze_webpage_structure"
    # Kept in sync with what `analyze_webpage_structure` actually reports.
    description: str = (
        "Fetches a webpage and returns a summary of its HTML structure "
        "(title, meta description, headings H1-H4, the first links and images, "
        "tables found with their headers and row counts, "
        "and counts of lists and forms). Use this tool *first* to understand "
        "a webpage's layout *before* trying to write specific 'bs4' code "
        "to extract detailed information."
    )
    # Maps each argument name to a dict with a 'type' (a type *name* such as
    # "string", not a Python type object) and a 'description'.
    inputs: Dict[str, Dict[str, Union[str, type, bool]]] = {
        "url": {
            "type": "string",
            "description": "The URL of the webpage to analyze."
        }
    }
    output_type: str = "string"

    def forward(self, url: str) -> str:
        """
        Executes the webpage structure analysis.

        Args:
            url: The URL of the webpage to analyze.

        Returns:
            A string containing the structure summary or an error message.
        """
        return analyze_webpage_structure(url)

def analyze_webpage_structure(url: str) -> str:
    """
    Fetches a webpage and returns a text summary of its key HTML structure.

    The summary lists, in order: title, meta description, H1-H4 headings
    (count plus the first five of each level), links and images (count plus
    the first five), tables (count plus headers and row count for the first
    three), list counts, and forms (count plus action and non-hidden input
    names for the first three).

    Args:
        url: The URL of the webpage to analyze.

    Returns:
        A string containing a summary of the HTML structure, or an error
        message. Failures are reported in the return value, not raised.
    """
    summary_lines: List[str] = []

    # Define a User-Agent to mimic a browser, reducing chances of being blocked
    request_headers: Dict[str, str] = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Fetch the webpage content
        response = requests.get(url, headers=request_headers, timeout=15)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        summary_lines.append(f"--- Structure Summary for: {url} ---")

        # 1. Title
        # `.string` is None for an empty <title> or one with nested tags, which
        # used to abort the whole analysis; get_text() always yields a str.
        title_tag = soup.title
        title = title_tag.get_text(strip=True) if title_tag is not None else ""
        summary_lines.append(f"\n[Title]: {title or 'N/A'}")

        # 2. Meta Description
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc is not None and meta_desc.has_attr('content'):
            description = meta_desc['content']
        else:
            description = "N/A"
        summary_lines.append(f"[Meta Description]: {description.strip()}")

        # 3. Headings (H1-H4)
        summary_lines.append("\n[Headings]:")
        for level in range(1, 5):
            headings = soup.find_all(f'h{level}')
            summary_lines.append(f"  - H{level} Tags Found: {len(headings)}")
            # Show the first 5 headings for brevity
            for heading in headings[:5]:
                summary_lines.append(f"    - {heading.get_text(strip=True)[:100]}")  # Limit length

        # 4. Links
        links = soup.find_all('a')
        summary_lines.append("\n[Links]:")
        summary_lines.append(f"  - Total Links Found: {len(links)}")
        # Show the first 5 links
        for link in links[:5]:
            href = link.get('href', 'N/A')
            text = link.get_text(strip=True)[:80]  # Limit length
            summary_lines.append(f"  - [{text}] -> {href}")

        # 5. Images
        images = soup.find_all('img')
        summary_lines.append("\n[Images]:")
        summary_lines.append(f"  - Total Images Found: {len(images)}")
        # Show the first 5 image alts/srcs
        for img in images[:5]:
            alt = img.get('alt', 'No alt text')[:80]  # Limit length
            src = img.get('src', 'N/A')
            summary_lines.append(f"  - [Alt: {alt}] -> {src}")

        # 6. Tables
        tables = soup.find_all('table')
        summary_lines.append("\n[Tables]:")
        summary_lines.append(f"  - Total Tables Found: {len(tables)}")
        for i, table in enumerate(tables[:3]):  # Show info for first 3 tables
            # Named distinctly so the request headers above are not overwritten.
            table_headers = [th.get_text(strip=True) for th in table.find_all('th', limit=10)]
            rows = table.find_all('tr')
            if table_headers:
                summary_lines.append(f"  - Table {i+1} (Rows: {len(rows)}): Headers = {table_headers}")
            else:
                summary_lines.append(f"  - Table {i+1} (Rows: {len(rows)}): No <th> headers found.")

        # 7. Lists
        ul_lists = soup.find_all('ul')
        ol_lists = soup.find_all('ol')
        summary_lines.append("\n[Lists]:")
        summary_lines.append(f"  - Unordered Lists (ul) Found: {len(ul_lists)}")
        summary_lines.append(f"  - Ordered Lists (ol) Found: {len(ol_lists)}")

        # 8. Forms
        forms = soup.find_all('form')
        summary_lines.append("\n[Forms]:")
        summary_lines.append(f"  - Total Forms Found: {len(forms)}")
        for i, form in enumerate(forms[:3]):  # Show info for first 3 forms
            action = form.get('action', 'N/A')
            # Hidden inputs are left out of the listing.
            input_names = [
                field.get('name', 'No name')
                for field in form.find_all('input')
                if field.get('type') != 'hidden'
            ]
            summary_lines.append(f"  - Form {i+1} (Action: {action}): Inputs = {input_names[:5]}")

        summary_lines.append("\n------------------------------------")

        return "\n".join(summary_lines)

    except requests.exceptions.HTTPError as http_err:
        return f"HTTP Error fetching webpage {url}: {http_err}"
    except requests.exceptions.ConnectionError as conn_err:
        return f"Connection Error fetching webpage {url}: {conn_err}"
    except requests.exceptions.Timeout as timeout_err:
        return f"Timeout Error fetching webpage {url}: {timeout_err}"
    except requests.exceptions.RequestException as req_err:
        return f"Error fetching webpage {url}: {req_err}"
    except Exception as e:
        return f"An error occurred while analyzing {url}: {e}"


# --------------- Summarize webpage content ------------------------#
class SummarizeWebpageContentTool(Tool):
    """
    Tool wrapper that returns a short text overview of a webpage.

    The class attributes declare the tool's metadata; the work is delegated
    to the module-level `summarize_webpage_content` helper, called with its
    default length limit.
    """
    name: str = "summarize_webpage_content"
    description: str = (
        "Fetches a webpage and returns a concise summary of its main textual content. "
        "Use this instead of 'visit_webpage' when you need an overview of the text, not its full structure or HTML."
    )

    # Single string argument: the page address.
    inputs: Dict[str, Dict[str, Union[str, Any]]] = {
        "url": {"type": "string", "description": "The URL of the webpage to summarize."},
    }

    output_type: str = "string"

    def forward(self, url: str) -> str:
        """Return the helper's summary (or error message) for `url`."""
        summary_or_error = summarize_webpage_content(url)
        return summary_or_error

def summarize_webpage_content(url: str, max_length: int = 1500) -> str:
    """
    Fetches the main textual content of a webpage and returns a concise summary.

    The "summary" is the text of the page's <p> elements (taken from <main>,
    else <article>, else <body>, else the whole document), joined with spaces
    and truncated to `max_length` characters at a word boundary.

    Args:
        url: The URL of the webpage to summarize.
        max_length: Maximum number of characters of page text to keep.

    Returns:
        "Summary of content from <url>:" followed by the text, or an error
        message. Failures are reported in the return value, not raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Strip out script, style, nav, and footer tags to get cleaner text
        for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
            element.decompose()

        # Extract text from the main body, focusing on paragraphs.
        # html.parser does not invent a <body>, so fall back to the whole
        # document for fragments instead of failing on None.
        main_content = soup.find('main') or soup.find('article') or soup.find('body') or soup

        # Collapse whitespace per paragraph. get_text(strip=True) would glue
        # the pieces around inline tags together ("a <b>b</b>" -> "ab").
        text_chunks = []
        for paragraph in main_content.find_all('p'):
            paragraph_text = " ".join(paragraph.get_text().split())
            if paragraph_text:
                text_chunks.append(paragraph_text)
        full_text = " ".join(text_chunks)

        if not full_text:
            return "Error: Could not extract meaningful text content from the webpage."

        # Return a truncated version as a simple summary
        summary = full_text[:max_length]
        if len(full_text) > max_length:
            # Try to cut at a word boundary
            last_space = summary.rfind(' ')
            if last_space != -1:
                summary = summary[:last_space]
            summary += "..."

        return f"Summary of content from {url}:\n{summary}"

    except requests.exceptions.RequestException as e:
        return f"Error fetching webpage {url}: {e}"
    except Exception as e:
        return f"An error occurred while summarizing {url}: {e}"


# --------------- Extract table from webpage ------------------------#
class ExtractTableFromWebpageTool(Tool):
    """
    Tool wrapper that pulls one table out of a webpage as Markdown.

    The class attributes declare the tool's metadata; extraction is delegated
    to the module-level `extract_table_from_webpage` helper.
    """
    name: str = "extract_table_from_webpage"
    description: str = (
        "Extracts a specific table from a webpage and returns it in a clean Markdown format. "
        "Use the 'table_identifier' to specify which table you want."
    )

    inputs: Dict[str, Dict[str, Union[str, Any]]] = {
        "url": {
            "type": "string",
            "description": "The URL of the webpage containing the table."
        },
        "table_identifier": {
            "type": "string",
            "description": "The index (e.g., '0' for the first table, '1' for the second) or a text keyword from the table's caption to identify which table to extract. Defaults to '0'.",
            "nullable": True
        }
    }

    output_type: str = "string" # Should match the return type of the helper function

    def forward(self, url: str, table_identifier: str = "0") -> str:
        """
        Return the selected table as Markdown, or an error message.

        Args:
            url: The URL of the webpage containing the table.
            table_identifier: Table index or caption keyword. Because the
                input is declared nullable, None is accepted and treated as
                the documented default '0'; non-string values such as an int
                index are converted to text.
        """
        if table_identifier is None:
            table_identifier = "0"
        return extract_table_from_webpage(url, str(table_identifier))

def _markdown_cell_text(cell) -> str:
    """Return a table cell's text on a single line, safe to embed in a Markdown row."""
    # Collapse every whitespace run (including newlines) into one space.
    text = " ".join(cell.get_text(" ").split())
    # A literal pipe would otherwise be read as a column separator.
    return text.replace("|", "\\|")


def extract_table_from_webpage(url: str, table_identifier: str = "0") -> str:
    """
    Fetches a webpage, finds a specific table, and returns it in Markdown format.

    Args:
        url: The URL of the webpage containing the table.
        table_identifier: Either a zero-based table index given as text
            ('0', '1', ...) or a keyword matched case-insensitively against
            each table's <caption>. None or an empty value means '0'.

    Returns:
        The table as Markdown rows (one per <tr>, keeping both <th> and <td>
        cells in document order). When the first row consists only of <th>
        cells it is followed by a `---` separator row. On failure an error
        message is returned instead; nothing is raised.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # The tool input is nullable and agents sometimes pass an int index.
    identifier = "" if table_identifier is None else str(table_identifier).strip()
    identifier = identifier or "0"

    try:
        response = requests.get(url, headers=request_headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')
        if not tables:
            return "Error: No tables found on the webpage."

        target_table = None
        if identifier.isdecimal():
            table_index = int(identifier)
            if table_index < len(tables):
                target_table = tables[table_index]
            else:
                return f"Error: Table index {table_index} is out of bounds. Only {len(tables)} tables found."
        else:
            keyword = identifier.lower()
            for table in tables:
                caption = table.find('caption')
                if caption is not None and keyword in caption.get_text().lower():
                    target_table = table
                    break

        if target_table is None:
            return f"Error: Could not find a table matching the identifier '{table_identifier}'."

        # Convert table to Markdown format, row by row.
        markdown_rows = []
        for row in target_table.find_all('tr'):
            # Rows of tables nested inside a cell belong to that inner table.
            if row.find_parent('table') is not target_table:
                continue
            # Keep <th> and <td> together so row-header cells stay in their
            # row; skip cells that sit in a nested row.
            cells = [cell for cell in row.find_all(['th', 'td']) if cell.find_parent('tr') is row]
            if not cells:
                continue
            is_first_row = not markdown_rows
            markdown_rows.append("| " + " | ".join(_markdown_cell_text(cell) for cell in cells) + " |")
            if is_first_row and all(cell.name == 'th' for cell in cells):
                markdown_rows.append("| " + " | ".join(["---"] * len(cells)) + " |")

        if not markdown_rows:
            return "Error: Found the table but could not parse its content."
        return "\n".join(markdown_rows) + "\n"

    except requests.exceptions.RequestException as e:
        return f"Error fetching webpage {url}: {e}"
    except Exception as e:
        return f"An error occurred while extracting the table from {url}: {e}"


# -------------------------------- Get Section of Wikipedia --------------------------------------
class GetWikipediaSectionTool(Tool):
    """
    Tool wrapper that returns the text of one section of a Wikipedia page.

    The class attributes declare the tool's metadata; the lookup is delegated
    to the module-level `get_wikipedia_section` helper.
    """
    name: str = "get_wikipedia_section"
    description: str = (
        "Retrieves the plain text content of a specific section from a given Wikipedia page title. "
        "Useful for getting targeted information like a 'Discography', 'Filmography', or 'Early life' section."
    )

    # Two string arguments: which page, and which section of it.
    inputs: Dict[str, Dict[str, Union[str, Any]]] = {
        "page_title": {"type": "string", "description": "The exact title of the Wikipedia page (e.g., 'Mercedes Sosa')."},
        "section_title": {"type": "string", "description": "The exact title of the section to retrieve (e.g., 'Discography')."},
    }

    # The output type is given by name, as a string.
    output_type: str = "string"

    def forward(self, page_title: str, section_title: str) -> str:
        """Return the helper's section text (or error message) for the given page and section."""
        section_or_error = get_wikipedia_section(page_title, section_title)
        return section_or_error


_WIKI_HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']


def _wiki_heading_level(tag) -> Union[int, None]:
    """Return the heading level (1-6) a tag stands for, or None if it is not a heading."""
    if tag.name in _WIKI_HEADING_TAGS:
        return int(tag.name[1])
    # Current Wikipedia markup wraps each heading: <div class="mw-heading"><h2>...</h2>...</div>
    if tag.name == 'div' and 'mw-heading' in (tag.get('class') or []):
        inner = tag.find(_WIKI_HEADING_TAGS)
        if inner is not None:
            return int(inner.name[1])
    return None


def _wiki_heading_text(header) -> str:
    """Return a heading's title with whitespace collapsed and without the '[edit]' link text."""
    # Older markup keeps the title in a .mw-headline span next to the edit link.
    headline = header.find(class_='mw-headline')
    source = headline if headline is not None else header
    return " ".join(source.get_text().split())


def _wiki_block_text(tag) -> str:
    """Return a content tag's text, one non-empty, whitespace-collapsed line per source line."""
    lines = (" ".join(line.split()) for line in tag.get_text().splitlines())
    return "\n".join(line for line in lines if line)


def get_wikipedia_section(page_title: str, section_title: str) -> str:
    """
    Fetches the plain text content of a specific section from a Wikipedia page.

    Args:
        page_title: The exact title of the Wikipedia page.
        section_title: The section to retrieve. Matching is case-insensitive;
            a heading equal to this title is preferred, otherwise the first
            heading that contains it is used.

    Returns:
        The text of the section's paragraphs and lists (sub-sections
        included), one block per line group, or an error message. Failures
        are reported in the return value, not raised.
    """
    # Format the page title for a URL (e.g., "Artificial intelligence" -> "Artificial_intelligence")
    formatted_page_title = urllib.parse.quote(page_title.replace(" ", "_"))
    url = f"https://en.wikipedia.org/wiki/{formatted_page_title}"

    headers = {
        'User-Agent': 'MyAgent/1.0 (https://example.com; myemail@example.com)'
    }

    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        wanted = " ".join(section_title.split()).lower()
        section_header = None
        if wanted:
            titled_headers = [
                (header, _wiki_heading_text(header).lower())
                for header in soup.find_all(_WIKI_HEADING_TAGS)
            ]
            # Prefer an exact title match so that e.g. "History" does not pick
            # the page title "History of ..." just because it comes first.
            section_header = next((h for h, text in titled_headers if text == wanted), None)
            if section_header is None:
                section_header = next((h for h, text in titled_headers if wanted in text), None)
        if section_header is None:
            return f"Error: Section containing '{section_title}' not found on page '{page_title}'. Check capitalization and spelling."

        section_level = int(section_header.name[1])

        # When the heading sits inside a wrapper div, the section content
        # follows the wrapper, not the heading tag itself.
        anchor = section_header
        wrapper = section_header.parent
        if wrapper is not None and wrapper.name == 'div' and 'mw-heading' in (wrapper.get('class') or []):
            anchor = wrapper

        content_blocks = []
        for sibling in anchor.find_next_siblings():
            # Stop when we hit the next header of the same or higher level
            # e.g., if we matched an <h2>, stop at the next <h2> or <h1>
            sibling_level = _wiki_heading_level(sibling)
            if sibling_level is not None and sibling_level <= section_level:
                break
            # Collect the text from relevant tags like paragraphs, lists, etc.
            if sibling.name in ['p', 'ul', 'ol', 'dl']:
                block_text = _wiki_block_text(sibling)
                if block_text:
                    content_blocks.append(block_text)

        if not content_blocks:
            return f"Found section '{section_title}' but it appears to be empty or contains only non-text elements."

        return "\n".join(content_blocks)

    except requests.exceptions.HTTPError as http_err:
        if http_err.response.status_code == 404:
            return f"Error: Wikipedia page '{page_title}' not found (404 Error)."
        return f"HTTP Error fetching page '{page_title}': {http_err}"
    except Exception as e:
        return f"An unexpected error occurred: {e}"


# --- Example Usage ---
if __name__ == "__main__":
    # Manual smoke test: one banner and one structure summary per target,
    # covering a Wikipedia article, a news front page and an unresolvable host.
    demo_targets = [
        ("--- Analyzing Wikipedia ---", "https://en.wikipedia.org/wiki/Python_(programming_language)"),
        ("\n--- Analyzing BBC News ---", "https://www.bbc.com/news"),
        ("\n--- Analyzing Failing URL ---", "https://thissitedoesnotexist12345.com"),
    ]
    for banner, target_url in demo_targets:
        print(banner)
        print(analyze_webpage_structure(target_url))

    # --- Optional: Testing other tools (requires smolagents & potentially setup) ---
    # print("\n" + "="*30)
    # print("   Testing Cached Wiki Tool")
    # print("="*30)
    # 
    # try:
    #     wiki_tool_instance = CachedWikiTool()
    #     wiki_result = wiki_tool_instance.run("Artificial intelligence")
    #     print(f"Wikipedia Result (first 200 chars): {wiki_result[:200]}...")
    #     
    #     # Test caching (this *should* be instant)
    #     print("\n--- Testing Wiki Cache ---")
    #     wiki_result_cached = wiki_tool_instance.run("Artificial intelligence")
    #     print(f"Cached Result (first 200 chars): {wiki_result_cached[:200]}...")
    #     
    # except Exception as e:
    #     print(f"Could not test CachedWikiTool, likely due to missing dependencies or setup: {e}")