from __future__ import annotations
import os
os.environ['HF_HOME'] = '/tmp/hf_cache'
os.makedirs(os.environ['HF_HOME'], exist_ok=True) # Ensure the directory exists
import gradio as gr
import subprocess
import os
import re
import tempfile
import json
import csv
# Removed: from typing import Iterable # Added for Theme
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils
# Removed: from gradio.themes.base import Base # Added for Theme
# Removed: from gradio.themes.utils import colors, fonts, sizes # Added for Theme
import markdown_pdf # Added for PDF conversion

# --- Custom Theme Definition --- (REMOVED Seafoam class and instance)

def is_github_repo(url_or_id):
    """Check if the input is a GitHub repository URL or ID."""
    if "github.com" in url_or_id:
        return True
    if re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id):
        return True
    return False

def check_repomix_installed():
    """Check if Repomix is installed."""
    try:
        result = subprocess.run(["repomix", "--version"], 
                               capture_output=True, text=True, check=False)
        return result.returncode == 0
    except Exception:
        return False

def run_repomix(repo_url_or_id, progress=gr.Progress(track_tqdm=True)):
    """Run Repomix on the GitHub repository and return the content."""
    progress(0, desc="Starting Repomix processing...")
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            output_file_name = "repomix-output.md" 
            output_file_path = os.path.join(temp_dir, output_file_name)
            
            if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
                repo_url = f"https://github.com/{repo_url_or_id}"
            else:
                repo_url = repo_url_or_id
            
            progress(0.2, desc=f"Running Repomix on {repo_url}...")
            cmd = [
                "repomix",
                "--remote", repo_url,
                "--output", output_file_path, 
                "--style", "markdown", 
                "--compress" 
            ]
            
            process = subprocess.run(cmd, capture_output=True, text=True, check=False, encoding='utf-8')
            progress(0.8, desc="Repomix command executed.")

            if process.returncode != 0:
                error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
                return f"Error running Repomix:\n{error_details}", None
            
            if os.path.exists(output_file_path):
                with open(output_file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                progress(1, desc="Repomix output processed.")
                return content, output_file_path
            else:
                error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
                return f"Error: Repomix did not generate an output file at '{output_file_path}'.\nRepomix Output:\n{error_details}", None
    
    except Exception as e:
        progress(1, desc="Error during Repomix processing.")
        return f"Error processing GitHub repository: {str(e)}", None

def scrape_and_convert_website(url, depth, progress=gr.Progress(track_tqdm=True)):
    """Fetch HTML, extract links, convert to Markdown."""
    progress(0, desc=f"Starting web scrape for {url}...")
    visited_urls = set()
    all_markdown_content = ""

    def recursive_scrape(current_url, current_depth, total_links_estimate=1, link_index=0):
        if current_url in visited_urls or current_depth < 0:
            return ""
        
        visited_urls.add(current_url)
        
        try:
            progress_val = link_index / total_links_estimate if total_links_estimate > 0 else 0
            progress(progress_val, desc=f"Scraping: {current_url} (Depth: {depth - current_depth})")
            html_content = Scraper.fetch_html(current_url)
        except Exception as e:
            return f"Error fetching {current_url}: {str(e)}\n"

        markdown_content = f"## Extracted from: {current_url}\n\n"
        markdown_content += Converter.html_to_markdown(
            html=html_content,
            base_url=current_url,
            parser_features='html.parser',
            ignore_links=True 
        )
        
        page_content = markdown_content + "\n\n"

        if current_depth > 0:
            try:
                links = LinkExtractor.scrape_url(current_url, link_type=LinkType.INTERNAL)
                valid_links = [
                    link for link in links 
                    if URLUtils.is_internal(link, current_url) and link not in visited_urls
                ]
                
                num_links = len(valid_links)
                for i, link_url in enumerate(valid_links):
                    page_content += recursive_scrape(link_url, current_depth - 1, num_links, i)
            except Exception as e:
                page_content += f"Error extracting links from {current_url}: {str(e)}\n"
        return page_content

    all_markdown_content = recursive_scrape(url, depth)
    progress(1, desc="Web scraping complete.")
    
    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp_file:
        tmp_file.write(all_markdown_content)
        return all_markdown_content, tmp_file.name

def convert_to_json(markdown_content, source_url_or_id):
    data = {"source": source_url_or_id, "content": markdown_content}
    return json.dumps(data, indent=2)

def convert_to_csv(markdown_content, source_url_or_id):
    output = tempfile.NamedTemporaryFile(mode='w+', delete=False, newline='', suffix=".csv", encoding="utf-8")
    writer = csv.writer(output)
    writer.writerow(["source", "content"])
    writer.writerow([source_url_or_id, markdown_content])
    output.close()
    return output.name

def save_output_to_file(content, output_format, source_url_or_id):
    """Saves content to a temporary file based on format and returns its path."""
    processed_content = content # Default for Markdown and Text
    
    if output_format == "JSON":
        suffix = ".json"
        processed_content = convert_to_json(content, source_url_or_id)
    elif output_format == "CSV":
        # convert_to_csv returns a path directly
        return convert_to_csv(content, source_url_or_id)
    elif output_format == "Text":
        suffix = ".txt"
    elif output_format == "PDF":
        suffix = ".pdf"
        # PDF conversion happens differently, creates file directly
        pdf_output_path = ""
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf_file:
                pdf_output_path = tmp_pdf_file.name
            
            md_pdf = markdown_pdf.MarkdownPdf(toc_level=2)
            md_pdf.convert_from_string(content, pdf_output_path)
            return pdf_output_path
        except Exception as e:
            print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
            suffix = ".pdf.md" 
            # No processed_content change needed, it's already markdown
    else: # Default to Markdown
        suffix = ".md"

    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
        tmp_file.write(processed_content)
        return tmp_file.name

def process_input_updated(url_or_id, source_type, depth, output_format_selection, progress=gr.Progress(track_tqdm=True)):
    progress(0, desc="Initializing...")
    raw_content = ""
    error_message = ""
    output_file_path = None

    if source_type == "GitHub Repository":
        if not check_repomix_installed():
            error_message = "Repomix is not installed or not accessible. Please ensure it's installed globally."
            return error_message, None, None
        raw_content, _ = run_repomix(url_or_id, progress=progress)
        if "Error" in raw_content:
            error_message = raw_content
            raw_content = ""
    elif source_type == "Webpage":
        raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
        if "Error" in raw_content:
            error_message = raw_content
            raw_content = ""
    else:
        error_message = "Invalid source type selected."
        return error_message, None, None

    if error_message:
        return error_message, None, None

    try:
        progress(0.9, desc=f"Converting to {output_format_selection}...")
        output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
        
        preview_content = raw_content 
        if output_format_selection == "JSON":
            preview_content = convert_to_json(raw_content, url_or_id)
        elif output_format_selection == "CSV" and output_file_path:
            try:
                with open(output_file_path, 'r', encoding='utf-8') as f_csv:
                    csv_preview_lines = [next(f_csv) for _ in range(5)]
                preview_content = "".join(csv_preview_lines)
                if not preview_content: preview_content = "[CSV content is empty or very short]"
            except StopIteration:
                with open(output_file_path, 'r', encoding='utf-8') as f_csv:
                    preview_content = f_csv.read()
                if not preview_content: preview_content = "[CSV content is empty]"
            except Exception as e_csv_preview:
                preview_content = f"[Error reading CSV for preview: {str(e_csv_preview)}]"
        elif output_format_selection == "CSV" and not output_file_path:
             preview_content = "[CSV file path not available for preview]"
        elif output_format_selection == "PDF":
            preview_content = f"[PDF generated. Download to view: {os.path.basename(output_file_path if output_file_path else 'file.pdf')}]"
            if "Saving as Markdown instead" in (output_file_path or ""): 
                 preview_content = raw_content + f"\n\n[Note: PDF conversion failed, showing Markdown. File saved as .pdf.md]"

        progress(1, desc="Processing complete.")
        return f"Successfully processed: {url_or_id}", preview_content, output_file_path
    except Exception as e:
        return f"Error during file conversion/saving: {str(e)}", raw_content, None

with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface:
    gr.Markdown("# RAG-Ready Content Scraper")
    gr.Markdown(
        "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
    )

    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Enter URL or GitHub Repository ID",
                placeholder="e.g., https://example.com OR username/repo"
            )
            source_type_input = gr.Radio(
                choices=["Webpage", "GitHub Repository"],
                value="Webpage",
                label="Select Source Type"
            )
            depth_input = gr.Slider(
                minimum=0, maximum=3, step=1, value=0,
                label="Scraping Depth (for Webpages)",
                info="0: Only main page. Ignored for GitHub repos."
            )
            output_format_input = gr.Dropdown(
                choices=["Markdown", "JSON", "CSV", "Text", "PDF"], 
                value="Markdown",
                label="Select Output Format"
            )
            submit_button = gr.Button("Process Content", variant="primary")
        
        with gr.Column(scale=3):
            status_output = gr.Textbox(label="Status", interactive=False)
            preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
            file_download_output = gr.File(label="Download Processed File", interactive=False)

    gr.Examples(
        examples=[
            ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
            ["gradio-app/gradio", "GitHub Repository", 0, "Text"], 
            ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
        ],
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
        fn=process_input_updated,
        cache_examples=False
    )
    
    with gr.Accordion("How it Works & More Info", open=False):
        gr.Markdown(
            """
            **Webpage Scraping:**
            1. Enter a full URL (e.g., `https://example.com`).
            2. Select "Webpage" as the source type.
            3. Set the desired scraping depth.
            4. Choose your output format.

            **GitHub Repository Processing:**
            1. Enter a GitHub repository URL or ID (e.g., `username/repo`).
            2. Select "GitHub Repository". (Depth is ignored).
            3. Choose your output format. Uses **RepoMix**.

            **Output Formats:** Markdown, JSON, CSV, Text, PDF.

            **Note:** PDF generation requires `markdown-pdf` library.
            This app is designed for Docker/HuggingFace Spaces.

            [View Source Code on HuggingFace Spaces](https://huggingface.co/spaces/CultriX/RAG-Scraper) 
            """
        )

    submit_button.click(
        fn=process_input_updated,
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
    )

if __name__ == "__main__":
    iface.launch()