Spaces:

CultriX
/

RAG-Scraper

Running

App Files Files Community

CultriX committed on May 29

Commit

d82ab96

1 Parent(s): 32f722f

feat: Overhaul WebUI, add PDF/Text export, use Poetry in Docker

Browse files

Files changed (1) hide show

app.py +10 -98

app.py CHANGED Viewed

@@ -6,95 +6,16 @@ import re
 import tempfile
 import json
 import csv
-from typing import Iterable # Added for Theme
 from rag_scraper.scraper import Scraper
 from rag_scraper.converter import Converter
 from rag_scraper.link_extractor import LinkExtractor, LinkType
 from rag_scraper.utils import URLUtils
-from gradio.themes.base import Base # Added for Theme
-from gradio.themes.utils import colors, fonts, sizes # Added for Theme
 import markdown_pdf # Added for PDF conversion
-# --- Custom Theme Definition ---
-class Seafoam(Base):
-    def __init__(
-        self,
-        *,
-        primary_hue: colors.Color | str = colors.teal,
-        secondary_hue: colors.Color | str = colors.cyan,
-        neutral_hue: colors.Color | str = colors.gray,
-        spacing_size: sizes.Size | str = sizes.spacing_md,
-        radius_size: sizes.Size | str = sizes.radius_md,
-        text_size: sizes.Size | str = sizes.text_md, # Adjusted from lg for a more professional feel
-        font: fonts.Font
-        | str
-        | Iterable[fonts.Font | str] = (
-            fonts.GoogleFont("Inter"), # Modern sans-serif
-            "ui-sans-serif",
-            "system-ui",
-            "sans-serif",
-        ),
-        font_mono: fonts.Font
-        | str
-        | Iterable[fonts.Font | str] = (
-            fonts.GoogleFont("IBM Plex Mono"),
-            "ui-monospace",
-            "monospace",
-        ),
-    ):
-        super().__init__(
-            primary_hue=primary_hue,
-            secondary_hue=secondary_hue,
-            neutral_hue=neutral_hue,
-            spacing_size=spacing_size,
-            radius_size=radius_size,
-            text_size=text_size,
-            font=font,
-            font_mono=font_mono,
-        )
-        # Dark Mode First
-        super().set(
-            # Core Colors
-            body_background_fill_dark="black", # True black
-            body_text_color_dark="*neutral_100",
-            block_background_fill_dark=colors.gray_900,
-            block_border_color_dark=colors.gray_700,
-            block_label_background_fill_dark=colors.gray_800,
-            block_label_text_color_dark="*neutral_100",
-            input_background_fill_dark=colors.gray_800,
-            input_border_color_dark=colors.gray_600,
-            input_text_color_dark="*neutral_50",
-            button_primary_background_fill_dark=colors.teal_600,
-            button_primary_background_fill_hover_dark=colors.teal_500,
-            button_primary_text_color_dark="white",
-            button_secondary_background_fill_dark=colors.gray_700,
-            button_secondary_background_fill_hover_dark=colors.gray_600,
-            button_secondary_text_color_dark="white",
-            slider_color_dark=colors.teal_500,
-            # Light Mode
-            body_background_fill="white",
-            body_text_color=colors.gray_800,
-            block_background_fill="*neutral_50",
-            block_border_color=colors.gray_300,
-            block_label_background_fill="*neutral_100",
-            block_label_text_color=colors.gray_700,
-            input_background_fill=colors.white,
-            input_border_color=colors.gray_300,
-            input_text_color=colors.gray_900,
-            button_primary_background_fill=colors.teal_500,
-            button_primary_background_fill_hover=colors.teal_600,
-            button_primary_text_color="white",
-            button_secondary_background_fill="*neutral_100",
-            button_secondary_background_fill_hover=colors.gray_300,
-            button_secondary_text_color=colors.gray_800,
-            slider_color=colors.teal_500,
-            # General
-            block_title_text_weight="600",
-            block_shadow="*shadow_drop_lg",
-            button_shadow="*shadow_drop"
-        )
-seafoam_theme = Seafoam()
 def is_github_repo(url_or_id):
     """Check if the input is a GitHub repository URL or ID."""
@@ -238,22 +159,16 @@ def save_output_to_file(content, output_format, source_url_or_id):
             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf_file:
                 pdf_output_path = tmp_pdf_file.name
-            # Basic PDF conversion from Markdown string
-            # You might need to install a library like `markdown-pdf` or `WeasyPrint`
-            # Example using markdown_pdf (ensure it's installed: pip install markdown-pdf)
             md_pdf = markdown_pdf.MarkdownPdf(toc_level=2)
-            # md_pdf.meta["css"] = "your_custom_css_path.css" # Optional: for styling
             md_pdf.convert_from_string(content, pdf_output_path)
             return pdf_output_path
         except Exception as e:
-            # Fallback: save as markdown with .pdf.md suffix if PDF fails
             print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
-            suffix = ".pdf.md" # Indicate it's markdown intended for PDF
             # No processed_content change needed, it's already markdown
     else: # Default to Markdown
         suffix = ".md"
-    # For formats that don't return early (JSON, Text, Markdown, PDF fallback)
     with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
         tmp_file.write(processed_content)
         return tmp_file.name
@@ -288,7 +203,7 @@ def process_input_updated(url_or_id, source_type, depth, output_format_selection
         progress(0.9, desc=f"Converting to {output_format_selection}...")
         output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
-        preview_content = raw_content # Default for Markdown, Text
         if output_format_selection == "JSON":
             preview_content = convert_to_json(raw_content, url_or_id)
         elif output_format_selection == "CSV" and output_file_path:
@@ -307,16 +222,15 @@ def process_input_updated(url_or_id, source_type, depth, output_format_selection
              preview_content = "[CSV file path not available for preview]"
         elif output_format_selection == "PDF":
             preview_content = f"[PDF generated. Download to view: {os.path.basename(output_file_path if output_file_path else 'file.pdf')}]"
-            if "Saving as Markdown instead" in (output_file_path or ""): # Check if PDF failed
                  preview_content = raw_content + f"\n\n[Note: PDF conversion failed, showing Markdown. File saved as .pdf.md]"
         progress(1, desc="Processing complete.")
         return f"Successfully processed: {url_or_id}", preview_content, output_file_path
     except Exception as e:
         return f"Error during file conversion/saving: {str(e)}", raw_content, None
-with gr.Blocks(theme=seafoam_theme) as iface: # Applied custom theme
     gr.Markdown("# RAG-Ready Content Scraper")
     gr.Markdown(
         "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
@@ -339,7 +253,7 @@ with gr.Blocks(theme=seafoam_theme) as iface: # Applied custom theme
                 info="0: Only main page. Ignored for GitHub repos."
             )
             output_format_input = gr.Dropdown(
-                choices=["Markdown", "JSON", "CSV", "Text", "PDF"], # Added Text and PDF
                 value="Markdown",
                 label="Select Output Format"
             )
@@ -350,12 +264,10 @@ with gr.Blocks(theme=seafoam_theme) as iface: # Applied custom theme
             preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
             file_download_output = gr.File(label="Download Processed File", interactive=False)
-    # Removed progress_bar = gr.Progress(track_tqdm=True) as it's passed directly
     gr.Examples(
         examples=[
             ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
-            ["gradio-app/gradio", "GitHub Repository", 0, "Text"], # Changed to Text
             ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
         ],
         inputs=[url_input, source_type_input, depth_input, output_format_input],

 import tempfile
 import json
 import csv
+# Removed: from typing import Iterable # Added for Theme
 from rag_scraper.scraper import Scraper
 from rag_scraper.converter import Converter
 from rag_scraper.link_extractor import LinkExtractor, LinkType
 from rag_scraper.utils import URLUtils
+# Removed: from gradio.themes.base import Base # Added for Theme
+# Removed: from gradio.themes.utils import colors, fonts, sizes # Added for Theme
 import markdown_pdf # Added for PDF conversion
+# --- Custom Theme Definition --- (REMOVED Seafoam class and instance)
 def is_github_repo(url_or_id):
     """Check if the input is a GitHub repository URL or ID."""
             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf_file:
                 pdf_output_path = tmp_pdf_file.name
             md_pdf = markdown_pdf.MarkdownPdf(toc_level=2)
             md_pdf.convert_from_string(content, pdf_output_path)
             return pdf_output_path
         except Exception as e:
             print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
+            suffix = ".pdf.md"
             # No processed_content change needed, it's already markdown
     else: # Default to Markdown
         suffix = ".md"
     with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
         tmp_file.write(processed_content)
         return tmp_file.name
         progress(0.9, desc=f"Converting to {output_format_selection}...")
         output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
+        preview_content = raw_content
         if output_format_selection == "JSON":
             preview_content = convert_to_json(raw_content, url_or_id)
         elif output_format_selection == "CSV" and output_file_path:
              preview_content = "[CSV file path not available for preview]"
         elif output_format_selection == "PDF":
             preview_content = f"[PDF generated. Download to view: {os.path.basename(output_file_path if output_file_path else 'file.pdf')}]"
+            if "Saving as Markdown instead" in (output_file_path or ""):
                  preview_content = raw_content + f"\n\n[Note: PDF conversion failed, showing Markdown. File saved as .pdf.md]"
         progress(1, desc="Processing complete.")
         return f"Successfully processed: {url_or_id}", preview_content, output_file_path
     except Exception as e:
         return f"Error during file conversion/saving: {str(e)}", raw_content, None
+with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface:
     gr.Markdown("# RAG-Ready Content Scraper")
     gr.Markdown(
         "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
                 info="0: Only main page. Ignored for GitHub repos."
             )
             output_format_input = gr.Dropdown(
+                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
                 value="Markdown",
                 label="Select Output Format"
             )
             preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
             file_download_output = gr.File(label="Download Processed File", interactive=False)
     gr.Examples(
         examples=[
             ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
+            ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
             ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
         ],
         inputs=[url_input, source_type_input, depth_input, output_format_input],