Update app.py
app.py
CHANGED
@@ -1,9 +1,130 @@
 import gradio as gr
+import subprocess
+import os
+import re
+import tempfile
 from rag_scraper.scraper import Scraper
 from rag_scraper.converter import Converter
 from rag_scraper.link_extractor import LinkExtractor, LinkType
 from rag_scraper.utils import URLUtils
 
+def is_github_repo(url_or_id):
+    """Check if the input is a GitHub repository URL or ID."""
+    # Check for GitHub URL
+    if "github.com" in url_or_id:
+        return True
+
+    # Check for shorthand notation (username/repo)
+    if re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id):
+        return True
+
+    return False
+
+def extract_repo_info(url_or_id):
+    """Extract repository owner and name from URL or ID."""
+    # Handle GitHub URLs
+    github_url_pattern = r'github\.com/([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)'
+    match = re.search(github_url_pattern, url_or_id)
+    if match:
+        return match.group(1), match.group(2)
+
+    # Handle shorthand notation (username/repo)
+    if '/' in url_or_id and not url_or_id.startswith('http'):
+        parts = url_or_id.split('/')
+        if len(parts) == 2:
+            return parts[0], parts[1]
+
+    return None, None
+
+def is_running_on_huggingface():
+    """Check if the app is running on HuggingFace Spaces."""
+    return os.environ.get('SPACE_ID') is not None
+
+def check_repomix_installed():
+    """Check if Repomix is installed."""
+    # If running on HuggingFace Spaces, Repomix is likely not available
+    if is_running_on_huggingface():
+        return False
+
+    try:
+        result = subprocess.run(["npx", "repomix", "--version"],
+                                capture_output=True, text=True, check=False)
+        return result.returncode == 0
+    except Exception:
+        return False
+
+def run_repomix(repo_url_or_id, output_format="markdown"):
+    """Run Repomix on the GitHub repository and return the content."""
+    try:
+        # Create a temporary directory for the output
+        with tempfile.TemporaryDirectory() as temp_dir:
+            output_file = os.path.join(temp_dir, f"repomix-output.{output_format}")
+
+            # Prepare the command
+            if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
+                # Handle shorthand notation
+                repo_url = f"https://github.com/{repo_url_or_id}"
+            else:
+                repo_url = repo_url_or_id
+
+            # Run Repomix
+            cmd = [
+                "npx", "repomix",
+                "--remote", repo_url,
+                "--output", output_file,
+                "--style", output_format,
+                "--compress"  # Use compression for better token efficiency
+            ]
+
+            process = subprocess.run(cmd, capture_output=True, text=True, check=False)
+
+            if process.returncode != 0:
+                return f"Error running Repomix: {process.stderr}"
+
+            # Read the output file
+            if os.path.exists(output_file):
+                with open(output_file, 'r', encoding='utf-8') as f:
+                    return f.read()
+            else:
+                return f"Error: Repomix did not generate an output file."
+
+    except Exception as e:
+        return f"Error processing GitHub repository: {str(e)}"
+
+def process_input(url_or_id, depth, input_type="auto"):
+    """Process the input based on its type."""
+    try:
+        # Determine if this is a GitHub repository
+        is_github = is_github_repo(url_or_id) if input_type == "auto" else (input_type == "github")
+
+        if is_github:
+            # Check if running on HuggingFace Spaces
+            if is_running_on_huggingface():
+                return (
+                    "GitHub repository processing with Repomix is not available on HuggingFace Spaces. "
+                    "This feature requires Node.js and the ability to run npm/npx commands, "
+                    "which are typically not available in the HuggingFace Spaces environment.\n\n"
+                    "You can still use the web scraping functionality for regular websites, "
+                    "or run this application locally to use the Repomix feature."
+                )
+
+            # Check if Repomix is installed
+            if not check_repomix_installed():
+                return (
+                    "Repomix is not installed or not accessible. "
+                    "Please install it using: npm install -g repomix\n"
+                    "Or you can run it without installation using: npx repomix"
+                )
+
+            # Process GitHub repository with Repomix
+            return run_repomix(url_or_id, output_format="markdown")
+        else:
+            # Process regular URL with web scraping
+            return scrape_and_convert(url_or_id, depth)
+
+    except Exception as e:
+        return f"Error: {str(e)}"
+
 def scrape_and_convert(url, depth):
     """Fetch HTML content, extract links recursively (up to given depth), and convert to Markdown."""
     try:
@@ -51,17 +172,33 @@ def scrape_and_convert(url, depth):
 
 # Define Gradio interface
 iface = gr.Interface(
-    fn=scrape_and_convert,
+    fn=process_input,
     inputs=[
-        gr.Textbox(label="Enter URL
-
+        gr.Textbox(label="Enter URL or GitHub Repository",
+                   placeholder="https://example.com or username/repo"),
+        gr.Slider(minimum=0, maximum=3, step=1, value=0,
+                  label="Search Depth (0 = Only main page, ignored for GitHub repos)"),
+        gr.Radio(
+            choices=["auto", "website", "github"],
+            value="auto",
+            label="Input Type",
+            info="Auto will detect GitHub repos automatically"
+        )
     ],
-    outputs=gr.Code(label="
-    title="RAGScraper with
-    description=
+    outputs=gr.Code(label="Output", language="markdown"),
+    title="RAGScraper with GitHub Repository Support",
+    description=(
+        "Enter a URL to scrape a website, or a GitHub repository URL/ID (e.g., 'username/repo') "
+        "to use Repomix for repository processing. "
+        "For websites, you can specify the search depth for recursive scraping."
+    ),
+    examples=[
+        ["https://example.com", 0, "auto"],
+        ["yamadashy/repomix", 0, "auto"],
+        ["https://github.com/yamadashy/repomix", 0, "auto"]
+    ]
 )
 
 # Launch the Gradio app
 if __name__ == "__main__":
     iface.launch()
-
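
A quick way to review the new routing logic is a local smoke test. This is a minimal sketch, assuming app.py is importable from the working directory and the Space's Python dependencies (gradio, rag_scraper) are installed locally; the expected results follow directly from the patterns in is_github_repo and extract_repo_info above.

# Minimal smoke test for the new input-routing helpers (local run assumed).
from app import is_github_repo, extract_repo_info

# Full URLs and "owner/repo" shorthand are both detected as GitHub repos.
assert is_github_repo("https://github.com/yamadashy/repomix")
assert is_github_repo("yamadashy/repomix")

# Plain website URLs fall through to the scraping path.
assert not is_github_repo("https://example.com")

# Owner and repository name are recovered from either form.
assert extract_repo_info("https://github.com/yamadashy/repomix") == ("yamadashy", "repomix")
assert extract_repo_info("yamadashy/repomix") == ("yamadashy", "repomix")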
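The Spaces guard can be exercised without Node.js: is_running_on_huggingface() only checks the SPACE_ID environment variable, so setting it simulates the Spaces environment and the GitHub path returns the explanatory message instead of shelling out to npx. A sketch under the same local-import assumption as above; the SPACE_ID value is a hypothetical stand-in.

import os

# Simulate HuggingFace Spaces; "user/space" is a hypothetical stand-in value.
os.environ["SPACE_ID"] = "user/space"

from app import process_input

# With the guard active, a GitHub input yields the explanatory message
# rather than an npx invocation.
message = process_input("yamadashy/repomix", depth=0, input_type="github")
assert "not available on HuggingFace Spaces" in message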