Update app.py
app.py
CHANGED
@@ -1,9 +1,130 @@
 import gradio as gr
+import subprocess
+import os
+import re
+import tempfile
 from rag_scraper.scraper import Scraper
 from rag_scraper.converter import Converter
 from rag_scraper.link_extractor import LinkExtractor, LinkType
 from rag_scraper.utils import URLUtils
 
+def is_github_repo(url_or_id):
+    """Check if the input is a GitHub repository URL or ID."""
+    # Check for GitHub URL
+    if "github.com" in url_or_id:
+        return True
+
+    # Check for shorthand notation (username/repo)
+    if re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id):
+        return True
+
+    return False
+
+def extract_repo_info(url_or_id):
+    """Extract repository owner and name from URL or ID."""
+    # Handle GitHub URLs
+    github_url_pattern = r'github\.com/([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)'
+    match = re.search(github_url_pattern, url_or_id)
+    if match:
+        return match.group(1), match.group(2)
+
+    # Handle shorthand notation (username/repo)
+    if '/' in url_or_id and not url_or_id.startswith('http'):
+        parts = url_or_id.split('/')
+        if len(parts) == 2:
+            return parts[0], parts[1]
+
+    return None, None
+
+def is_running_on_huggingface():
+    """Check if the app is running on HuggingFace Spaces."""
+    return os.environ.get('SPACE_ID') is not None
+
+def check_repomix_installed():
+    """Check if Repomix is installed."""
+    # If running on HuggingFace Spaces, Repomix is likely not available
+    if is_running_on_huggingface():
+        return False
+
+    try:
+        result = subprocess.run(["npx", "repomix", "--version"],
+                                capture_output=True, text=True, check=False)
+        return result.returncode == 0
+    except Exception:
+        return False
+
+def run_repomix(repo_url_or_id, output_format="markdown"):
+    """Run Repomix on the GitHub repository and return the content."""
+    try:
+        # Create a temporary directory for the output
+        with tempfile.TemporaryDirectory() as temp_dir:
+            output_file = os.path.join(temp_dir, f"repomix-output.{output_format}")
+
+            # Prepare the command
+            if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
+                # Handle shorthand notation
+                repo_url = f"https://github.com/{repo_url_or_id}"
+            else:
+                repo_url = repo_url_or_id
+
+            # Run Repomix
+            cmd = [
+                "npx", "repomix",
+                "--remote", repo_url,
+                "--output", output_file,
+                "--style", output_format,
+                "--compress"  # Use compression for better token efficiency
+            ]
+
+            process = subprocess.run(cmd, capture_output=True, text=True, check=False)
+
+            if process.returncode != 0:
+                return f"Error running Repomix: {process.stderr}"
+
+            # Read the output file
+            if os.path.exists(output_file):
+                with open(output_file, 'r', encoding='utf-8') as f:
+                    return f.read()
+            else:
+                return f"Error: Repomix did not generate an output file."
+
+    except Exception as e:
+        return f"Error processing GitHub repository: {str(e)}"
+
+def process_input(url_or_id, depth, input_type="auto"):
+    """Process the input based on its type."""
+    try:
+        # Determine if this is a GitHub repository
+        is_github = is_github_repo(url_or_id) if input_type == "auto" else (input_type == "github")
+
+        if is_github:
+            # Check if running on HuggingFace Spaces
+            if is_running_on_huggingface():
+                return (
+                    "GitHub repository processing with Repomix is not available on HuggingFace Spaces. "
+                    "This feature requires Node.js and the ability to run npm/npx commands, "
+                    "which are typically not available in the HuggingFace Spaces environment.\n\n"
+                    "You can still use the web scraping functionality for regular websites, "
+                    "or run this application locally to use the Repomix feature."
+                )
+
+            # Check if Repomix is installed
+            if not check_repomix_installed():
+                return (
+                    "Repomix is not installed or not accessible. "
+                    "Please install it using: npm install -g repomix\n"
+                    "Or you can run it without installation using: npx repomix"
+                )
+
+            # Process GitHub repository with Repomix
+            return run_repomix(url_or_id, output_format="markdown")
+        else:
+            # Process regular URL with web scraping
+            return scrape_and_convert(url_or_id, depth)
+
+    except Exception as e:
+        return f"Error: {str(e)}"
+
 def scrape_and_convert(url, depth):
     """Fetch HTML content, extract links recursively (up to given depth), and convert to Markdown."""
     try:
@@ -51,17 +172,33 @@ def scrape_and_convert(url, depth):
 
 # Define Gradio interface
 iface = gr.Interface(
-    fn=scrape_and_convert,
+    fn=process_input,
     inputs=[
-        gr.Textbox(label="Enter URL
-
+        gr.Textbox(label="Enter URL or GitHub Repository",
+                   placeholder="https://example.com or username/repo"),
+        gr.Slider(minimum=0, maximum=3, step=1, value=0,
+                  label="Search Depth (0 = Only main page, ignored for GitHub repos)"),
+        gr.Radio(
+            choices=["auto", "website", "github"],
+            value="auto",
+            label="Input Type",
+            info="Auto will detect GitHub repos automatically"
+        )
     ],
-    outputs=gr.Code(label="
-    title="RAGScraper with
-    description=
+    outputs=gr.Code(label="Output", language="markdown"),
+    title="RAGScraper with GitHub Repository Support",
+    description=(
+        "Enter a URL to scrape a website, or a GitHub repository URL/ID (e.g., 'username/repo') "
+        "to use Repomix for repository processing. "
+        "For websites, you can specify the search depth for recursive scraping."
+    ),
+    examples=[
+        ["https://example.com", 0, "auto"],
+        ["yamadashy/repomix", 0, "auto"],
+        ["https://github.com/yamadashy/repomix", 0, "auto"]
+    ]
 )
 
 # Launch the Gradio app
 if __name__ == "__main__":
     iface.launch()
-
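
A quick way to review the new routing logic is a local smoke test. This is a minimal sketch, assuming app.py is importable from the working directory and the Space's Python dependencies (gradio, rag_scraper) are installed locally; the expected results follow directly from the patterns in is_github_repo and extract_repo_info above.

# Minimal smoke test for the new input-routing helpers (local run assumed).
from app import is_github_repo, extract_repo_info

# Full URLs and "owner/repo" shorthand are both detected as GitHub repos.
assert is_github_repo("https://github.com/yamadashy/repomix")
assert is_github_repo("yamadashy/repomix")

# Plain website URLs fall through to the scraping path.
assert not is_github_repo("https://example.com")

# Owner and repository name are recovered from either form.
assert extract_repo_info("https://github.com/yamadashy/repomix") == ("yamadashy", "repomix")
assert extract_repo_info("yamadashy/repomix") == ("yamadashy", "repomix")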
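The Spaces guard can be exercised without Node.js: is_running_on_huggingface() only checks the SPACE_ID environment variable, so setting it simulates the Spaces environment and the GitHub path returns the explanatory message instead of shelling out to npx. A sketch under the same local-import assumption as above; the SPACE_ID value is a hypothetical stand-in.

import os

# Simulate HuggingFace Spaces; "user/space" is a hypothetical stand-in value.
os.environ["SPACE_ID"] = "user/space"

from app import process_input

# With the guard active, a GitHub input yields the explanatory message
# rather than an npx invocation.
message = process_input("yamadashy/repomix", depth=0, input_type="github")
assert "not available on HuggingFace Spaces" in message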