CultriX committed on
Commit
1303e35
·
verified ·
1 Parent(s): c09533d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -7
app.py CHANGED
@@ -1,9 +1,130 @@
1
  import gradio as gr
 
 
 
 
2
  from rag_scraper.scraper import Scraper
3
  from rag_scraper.converter import Converter
4
  from rag_scraper.link_extractor import LinkExtractor, LinkType
5
  from rag_scraper.utils import URLUtils
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def scrape_and_convert(url, depth):
8
  """Fetch HTML content, extract links recursively (up to given depth), and convert to Markdown."""
9
  try:
@@ -51,17 +172,33 @@ def scrape_and_convert(url, depth):
51
 
52
  # Define Gradio interface
53
  iface = gr.Interface(
54
- fn=scrape_and_convert,
55
  inputs=[
56
- gr.Textbox(label="Enter URL", placeholder="https://example.com"),
57
- gr.Slider(minimum=0, maximum=3, step=1, value=0, label="Search Depth (0 = Only main page)")
 
 
 
 
 
 
 
 
58
  ],
59
- outputs=gr.Code(label="Markdown Output", language="markdown"),
60
- title="RAGScraper with Recursive Depth",
61
- description="Enter a URL and specify the search depth. The app will fetch, extract links, and convert HTML to Markdown."
 
 
 
 
 
 
 
 
 
62
  )
63
 
64
  # Launch the Gradio app
65
  if __name__ == "__main__":
66
  iface.launch()
67
-
 
1
  import gradio as gr
2
+ import subprocess
3
+ import os
4
+ import re
5
+ import tempfile
6
  from rag_scraper.scraper import Scraper
7
  from rag_scraper.converter import Converter
8
  from rag_scraper.link_extractor import LinkExtractor, LinkType
9
  from rag_scraper.utils import URLUtils
10
 
11
def is_github_repo(url_or_id):
    """Return True if *url_or_id* looks like a GitHub repository URL or 'owner/repo' ID."""
    # Full GitHub URLs are recognized by a simple substring test.
    if "github.com" in url_or_id:
        return True

    # Shorthand notation: exactly two word/dot/dash segments separated by one slash.
    return bool(re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id))
22
+
23
def extract_repo_info(url_or_id):
    """Return ``(owner, repo)`` parsed from a GitHub URL or 'owner/repo' ID, else ``(None, None)``."""
    # Full GitHub URL: pull owner and repo out of the path component.
    m = re.search(r'github\.com/([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)', url_or_id)
    if m:
        return m.group(1), m.group(2)

    # Shorthand 'owner/repo' (anything that is not an http(s) URL).
    if not url_or_id.startswith('http') and '/' in url_or_id:
        parts = url_or_id.split('/')
        if len(parts) == 2:
            owner, repo = parts
            return owner, repo

    return None, None
38
+
39
def is_running_on_huggingface():
    """Return True when running inside a HuggingFace Space."""
    # HuggingFace Spaces export SPACE_ID into the process environment.
    return 'SPACE_ID' in os.environ
42
+
43
def check_repomix_installed():
    """Return True if Repomix can be invoked via ``npx``, False otherwise.

    Always returns False on HuggingFace Spaces, where Node.js tooling is
    not available.
    """
    # Spaces have no npm/npx, so skip the (slow) probe entirely there.
    if is_running_on_huggingface():
        return False

    try:
        # A timeout bounds the probe: a stalled npx package download would
        # otherwise hang the whole app before the UI ever responds.
        result = subprocess.run(
            ["npx", "repomix", "--version"],
            capture_output=True, text=True, check=False, timeout=60,
        )
        return result.returncode == 0
    except (OSError, subprocess.SubprocessError):
        # npx binary missing entirely, or the probe timed out.
        return False
55
+
56
def run_repomix(repo_url_or_id, output_format="markdown"):
    """Run Repomix against a GitHub repository and return the packed content.

    Args:
        repo_url_or_id: Full GitHub URL or shorthand 'owner/repo' ID.
        output_format: Repomix output style (e.g. "markdown").

    Returns:
        The generated repository digest as a string, or a human-readable
        error message on failure (this function never raises).
    """
    try:
        # Temporary directory holds the output file and is cleaned up automatically.
        with tempfile.TemporaryDirectory() as temp_dir:
            output_file = os.path.join(temp_dir, f"repomix-output.{output_format}")

            # Expand shorthand 'owner/repo' into a full GitHub URL.
            if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
                repo_url = f"https://github.com/{repo_url_or_id}"
            else:
                repo_url = repo_url_or_id

            cmd = [
                "npx", "repomix",
                "--remote", repo_url,
                "--output", output_file,
                "--style", output_format,
                "--compress"  # Use compression for better token efficiency
            ]

            # Timeout so a hung clone/pack cannot block the UI indefinitely.
            process = subprocess.run(
                cmd, capture_output=True, text=True, check=False, timeout=600,
            )

            if process.returncode != 0:
                return f"Error running Repomix: {process.stderr}"

            # Read the generated output file, if Repomix produced one.
            if os.path.exists(output_file):
                with open(output_file, 'r', encoding='utf-8') as f:
                    return f.read()
            # Plain string (the original was an f-string with no placeholder).
            return "Error: Repomix did not generate an output file."

    except subprocess.TimeoutExpired:
        return "Error: Repomix timed out while processing the repository."
    except Exception as e:
        return f"Error processing GitHub repository: {str(e)}"
93
+
94
def process_input(url_or_id, depth, input_type="auto"):
    """Dispatch the input either to Repomix (GitHub repos) or the web scraper (URLs).

    Args:
        url_or_id: Website URL, GitHub URL, or shorthand 'owner/repo' ID.
        depth: Recursive scraping depth (ignored for GitHub repositories).
        input_type: "auto" to detect, or force "website" / "github".

    Returns:
        Markdown output or a human-readable error message string.
    """
    try:
        # Explicit selection wins; otherwise auto-detect GitHub inputs.
        if input_type == "auto":
            treat_as_github = is_github_repo(url_or_id)
        else:
            treat_as_github = input_type == "github"

        # Plain website: recursive scrape + Markdown conversion.
        if not treat_as_github:
            return scrape_and_convert(url_or_id, depth)

        # Guard: Repomix needs Node.js, unavailable on HuggingFace Spaces.
        if is_running_on_huggingface():
            return (
                "GitHub repository processing with Repomix is not available on HuggingFace Spaces. "
                "This feature requires Node.js and the ability to run npm/npx commands, "
                "which are typically not available in the HuggingFace Spaces environment.\n\n"
                "You can still use the web scraping functionality for regular websites, "
                "or run this application locally to use the Repomix feature."
            )

        # Guard: Repomix must be installed or reachable through npx.
        if not check_repomix_installed():
            return (
                "Repomix is not installed or not accessible. "
                "Please install it using: npm install -g repomix\n"
                "Or you can run it without installation using: npx repomix"
            )

        return run_repomix(url_or_id, output_format="markdown")

    except Exception as e:
        return f"Error: {str(e)}"
127
+
128
  def scrape_and_convert(url, depth):
129
  """Fetch HTML content, extract links recursively (up to given depth), and convert to Markdown."""
130
  try:
 
172
 
173
# Define Gradio interface.
# The three inputs map positionally onto process_input(url_or_id, depth, input_type).
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="Enter URL or GitHub Repository",
                   placeholder="https://example.com or username/repo"),
        # Depth applies only to website scraping; GitHub inputs ignore it.
        gr.Slider(minimum=0, maximum=3, step=1, value=0,
                  label="Search Depth (0 = Only main page, ignored for GitHub repos)"),
        # "auto" lets is_github_repo() decide; the other choices force a mode.
        gr.Radio(
            choices=["auto", "website", "github"],
            value="auto",
            label="Input Type",
            info="Auto will detect GitHub repos automatically"
        )
    ],
    outputs=gr.Code(label="Output", language="markdown"),
    title="RAGScraper with GitHub Repository Support",
    description=(
        "Enter a URL to scrape a website, or a GitHub repository URL/ID (e.g., 'username/repo') "
        "to use Repomix for repository processing. "
        "For websites, you can specify the search depth for recursive scraping."
    ),
    # Example rows follow the same (url_or_id, depth, input_type) order.
    examples=[
        ["https://example.com", 0, "auto"],
        ["yamadashy/repomix", 0, "auto"],
        ["https://github.com/yamadashy/repomix", 0, "auto"]
    ]
)
201
 
202
# Launch the Gradio app.
# Only start the local server when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()