CultriX committed on
Commit
2471025
·
1 Parent(s): 2c85e25

feat: Overhaul WebUI, add PDF/Text export, use Poetry in Docker

Browse files
Files changed (3) hide show
  1. Dockerfile +15 -5
  2. app.py +153 -82
  3. pyproject.toml +1 -0
Dockerfile CHANGED
@@ -4,11 +4,12 @@ FROM python:3.10-slim
4
  # Set the working directory in the container
5
  WORKDIR /app
6
 
7
- # Install system dependencies for Node.js installation and Git
8
  RUN apt-get update && apt-get install -y \
9
  curl \
10
  gnupg \
11
  git \
 
12
  && rm -rf /var/lib/apt/lists/*
13
 
14
  # Add Node.js LTS repository and install Node.js and npm
@@ -18,11 +19,20 @@ RUN curl -fsSL https://deb.nodesource.com/setup_lts.x | bash - \
18
  # Install repomix globally using npm
19
  RUN npm install -g repomix
20
 
21
- # Copy the requirements file into the container
22
- COPY requirements.txt .
23
 
24
- # Install any needed packages specified in requirements.txt
25
- RUN pip install --no-cache-dir -r requirements.txt
 
 
 
 
 
 
 
 
 
26
 
27
  # Copy the rest of the application code into the container
28
  COPY . .
 
4
  # Set the working directory in the container
5
  WORKDIR /app
6
 
7
+ # Install system dependencies for Node.js installation, Git, and wkhtmltopdf (for PDF generation)
8
  RUN apt-get update && apt-get install -y \
9
  curl \
10
  gnupg \
11
  git \
12
+ wkhtmltopdf \
13
  && rm -rf /var/lib/apt/lists/*
14
 
15
  # Add Node.js LTS repository and install Node.js and npm
 
19
  # Install repomix globally using npm
20
  RUN npm install -g repomix
21
 
22
+ # Install Poetry
23
+ RUN curl -sSL https://install.python-poetry.org | python3 -
24
 
25
+ # Add Poetry to PATH
26
+ ENV PATH="/root/.local/bin:$PATH"
27
+
28
+ # Configure Poetry to not create virtual environments
29
+ RUN poetry config virtualenvs.create false
30
+
31
+ # Copy poetry.lock and pyproject.toml
32
+ COPY poetry.lock pyproject.toml /app/
33
+
34
+ # Install project dependencies using Poetry
35
+ RUN poetry install --no-root --no-dev --no-interaction --no-ansi
36
 
37
  # Copy the rest of the application code into the container
38
  COPY . .
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import gradio as gr
2
  import subprocess
3
  import os
@@ -5,10 +6,95 @@ import re
5
  import tempfile
6
  import json
7
  import csv
 
8
  from rag_scraper.scraper import Scraper
9
  from rag_scraper.converter import Converter
10
  from rag_scraper.link_extractor import LinkExtractor, LinkType
11
  from rag_scraper.utils import URLUtils
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  def is_github_repo(url_or_id):
14
  """Check if the input is a GitHub repository URL or ID."""
@@ -32,11 +118,7 @@ def run_repomix(repo_url_or_id, progress=gr.Progress(track_tqdm=True)):
32
  progress(0, desc="Starting Repomix processing...")
33
  try:
34
  with tempfile.TemporaryDirectory() as temp_dir:
35
- # RepoMix typically outputs a zip file if not specifying a single output style,
36
- # or a specific file if --style is used.
37
- # For simplicity, let's assume we want markdown and it outputs to a known file or stdout.
38
- # The current repomix command in the original script uses --style markdown and --output.
39
- output_file_name = "repomix-output.md" # Assuming markdown output
40
  output_file_path = os.path.join(temp_dir, output_file_name)
41
 
42
  if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
@@ -48,12 +130,12 @@ def run_repomix(repo_url_or_id, progress=gr.Progress(track_tqdm=True)):
48
  cmd = [
49
  "repomix",
50
  "--remote", repo_url,
51
- "--output", output_file_path, # Direct output to a file
52
- "--style", "markdown", # Explicitly request markdown
53
  "--compress"
54
  ]
55
 
56
- process = subprocess.run(cmd, capture_output=True, text=True, check=False, encoding='utf-8') # Added encoding
57
  progress(0.8, desc="Repomix command executed.")
58
 
59
  if process.returncode != 0:
@@ -64,7 +146,7 @@ def run_repomix(repo_url_or_id, progress=gr.Progress(track_tqdm=True)):
64
  with open(output_file_path, 'r', encoding='utf-8') as f:
65
  content = f.read()
66
  progress(1, desc="Repomix output processed.")
67
- return content, output_file_path # Return content and path for potential download
68
  else:
69
  error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
70
  return f"Error: Repomix did not generate an output file at '{output_file_path}'.\nRepomix Output:\n{error_details}", None
@@ -105,7 +187,6 @@ def scrape_and_convert_website(url, depth, progress=gr.Progress(track_tqdm=True)
105
  if current_depth > 0:
106
  try:
107
  links = LinkExtractor.scrape_url(current_url, link_type=LinkType.INTERNAL)
108
- # Filter out already visited links and external links more carefully
109
  valid_links = [
110
  link for link in links
111
  if URLUtils.is_internal(link, current_url) and link not in visited_urls
@@ -121,53 +202,63 @@ def scrape_and_convert_website(url, depth, progress=gr.Progress(track_tqdm=True)
121
  all_markdown_content = recursive_scrape(url, depth)
122
  progress(1, desc="Web scraping complete.")
123
 
124
- # For web scraping, we create a temporary file with the content for download
125
  with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp_file:
126
  tmp_file.write(all_markdown_content)
127
  return all_markdown_content, tmp_file.name
128
 
129
-
130
- # --- Data Conversion Functions (Stubs for now) ---
131
  def convert_to_json(markdown_content, source_url_or_id):
132
- """Converts markdown content to a JSON string."""
133
- # Basic implementation: create a JSON object with source and content
134
- # More sophisticated parsing can be added later
135
  data = {"source": source_url_or_id, "content": markdown_content}
136
  return json.dumps(data, indent=2)
137
 
138
  def convert_to_csv(markdown_content, source_url_or_id):
139
- """Converts markdown content to a CSV string."""
140
- # Basic implementation: create a CSV with source and content
141
- # This is a simplified CSV; real CSVs might need more structure
142
  output = tempfile.NamedTemporaryFile(mode='w+', delete=False, newline='', suffix=".csv", encoding="utf-8")
143
  writer = csv.writer(output)
144
- writer.writerow(["source", "content"]) # Header
145
-
146
- # Split content into manageable chunks or lines if necessary for CSV
147
- # For now, putting all content in one cell.
148
  writer.writerow([source_url_or_id, markdown_content])
149
  output.close()
150
- return output.name # Return path to the CSV file
151
 
152
  def save_output_to_file(content, output_format, source_url_or_id):
153
  """Saves content to a temporary file based on format and returns its path."""
154
- suffix = f".{output_format.lower()}"
 
155
  if output_format == "JSON":
 
156
  processed_content = convert_to_json(content, source_url_or_id)
157
  elif output_format == "CSV":
158
- # convert_to_csv now returns a path directly
159
  return convert_to_csv(content, source_url_or_id)
160
- else: # Markdown/Text
161
- processed_content = content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  suffix = ".md"
163
 
 
164
  with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
165
  tmp_file.write(processed_content)
166
  return tmp_file.name
167
 
168
- # --- Main Processing Function ---
169
  def process_input_updated(url_or_id, source_type, depth, output_format_selection, progress=gr.Progress(track_tqdm=True)):
170
- """Main function to process URL or GitHub repo based on selected type and format."""
171
  progress(0, desc="Initializing...")
172
  raw_content = ""
173
  error_message = ""
@@ -175,17 +266,15 @@ def process_input_updated(url_or_id, source_type, depth, output_format_selection
175
 
176
  if source_type == "GitHub Repository":
177
  if not check_repomix_installed():
178
- error_message = "Repomix is not installed or not accessible. Please ensure it's installed globally in your Docker environment."
179
- return error_message, None, None # Text output, Preview, File output
180
-
181
- raw_content, _ = run_repomix(url_or_id, progress=progress) # Repomix returns content and its original path
182
- if "Error" in raw_content: # Simple error check
183
  error_message = raw_content
184
  raw_content = ""
185
-
186
  elif source_type == "Webpage":
187
  raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
188
- if "Error" in raw_content: # Simple error check
189
  error_message = raw_content
190
  raw_content = ""
191
  else:
@@ -193,54 +282,44 @@ def process_input_updated(url_or_id, source_type, depth, output_format_selection
193
  return error_message, None, None
194
 
195
  if error_message:
196
- print(f"Error before file generation: {error_message}") # DEBUGGING
197
- return error_message, None, None # Error text, no preview, no file
198
 
199
- # Save raw_content (which is markdown) to a file of the chosen output_format
200
- # This will handle conversion if necessary
201
  try:
202
  progress(0.9, desc=f"Converting to {output_format_selection}...")
203
  output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
204
 
205
- # For preview, we'll show the raw markdown, or a snippet of JSON/CSV
206
- preview_content = raw_content # Default to markdown
207
  if output_format_selection == "JSON":
208
  preview_content = convert_to_json(raw_content, url_or_id)
209
  elif output_format_selection == "CSV" and output_file_path:
210
  try:
211
  with open(output_file_path, 'r', encoding='utf-8') as f_csv:
212
- # Read the first 5 lines for preview
213
  csv_preview_lines = [next(f_csv) for _ in range(5)]
214
  preview_content = "".join(csv_preview_lines)
215
- if not preview_content: # Handle empty or very short CSV
216
- preview_content = "[CSV content is empty or very short]"
217
- except StopIteration: # Handle files with less than 5 lines
218
- # If StopIteration occurs, it means we've read all lines.
219
- # We need to re-open and read all lines if it was less than 5.
220
  with open(output_file_path, 'r', encoding='utf-8') as f_csv:
221
  preview_content = f_csv.read()
222
- if not preview_content:
223
- preview_content = "[CSV content is empty]"
224
  except Exception as e_csv_preview:
225
  preview_content = f"[Error reading CSV for preview: {str(e_csv_preview)}]"
226
  elif output_format_selection == "CSV" and not output_file_path:
227
  preview_content = "[CSV file path not available for preview]"
 
 
 
 
228
 
229
 
230
- print(f"Generated output file path for download: {output_file_path}") # DEBUGGING
231
  progress(1, desc="Processing complete.")
232
  return f"Successfully processed: {url_or_id}", preview_content, output_file_path
233
  except Exception as e:
234
- print(f"Exception during file conversion/saving: {str(e)}") # DEBUGGING
235
  return f"Error during file conversion/saving: {str(e)}", raw_content, None
236
 
237
-
238
- # --- Gradio Interface Definition ---
239
- with gr.Blocks(theme=gr.themes.Soft()) as iface:
240
  gr.Markdown("# RAG-Ready Content Scraper")
241
  gr.Markdown(
242
- "Scrape webpage content (using RAG-scraper) or GitHub repositories (using RepoMix) "
243
- "to generate RAG-ready datasets. Uses Docker for full functionality on HuggingFace Spaces."
244
  )
245
 
246
  with gr.Row():
@@ -260,7 +339,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
260
  info="0: Only main page. Ignored for GitHub repos."
261
  )
262
  output_format_input = gr.Dropdown(
263
- choices=["Markdown", "JSON", "CSV"], # Markdown is like text file
264
  value="Markdown",
265
  label="Select Output Format"
266
  )
@@ -268,58 +347,50 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
268
 
269
  with gr.Column(scale=3):
270
  status_output = gr.Textbox(label="Status", interactive=False)
271
- preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False) # Default to markdown, can show JSON too
272
  file_download_output = gr.File(label="Download Processed File", interactive=False)
273
 
274
- progress_bar = gr.Progress(track_tqdm=True)
275
 
276
- # --- Examples ---
277
  gr.Examples(
278
  examples=[
279
  ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
280
- ["gradio-app/gradio", "GitHub Repository", 0, "Markdown"],
281
  ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
282
  ],
283
  inputs=[url_input, source_type_input, depth_input, output_format_input],
284
- outputs=[status_output, preview_output, file_download_output], # Function needs to match this
285
- fn=process_input_updated, # Make sure the function signature matches
286
- cache_examples=False # For development, disable caching
287
  )
288
 
289
- # --- How it Works & GitHub Link ---
290
  with gr.Accordion("How it Works & More Info", open=False):
291
  gr.Markdown(
292
  """
293
  **Webpage Scraping:**
294
  1. Enter a full URL (e.g., `https://example.com`).
295
  2. Select "Webpage" as the source type.
296
- 3. Set the desired scraping depth (how many levels of internal links to follow).
297
  4. Choose your output format.
298
- 5. The tool fetches HTML, converts it to Markdown, and follows internal links up to the specified depth.
299
 
300
  **GitHub Repository Processing:**
301
- 1. Enter a GitHub repository URL (e.g., `https://github.com/username/repo`) or shorthand ID (e.g., `username/repo`).
302
- 2. Select "GitHub Repository" as the source type. (Scraping depth is ignored).
303
- 3. Choose your output format.
304
- 4. The tool uses **RepoMix** to fetch and process the repository into a structured Markdown format.
305
 
306
- **Output Formats:**
307
- - **Markdown:** Plain text Markdown file, suitable for direct reading or further processing.
308
- - **JSON:** Structured JSON output, typically with fields like `source` and `content`.
309
- - **CSV:** Comma-Separated Values file, useful for tabular data or importing into spreadsheets.
310
 
311
- **Note on HuggingFace Spaces:** This application is designed to run in a Docker-based HuggingFace Space,
312
- which allows the use of `RepoMix` for GitHub repositories.
313
 
314
- [View Source Code on HuggingFace Spaces](https://huggingface.co/spaces/CultriX/RAG-Scraper)
315
  """
316
  )
317
 
318
  submit_button.click(
319
  fn=process_input_updated,
320
- inputs=[url_input, source_type_input, depth_input, output_format_input], # Removed progress_bar
321
  outputs=[status_output, preview_output, file_download_output],
322
- # The progress instance is passed to the function via its signature's default or if explicitly managed
323
  )
324
 
325
  if __name__ == "__main__":
 
1
+ from __future__ import annotations
2
  import gradio as gr
3
  import subprocess
4
  import os
 
6
  import tempfile
7
  import json
8
  import csv
9
+ from typing import Iterable # Added for Theme
10
  from rag_scraper.scraper import Scraper
11
  from rag_scraper.converter import Converter
12
  from rag_scraper.link_extractor import LinkExtractor, LinkType
13
  from rag_scraper.utils import URLUtils
14
+ from gradio.themes.base import Base # Added for Theme
15
+ from gradio.themes.utils import colors, fonts, sizes # Added for Theme
16
+ import markdown_pdf # Added for PDF conversion
17
+
18
+ # --- Custom Theme Definition ---
19
+ class Seafoam(Base):
20
+ def __init__(
21
+ self,
22
+ *,
23
+ primary_hue: colors.Color | str = colors.teal,
24
+ secondary_hue: colors.Color | str = colors.cyan,
25
+ neutral_hue: colors.Color | str = colors.gray,
26
+ spacing_size: sizes.Size | str = sizes.spacing_md,
27
+ radius_size: sizes.Size | str = sizes.radius_md,
28
+ text_size: sizes.Size | str = sizes.text_md, # Adjusted from lg for a more professional feel
29
+ font: fonts.Font
30
+ | str
31
+ | Iterable[fonts.Font | str] = (
32
+ fonts.GoogleFont("Inter"), # Modern sans-serif
33
+ "ui-sans-serif",
34
+ "system-ui",
35
+ "sans-serif",
36
+ ),
37
+ font_mono: fonts.Font
38
+ | str
39
+ | Iterable[fonts.Font | str] = (
40
+ fonts.GoogleFont("IBM Plex Mono"),
41
+ "ui-monospace",
42
+ "monospace",
43
+ ),
44
+ ):
45
+ super().__init__(
46
+ primary_hue=primary_hue,
47
+ secondary_hue=secondary_hue,
48
+ neutral_hue=neutral_hue,
49
+ spacing_size=spacing_size,
50
+ radius_size=radius_size,
51
+ text_size=text_size,
52
+ font=font,
53
+ font_mono=font_mono,
54
+ )
55
+ # Dark Mode First
56
+ super().set(
57
+ # Core Colors
58
+ body_background_fill_dark="black", # True black
59
+ body_text_color_dark=colors.gray_200,
60
+ block_background_fill_dark=colors.gray_900,
61
+ block_border_color_dark=colors.gray_700,
62
+ block_label_background_fill_dark=colors.gray_800,
63
+ block_label_text_color_dark=colors.gray_200,
64
+ input_background_fill_dark=colors.gray_800,
65
+ input_border_color_dark=colors.gray_600,
66
+ input_text_color_dark=colors.gray_50,
67
+ button_primary_background_fill_dark=colors.teal_600,
68
+ button_primary_background_fill_hover_dark=colors.teal_500,
69
+ button_primary_text_color_dark="white",
70
+ button_secondary_background_fill_dark=colors.gray_700,
71
+ button_secondary_background_fill_hover_dark=colors.gray_600,
72
+ button_secondary_text_color_dark="white",
73
+ slider_color_dark=colors.teal_500,
74
+ # Light Mode
75
+ body_background_fill="white",
76
+ body_text_color=colors.gray_800,
77
+ block_background_fill=colors.gray_50,
78
+ block_border_color=colors.gray_300,
79
+ block_label_background_fill=colors.gray_200,
80
+ block_label_text_color=colors.gray_700,
81
+ input_background_fill=colors.white,
82
+ input_border_color=colors.gray_300,
83
+ input_text_color=colors.gray_900,
84
+ button_primary_background_fill=colors.teal_500,
85
+ button_primary_background_fill_hover=colors.teal_600,
86
+ button_primary_text_color="white",
87
+ button_secondary_background_fill=colors.gray_200,
88
+ button_secondary_background_fill_hover=colors.gray_300,
89
+ button_secondary_text_color=colors.gray_800,
90
+ slider_color=colors.teal_500,
91
+ # General
92
+ block_title_text_weight="600",
93
+ block_shadow="*shadow_drop_lg",
94
+ button_shadow="*shadow_drop"
95
+ )
96
+
97
+ seafoam_theme = Seafoam()
98
 
99
  def is_github_repo(url_or_id):
100
  """Check if the input is a GitHub repository URL or ID."""
 
118
  progress(0, desc="Starting Repomix processing...")
119
  try:
120
  with tempfile.TemporaryDirectory() as temp_dir:
121
+ output_file_name = "repomix-output.md"
 
 
 
 
122
  output_file_path = os.path.join(temp_dir, output_file_name)
123
 
124
  if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
 
130
  cmd = [
131
  "repomix",
132
  "--remote", repo_url,
133
+ "--output", output_file_path,
134
+ "--style", "markdown",
135
  "--compress"
136
  ]
137
 
138
+ process = subprocess.run(cmd, capture_output=True, text=True, check=False, encoding='utf-8')
139
  progress(0.8, desc="Repomix command executed.")
140
 
141
  if process.returncode != 0:
 
146
  with open(output_file_path, 'r', encoding='utf-8') as f:
147
  content = f.read()
148
  progress(1, desc="Repomix output processed.")
149
+ return content, output_file_path
150
  else:
151
  error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
152
  return f"Error: Repomix did not generate an output file at '{output_file_path}'.\nRepomix Output:\n{error_details}", None
 
187
  if current_depth > 0:
188
  try:
189
  links = LinkExtractor.scrape_url(current_url, link_type=LinkType.INTERNAL)
 
190
  valid_links = [
191
  link for link in links
192
  if URLUtils.is_internal(link, current_url) and link not in visited_urls
 
202
  all_markdown_content = recursive_scrape(url, depth)
203
  progress(1, desc="Web scraping complete.")
204
 
 
205
  with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp_file:
206
  tmp_file.write(all_markdown_content)
207
  return all_markdown_content, tmp_file.name
208
 
 
 
209
  def convert_to_json(markdown_content, source_url_or_id):
 
 
 
210
  data = {"source": source_url_or_id, "content": markdown_content}
211
  return json.dumps(data, indent=2)
212
 
213
  def convert_to_csv(markdown_content, source_url_or_id):
 
 
 
214
  output = tempfile.NamedTemporaryFile(mode='w+', delete=False, newline='', suffix=".csv", encoding="utf-8")
215
  writer = csv.writer(output)
216
+ writer.writerow(["source", "content"])
 
 
 
217
  writer.writerow([source_url_or_id, markdown_content])
218
  output.close()
219
+ return output.name
220
 
221
  def save_output_to_file(content, output_format, source_url_or_id):
222
  """Saves content to a temporary file based on format and returns its path."""
223
+ processed_content = content # Default for Markdown and Text
224
+
225
  if output_format == "JSON":
226
+ suffix = ".json"
227
  processed_content = convert_to_json(content, source_url_or_id)
228
  elif output_format == "CSV":
229
+ # convert_to_csv returns a path directly
230
  return convert_to_csv(content, source_url_or_id)
231
+ elif output_format == "Text":
232
+ suffix = ".txt"
233
+ elif output_format == "PDF":
234
+ suffix = ".pdf"
235
+ # PDF conversion happens differently, creates file directly
236
+ pdf_output_path = ""
237
+ try:
238
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf_file:
239
+ pdf_output_path = tmp_pdf_file.name
240
+
241
+ # Basic PDF conversion from Markdown string
242
+ # You might need to install a library like `markdown-pdf` or `WeasyPrint`
243
+ # Example using markdown_pdf (ensure it's installed: pip install markdown-pdf)
244
+ md_pdf = markdown_pdf.MarkdownPdf(toc_level=2)
245
+ # md_pdf.meta["css"] = "your_custom_css_path.css" # Optional: for styling
246
+ md_pdf.convert_from_string(content, pdf_output_path)
247
+ return pdf_output_path
248
+ except Exception as e:
249
+ # Fallback: save as markdown with .pdf.md suffix if PDF fails
250
+ print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
251
+ suffix = ".pdf.md" # Indicate it's markdown intended for PDF
252
+ # No processed_content change needed, it's already markdown
253
+ else: # Default to Markdown
254
  suffix = ".md"
255
 
256
+ # For formats that don't return early (JSON, Text, Markdown, PDF fallback)
257
  with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
258
  tmp_file.write(processed_content)
259
  return tmp_file.name
260
 
 
261
  def process_input_updated(url_or_id, source_type, depth, output_format_selection, progress=gr.Progress(track_tqdm=True)):
 
262
  progress(0, desc="Initializing...")
263
  raw_content = ""
264
  error_message = ""
 
266
 
267
  if source_type == "GitHub Repository":
268
  if not check_repomix_installed():
269
+ error_message = "Repomix is not installed or not accessible. Please ensure it's installed globally."
270
+ return error_message, None, None
271
+ raw_content, _ = run_repomix(url_or_id, progress=progress)
272
+ if "Error" in raw_content:
 
273
  error_message = raw_content
274
  raw_content = ""
 
275
  elif source_type == "Webpage":
276
  raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
277
+ if "Error" in raw_content:
278
  error_message = raw_content
279
  raw_content = ""
280
  else:
 
282
  return error_message, None, None
283
 
284
  if error_message:
285
+ return error_message, None, None
 
286
 
 
 
287
  try:
288
  progress(0.9, desc=f"Converting to {output_format_selection}...")
289
  output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
290
 
291
+ preview_content = raw_content # Default for Markdown, Text
 
292
  if output_format_selection == "JSON":
293
  preview_content = convert_to_json(raw_content, url_or_id)
294
  elif output_format_selection == "CSV" and output_file_path:
295
  try:
296
  with open(output_file_path, 'r', encoding='utf-8') as f_csv:
 
297
  csv_preview_lines = [next(f_csv) for _ in range(5)]
298
  preview_content = "".join(csv_preview_lines)
299
+ if not preview_content: preview_content = "[CSV content is empty or very short]"
300
+ except StopIteration:
 
 
 
301
  with open(output_file_path, 'r', encoding='utf-8') as f_csv:
302
  preview_content = f_csv.read()
303
+ if not preview_content: preview_content = "[CSV content is empty]"
 
304
  except Exception as e_csv_preview:
305
  preview_content = f"[Error reading CSV for preview: {str(e_csv_preview)}]"
306
  elif output_format_selection == "CSV" and not output_file_path:
307
  preview_content = "[CSV file path not available for preview]"
308
+ elif output_format_selection == "PDF":
309
+ preview_content = f"[PDF generated. Download to view: {os.path.basename(output_file_path if output_file_path else 'file.pdf')}]"
310
+ if "Saving as Markdown instead" in (output_file_path or ""): # Check if PDF failed
311
+ preview_content = raw_content + f"\n\n[Note: PDF conversion failed, showing Markdown. File saved as .pdf.md]"
312
 
313
 
 
314
  progress(1, desc="Processing complete.")
315
  return f"Successfully processed: {url_or_id}", preview_content, output_file_path
316
  except Exception as e:
 
317
  return f"Error during file conversion/saving: {str(e)}", raw_content, None
318
 
319
+ with gr.Blocks(theme=seafoam_theme) as iface: # Applied custom theme
 
 
320
  gr.Markdown("# RAG-Ready Content Scraper")
321
  gr.Markdown(
322
+ "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
 
323
  )
324
 
325
  with gr.Row():
 
339
  info="0: Only main page. Ignored for GitHub repos."
340
  )
341
  output_format_input = gr.Dropdown(
342
+ choices=["Markdown", "JSON", "CSV", "Text", "PDF"], # Added Text and PDF
343
  value="Markdown",
344
  label="Select Output Format"
345
  )
 
347
 
348
  with gr.Column(scale=3):
349
  status_output = gr.Textbox(label="Status", interactive=False)
350
+ preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
351
  file_download_output = gr.File(label="Download Processed File", interactive=False)
352
 
353
+ # Removed progress_bar = gr.Progress(track_tqdm=True) as it's passed directly
354
 
 
355
  gr.Examples(
356
  examples=[
357
  ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
358
+ ["gradio-app/gradio", "GitHub Repository", 0, "Text"], # Changed to Text
359
  ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
360
  ],
361
  inputs=[url_input, source_type_input, depth_input, output_format_input],
362
+ outputs=[status_output, preview_output, file_download_output],
363
+ fn=process_input_updated,
364
+ cache_examples=False
365
  )
366
 
 
367
  with gr.Accordion("How it Works & More Info", open=False):
368
  gr.Markdown(
369
  """
370
  **Webpage Scraping:**
371
  1. Enter a full URL (e.g., `https://example.com`).
372
  2. Select "Webpage" as the source type.
373
+ 3. Set the desired scraping depth.
374
  4. Choose your output format.
 
375
 
376
  **GitHub Repository Processing:**
377
+ 1. Enter a GitHub repository URL or ID (e.g., `username/repo`).
378
+ 2. Select "GitHub Repository". (Depth is ignored).
379
+ 3. Choose your output format. Uses **RepoMix**.
 
380
 
381
+ **Output Formats:** Markdown, JSON, CSV, Text, PDF.
 
 
 
382
 
383
+ **Note:** PDF generation requires `markdown-pdf` library.
384
+ This app is designed for Docker/HuggingFace Spaces.
385
 
386
+ [View Source Code on HuggingFace Spaces](https://huggingface.co/spaces/CultriX/RAG-Scraper)
387
  """
388
  )
389
 
390
  submit_button.click(
391
  fn=process_input_updated,
392
+ inputs=[url_input, source_type_input, depth_input, output_format_input],
393
  outputs=[status_output, preview_output, file_download_output],
 
394
  )
395
 
396
  if __name__ == "__main__":
pyproject.toml CHANGED
@@ -12,6 +12,7 @@ python = "^3.10"
12
  requests = "^2.31.0"
13
  beautifulsoup4 = "^4.12.2"
14
  html2text = "^2020.1.16"
 
15
 
16
  [tool.poetry.group.dev]
17
  optional = true
 
12
  requests = "^2.31.0"
13
  beautifulsoup4 = "^4.12.2"
14
  html2text = "^2020.1.16"
15
+ markdown-pdf = "^0.2.1"
16
 
17
  [tool.poetry.group.dev]
18
  optional = true