CultriX committed on
Commit
d82ab96
·
1 Parent(s): 32f722f

feat: Overhaul WebUI, add PDF/Text export, use Poetry in Docker

Browse files
Files changed (1) hide show
  1. app.py +10 -98
app.py CHANGED
@@ -6,95 +6,16 @@ import re
6
  import tempfile
7
  import json
8
  import csv
9
- from typing import Iterable # Added for Theme
10
  from rag_scraper.scraper import Scraper
11
  from rag_scraper.converter import Converter
12
  from rag_scraper.link_extractor import LinkExtractor, LinkType
13
  from rag_scraper.utils import URLUtils
14
- from gradio.themes.base import Base # Added for Theme
15
- from gradio.themes.utils import colors, fonts, sizes # Added for Theme
16
  import markdown_pdf # Added for PDF conversion
17
 
18
- # --- Custom Theme Definition ---
19
- class Seafoam(Base):
20
- def __init__(
21
- self,
22
- *,
23
- primary_hue: colors.Color | str = colors.teal,
24
- secondary_hue: colors.Color | str = colors.cyan,
25
- neutral_hue: colors.Color | str = colors.gray,
26
- spacing_size: sizes.Size | str = sizes.spacing_md,
27
- radius_size: sizes.Size | str = sizes.radius_md,
28
- text_size: sizes.Size | str = sizes.text_md, # Adjusted from lg for a more professional feel
29
- font: fonts.Font
30
- | str
31
- | Iterable[fonts.Font | str] = (
32
- fonts.GoogleFont("Inter"), # Modern sans-serif
33
- "ui-sans-serif",
34
- "system-ui",
35
- "sans-serif",
36
- ),
37
- font_mono: fonts.Font
38
- | str
39
- | Iterable[fonts.Font | str] = (
40
- fonts.GoogleFont("IBM Plex Mono"),
41
- "ui-monospace",
42
- "monospace",
43
- ),
44
- ):
45
- super().__init__(
46
- primary_hue=primary_hue,
47
- secondary_hue=secondary_hue,
48
- neutral_hue=neutral_hue,
49
- spacing_size=spacing_size,
50
- radius_size=radius_size,
51
- text_size=text_size,
52
- font=font,
53
- font_mono=font_mono,
54
- )
55
- # Dark Mode First
56
- super().set(
57
- # Core Colors
58
- body_background_fill_dark="black", # True black
59
- body_text_color_dark="*neutral_100",
60
- block_background_fill_dark=colors.gray_900,
61
- block_border_color_dark=colors.gray_700,
62
- block_label_background_fill_dark=colors.gray_800,
63
- block_label_text_color_dark="*neutral_100",
64
- input_background_fill_dark=colors.gray_800,
65
- input_border_color_dark=colors.gray_600,
66
- input_text_color_dark="*neutral_50",
67
- button_primary_background_fill_dark=colors.teal_600,
68
- button_primary_background_fill_hover_dark=colors.teal_500,
69
- button_primary_text_color_dark="white",
70
- button_secondary_background_fill_dark=colors.gray_700,
71
- button_secondary_background_fill_hover_dark=colors.gray_600,
72
- button_secondary_text_color_dark="white",
73
- slider_color_dark=colors.teal_500,
74
- # Light Mode
75
- body_background_fill="white",
76
- body_text_color=colors.gray_800,
77
- block_background_fill="*neutral_50",
78
- block_border_color=colors.gray_300,
79
- block_label_background_fill="*neutral_100",
80
- block_label_text_color=colors.gray_700,
81
- input_background_fill=colors.white,
82
- input_border_color=colors.gray_300,
83
- input_text_color=colors.gray_900,
84
- button_primary_background_fill=colors.teal_500,
85
- button_primary_background_fill_hover=colors.teal_600,
86
- button_primary_text_color="white",
87
- button_secondary_background_fill="*neutral_100",
88
- button_secondary_background_fill_hover=colors.gray_300,
89
- button_secondary_text_color=colors.gray_800,
90
- slider_color=colors.teal_500,
91
- # General
92
- block_title_text_weight="600",
93
- block_shadow="*shadow_drop_lg",
94
- button_shadow="*shadow_drop"
95
- )
96
-
97
- seafoam_theme = Seafoam()
98
 
99
  def is_github_repo(url_or_id):
100
  """Check if the input is a GitHub repository URL or ID."""
@@ -238,22 +159,16 @@ def save_output_to_file(content, output_format, source_url_or_id):
238
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf_file:
239
  pdf_output_path = tmp_pdf_file.name
240
 
241
- # Basic PDF conversion from Markdown string
242
- # You might need to install a library like `markdown-pdf` or `WeasyPrint`
243
- # Example using markdown_pdf (ensure it's installed: pip install markdown-pdf)
244
  md_pdf = markdown_pdf.MarkdownPdf(toc_level=2)
245
- # md_pdf.meta["css"] = "your_custom_css_path.css" # Optional: for styling
246
  md_pdf.convert_from_string(content, pdf_output_path)
247
  return pdf_output_path
248
  except Exception as e:
249
- # Fallback: save as markdown with .pdf.md suffix if PDF fails
250
  print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
251
- suffix = ".pdf.md" # Indicate it's markdown intended for PDF
252
  # No processed_content change needed, it's already markdown
253
  else: # Default to Markdown
254
  suffix = ".md"
255
 
256
- # For formats that don't return early (JSON, Text, Markdown, PDF fallback)
257
  with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
258
  tmp_file.write(processed_content)
259
  return tmp_file.name
@@ -288,7 +203,7 @@ def process_input_updated(url_or_id, source_type, depth, output_format_selection
288
  progress(0.9, desc=f"Converting to {output_format_selection}...")
289
  output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
290
 
291
- preview_content = raw_content # Default for Markdown, Text
292
  if output_format_selection == "JSON":
293
  preview_content = convert_to_json(raw_content, url_or_id)
294
  elif output_format_selection == "CSV" and output_file_path:
@@ -307,16 +222,15 @@ def process_input_updated(url_or_id, source_type, depth, output_format_selection
307
  preview_content = "[CSV file path not available for preview]"
308
  elif output_format_selection == "PDF":
309
  preview_content = f"[PDF generated. Download to view: {os.path.basename(output_file_path if output_file_path else 'file.pdf')}]"
310
- if "Saving as Markdown instead" in (output_file_path or ""): # Check if PDF failed
311
  preview_content = raw_content + f"\n\n[Note: PDF conversion failed, showing Markdown. File saved as .pdf.md]"
312
 
313
-
314
  progress(1, desc="Processing complete.")
315
  return f"Successfully processed: {url_or_id}", preview_content, output_file_path
316
  except Exception as e:
317
  return f"Error during file conversion/saving: {str(e)}", raw_content, None
318
 
319
- with gr.Blocks(theme=seafoam_theme) as iface: # Applied custom theme
320
  gr.Markdown("# RAG-Ready Content Scraper")
321
  gr.Markdown(
322
  "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
@@ -339,7 +253,7 @@ with gr.Blocks(theme=seafoam_theme) as iface: # Applied custom theme
339
  info="0: Only main page. Ignored for GitHub repos."
340
  )
341
  output_format_input = gr.Dropdown(
342
- choices=["Markdown", "JSON", "CSV", "Text", "PDF"], # Added Text and PDF
343
  value="Markdown",
344
  label="Select Output Format"
345
  )
@@ -350,12 +264,10 @@ with gr.Blocks(theme=seafoam_theme) as iface: # Applied custom theme
350
  preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
351
  file_download_output = gr.File(label="Download Processed File", interactive=False)
352
 
353
- # Removed progress_bar = gr.Progress(track_tqdm=True) as it's passed directly
354
-
355
  gr.Examples(
356
  examples=[
357
  ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
358
- ["gradio-app/gradio", "GitHub Repository", 0, "Text"], # Changed to Text
359
  ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
360
  ],
361
  inputs=[url_input, source_type_input, depth_input, output_format_input],
 
6
  import tempfile
7
  import json
8
  import csv
9
+ # Removed: from typing import Iterable # Added for Theme
10
  from rag_scraper.scraper import Scraper
11
  from rag_scraper.converter import Converter
12
  from rag_scraper.link_extractor import LinkExtractor, LinkType
13
  from rag_scraper.utils import URLUtils
14
+ # Removed: from gradio.themes.base import Base # Added for Theme
15
+ # Removed: from gradio.themes.utils import colors, fonts, sizes # Added for Theme
16
  import markdown_pdf # Added for PDF conversion
17
 
18
+ # --- Custom Theme Definition --- (REMOVED Seafoam class and instance)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  def is_github_repo(url_or_id):
21
  """Check if the input is a GitHub repository URL or ID."""
 
159
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf_file:
160
  pdf_output_path = tmp_pdf_file.name
161
 
 
 
 
162
  md_pdf = markdown_pdf.MarkdownPdf(toc_level=2)
 
163
  md_pdf.convert_from_string(content, pdf_output_path)
164
  return pdf_output_path
165
  except Exception as e:
 
166
  print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
167
+ suffix = ".pdf.md"
168
  # No processed_content change needed, it's already markdown
169
  else: # Default to Markdown
170
  suffix = ".md"
171
 
 
172
  with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
173
  tmp_file.write(processed_content)
174
  return tmp_file.name
 
203
  progress(0.9, desc=f"Converting to {output_format_selection}...")
204
  output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
205
 
206
+ preview_content = raw_content
207
  if output_format_selection == "JSON":
208
  preview_content = convert_to_json(raw_content, url_or_id)
209
  elif output_format_selection == "CSV" and output_file_path:
 
222
  preview_content = "[CSV file path not available for preview]"
223
  elif output_format_selection == "PDF":
224
  preview_content = f"[PDF generated. Download to view: {os.path.basename(output_file_path if output_file_path else 'file.pdf')}]"
225
+ if "Saving as Markdown instead" in (output_file_path or ""):
226
  preview_content = raw_content + f"\n\n[Note: PDF conversion failed, showing Markdown. File saved as .pdf.md]"
227
 
 
228
  progress(1, desc="Processing complete.")
229
  return f"Successfully processed: {url_or_id}", preview_content, output_file_path
230
  except Exception as e:
231
  return f"Error during file conversion/saving: {str(e)}", raw_content, None
232
 
233
+ with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface:
234
  gr.Markdown("# RAG-Ready Content Scraper")
235
  gr.Markdown(
236
  "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
 
253
  info="0: Only main page. Ignored for GitHub repos."
254
  )
255
  output_format_input = gr.Dropdown(
256
+ choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
257
  value="Markdown",
258
  label="Select Output Format"
259
  )
 
264
  preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
265
  file_download_output = gr.File(label="Download Processed File", interactive=False)
266
 
 
 
267
  gr.Examples(
268
  examples=[
269
  ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
270
+ ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
271
  ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
272
  ],
273
  inputs=[url_input, source_type_input, depth_input, output_format_input],