# RAG-Scraper / app.py
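"""Gradio application that turns webpages (via RAG-Scraper) or GitHub repositories
(via Repomix) into RAG-ready Markdown, JSON, or CSV files."""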
import gradio as gr
import subprocess
import os
import re
import tempfile
import json
import csv
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils
def is_github_repo(url_or_id):
"""Check if the input is a GitHub repository URL or ID."""
if "github.com" in url_or_id:
return True
if re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id):
return True
return False
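# Illustrative behaviour of the check above (this helper is not called elsewhere in this file):
#   is_github_repo("https://github.com/gradio-app/gradio")  -> True
#   is_github_repo("gradio-app/gradio")                      -> True
#   is_github_repo("https://example.com")                    -> False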
def check_repomix_installed():
"""Check if Repomix is installed."""
try:
result = subprocess.run(["repomix", "--version"],
capture_output=True, text=True, check=False)
return result.returncode == 0
except Exception:
return False
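# A zero exit code from `repomix --version` is treated as "installed"; a missing
# executable raises FileNotFoundError, which the except clause reports as False.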
def run_repomix(repo_url_or_id, progress=gr.Progress(track_tqdm=True)):
"""Run Repomix on the GitHub repository and return the content."""
progress(0, desc="Starting Repomix processing...")
try:
with tempfile.TemporaryDirectory() as temp_dir:
            # Repomix writes its packed output to a single file; the command below
            # requests Markdown explicitly via --style and directs it to a known
            # path via --output so it can be read back afterwards.
            output_file_name = "repomix-output.md"
output_file_path = os.path.join(temp_dir, output_file_name)
if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
repo_url = f"https://github.com/{repo_url_or_id}"
else:
repo_url = repo_url_or_id
progress(0.2, desc=f"Running Repomix on {repo_url}...")
cmd = [
"repomix",
"--remote", repo_url,
"--output", output_file_path, # Direct output to a file
"--style", "markdown", # Explicitly request markdown
"--compress"
]
process = subprocess.run(cmd, capture_output=True, text=True, check=False)
progress(0.8, desc="Repomix command executed.")
if process.returncode != 0:
return f"Error running Repomix: {process.stderr}", None
if os.path.exists(output_file_path):
with open(output_file_path, 'r', encoding='utf-8') as f:
content = f.read()
progress(1, desc="Repomix output processed.")
return content, output_file_path # Return content and path for potential download
else:
return "Error: Repomix did not generate an output file.", None
except Exception as e:
progress(1, desc="Error during Repomix processing.")
return f"Error processing GitHub repository: {str(e)}", None
def scrape_and_convert_website(url, depth, progress=gr.Progress(track_tqdm=True)):
"""Fetch HTML, extract links, convert to Markdown."""
progress(0, desc=f"Starting web scrape for {url}...")
visited_urls = set()
all_markdown_content = ""
def recursive_scrape(current_url, current_depth, total_links_estimate=1, link_index=0):
if current_url in visited_urls or current_depth < 0:
return ""
visited_urls.add(current_url)
try:
progress_val = link_index / total_links_estimate if total_links_estimate > 0 else 0
progress(progress_val, desc=f"Scraping: {current_url} (Depth: {depth - current_depth})")
html_content = Scraper.fetch_html(current_url)
except Exception as e:
return f"Error fetching {current_url}: {str(e)}\n"
markdown_content = f"## Extracted from: {current_url}\n\n"
markdown_content += Converter.html_to_markdown(
html=html_content,
base_url=current_url,
parser_features='html.parser',
ignore_links=True
)
page_content = markdown_content + "\n\n"
if current_depth > 0:
try:
links = LinkExtractor.scrape_url(current_url, link_type=LinkType.INTERNAL)
                # Keep only internal links that have not been visited yet
valid_links = [
link for link in links
if URLUtils.is_internal(link, current_url) and link not in visited_urls
]
num_links = len(valid_links)
for i, link_url in enumerate(valid_links):
page_content += recursive_scrape(link_url, current_depth - 1, num_links, i)
except Exception as e:
page_content += f"Error extracting links from {current_url}: {str(e)}\n"
return page_content
all_markdown_content = recursive_scrape(url, depth)
progress(1, desc="Web scraping complete.")
# For web scraping, we create a temporary file with the content for download
with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp_file:
tmp_file.write(all_markdown_content)
return all_markdown_content, tmp_file.name
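# Note: recursive_scrape appends one "## Extracted from: <url>" section per visited
# page, so a depth of 1 yields the start page followed by every internal link found
# on it (deduplicated via visited_urls).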
# --- Data Conversion Functions ---
def convert_to_json(markdown_content, source_url_or_id):
"""Converts markdown content to a JSON string."""
# Basic implementation: create a JSON object with source and content
# More sophisticated parsing can be added later
data = {"source": source_url_or_id, "content": markdown_content}
return json.dumps(data, indent=2)
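# Shape of the JSON produced (values illustrative):
#   {
#     "source": "https://example.com",
#     "content": "## Extracted from: https://example.com\n\n..."
#   }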
def convert_to_csv(markdown_content, source_url_or_id):
"""Converts markdown content to a CSV string."""
# Basic implementation: create a CSV with source and content
# This is a simplified CSV; real CSVs might need more structure
output = tempfile.NamedTemporaryFile(mode='w+', delete=False, newline='', suffix=".csv", encoding="utf-8")
writer = csv.writer(output)
writer.writerow(["source", "content"]) # Header
# Split content into manageable chunks or lines if necessary for CSV
# For now, putting all content in one cell.
writer.writerow([source_url_or_id, markdown_content])
output.close()
return output.name # Return path to the CSV file
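# The resulting CSV has a header row and a single data row, with the entire
# Markdown document stored in the "content" cell, e.g.:
#   source,content
#   https://example.com,"## Extracted from: https://example.com ..."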
def save_output_to_file(content, output_format, source_url_or_id):
"""Saves content to a temporary file based on format and returns its path."""
suffix = f".{output_format.lower()}"
if output_format == "JSON":
processed_content = convert_to_json(content, source_url_or_id)
elif output_format == "CSV":
# convert_to_csv now returns a path directly
return convert_to_csv(content, source_url_or_id)
else: # Markdown/Text
processed_content = content
suffix = ".md"
with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
tmp_file.write(processed_content)
return tmp_file.name
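# save_output_to_file always hands back a filesystem path suitable for gr.File:
# Markdown is written out verbatim, JSON is serialized first, and CSV delegates to
# convert_to_csv, which already writes its own temporary file.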
# --- Main Processing Function ---
def process_input_updated(url_or_id, source_type, depth, output_format_selection, progress=gr.Progress(track_tqdm=True)):
"""Main function to process URL or GitHub repo based on selected type and format."""
progress(0, desc="Initializing...")
raw_content = ""
error_message = ""
output_file_path = None
if source_type == "GitHub Repository":
if not check_repomix_installed():
error_message = "Repomix is not installed or not accessible. Please ensure it's installed globally in your Docker environment."
return error_message, None, None # Text output, Preview, File output
raw_content, _ = run_repomix(url_or_id, progress=progress) # Repomix returns content and its original path
if "Error" in raw_content: # Simple error check
error_message = raw_content
raw_content = ""
elif source_type == "Webpage":
raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
if "Error" in raw_content: # Simple error check
error_message = raw_content
raw_content = ""
else:
error_message = "Invalid source type selected."
return error_message, None, None
if error_message:
return error_message, None, None # Error text, no preview, no file
# Save raw_content (which is markdown) to a file of the chosen output_format
# This will handle conversion if necessary
try:
progress(0.9, desc=f"Converting to {output_format_selection}...")
output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
# For preview, we'll show the raw markdown, or a snippet of JSON/CSV
preview_content = raw_content # Default to markdown
if output_format_selection == "JSON":
preview_content = convert_to_json(raw_content, url_or_id)
        elif output_format_selection == "CSV":
            # The CSV is written straight to disk, so point the user at the file instead of inlining it.
            preview_content = f"CSV file generated at: {output_file_path}\nDownload the file to inspect its contents."
            # Alternatively, read back the first few lines for a preview:
            # with open(output_file_path, 'r', encoding='utf-8') as f_csv:
            #     preview_content = "".join(f_csv.readlines()[:5])
progress(1, desc="Processing complete.")
return f"Successfully processed: {url_or_id}", preview_content, output_file_path
except Exception as e:
return f"Error during file conversion/saving: {str(e)}", raw_content, None
# --- Gradio Interface Definition ---
with gr.Blocks(theme=gr.themes.Soft()) as iface:
gr.Markdown("# RAG-Ready Content Scraper")
    gr.Markdown(
        "Scrape webpage content (using RAG-Scraper) or GitHub repositories (using RepoMix) "
        "to generate RAG-ready datasets. Runs in a Docker-based HuggingFace Space for full functionality."
    )
with gr.Row():
with gr.Column(scale=2):
url_input = gr.Textbox(
label="Enter URL or GitHub Repository ID",
placeholder="e.g., https://example.com OR username/repo"
)
source_type_input = gr.Radio(
choices=["Webpage", "GitHub Repository"],
value="Webpage",
label="Select Source Type"
)
depth_input = gr.Slider(
minimum=0, maximum=3, step=1, value=0,
label="Scraping Depth (for Webpages)",
info="0: Only main page. Ignored for GitHub repos."
)
output_format_input = gr.Dropdown(
choices=["Markdown", "JSON", "CSV"], # Markdown is like text file
value="Markdown",
label="Select Output Format"
)
submit_button = gr.Button("Process Content", variant="primary")
with gr.Column(scale=3):
status_output = gr.Textbox(label="Status", interactive=False)
preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False) # Default to markdown, can show JSON too
file_download_output = gr.File(label="Download Processed File", interactive=False)
    # Progress reporting is handled by the gr.Progress default parameters on the
    # processing functions; no separate progress component needs to be declared here.
# --- Examples ---
gr.Examples(
examples=[
["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
["gradio-app/gradio", "GitHub Repository", 0, "Markdown"],
["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
],
inputs=[url_input, source_type_input, depth_input, output_format_input],
outputs=[status_output, preview_output, file_download_output], # Function needs to match this
fn=process_input_updated, # Make sure the function signature matches
cache_examples=False # For development, disable caching
)
# --- How it Works & GitHub Link ---
with gr.Accordion("How it Works & More Info", open=False):
gr.Markdown(
"""
**Webpage Scraping:**
1. Enter a full URL (e.g., `https://example.com`).
2. Select "Webpage" as the source type.
3. Set the desired scraping depth (how many levels of internal links to follow).
4. Choose your output format.
            5. The tool fetches HTML, converts it to Markdown, and follows internal links up to the specified depth.

            **GitHub Repository Processing:**
1. Enter a GitHub repository URL (e.g., `https://github.com/username/repo`) or shorthand ID (e.g., `username/repo`).
2. Select "GitHub Repository" as the source type. (Scraping depth is ignored).
3. Choose your output format.
            4. The tool uses **RepoMix** to fetch and process the repository into a structured Markdown format.

            **Output Formats:**
- **Markdown:** Plain text Markdown file, suitable for direct reading or further processing.
- **JSON:** Structured JSON output, typically with fields like `source` and `content`.
            - **CSV:** Comma-Separated Values file, useful for tabular data or importing into spreadsheets.

            **Note on HuggingFace Spaces:** This application is designed to run in a Docker-based HuggingFace Space,
            which allows the use of `RepoMix` for GitHub repositories.

            [View Source Code on HuggingFace Spaces](https://huggingface.co/spaces/CultriX/RAG-Scraper)
"""
)
submit_button.click(
fn=process_input_updated,
        inputs=[url_input, source_type_input, depth_input, output_format_input],
outputs=[status_output, preview_output, file_download_output]
)
if __name__ == "__main__":
iface.launch()