Spaces:

Nymbo
/

Fetch

Running

App Files Files Community

Fetch / app.py

Nymbo

Update app.py

32db98e verified 18 days ago

raw

history blame

5.39 kB

	# File: app.py
	# Purpose: Provide a Gradio UI that fetches a URL and (by default) returns only the
	# relevant human-readable text instead of the entire HTML.
	# Includes robust error handling, timeouts, and fallbacks.

	import gradio as gr # UI framework
	import requests # makes the web request
	from bs4 import BeautifulSoup # parses HTML so we can work with it
	from readability import Document # distills a page down to its "main article" content
	import html # unescapes HTML entities, e.g. turns "&amp;" back into "&"
	import re # simple cleanup with regex

	# ---- helper: clean up text nicely -------------------------------------------
	def _normalize_text(text: str) -> str:
	"""
	Layman's terms: This tidies up the text we extracted so it looks nice.
	- Converts & things back to normal characters
	- Collapses too many blank lines
	- Trims leading/trailing whitespace
	"""
	text = html.unescape(text)
	# Replace Windows/Mac line endings with Unix and normalize spaces
	text = text.replace("\r\n", "\n").replace("\r", "\n")
	# Collapse 3+ newlines down to 2
	text = re.sub(r"\n{3,}", "\n\n", text)
	return text.strip()

	# ---- core fetcher: return main text or raw HTML ------------------------------
	# A readability result with fewer words than this is treated as a miss and
	# triggers the simpler whole-page fallback.
	_MIN_READABLE_WORDS = 40


	def _readability_text(page_html: str) -> str:
	    """Return the visible text of the "main article" that readability picks out."""
	    # readability extracts the "main" content and returns HTML of just that part
	    article_html = Document(page_html).summary(html_partial=True)
	    soup = BeautifulSoup(article_html, "lxml")
	    # Remove script/style just in case any survived the extraction
	    for tag in soup(["script", "style", "noscript"]):
	        tag.decompose()
	    return _normalize_text(soup.get_text(separator="\n"))


	def _whole_page_text(page_html: str) -> str:
	    """Return text from the whole page with obviously noisy areas removed."""
	    soup = BeautifulSoup(page_html, "lxml")
	    # Remove common noise: scripts, styles, nav, footer, header, forms
	    for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
	        tag.decompose()
	    # If there's a <main> or an article-like block, prefer that
	    candidate = soup.find("main") or soup.find("article") or soup.find("div", attrs={"role": "main"})
	    source = candidate if candidate else soup
	    return _normalize_text(source.get_text(separator="\n"))


	def fetch_page(url: str, extract_text: bool = True) -> str:
	    """Download a web page and return its readable text (or its raw HTML).

	    Layman's terms: We download the web page. If 'extract_text' is True,
	    we try to grab only the main article/important text. Otherwise we
	    return the raw HTML.

	    Args:
	        url: Address of the page to fetch. Surrounding whitespace is ignored.
	        extract_text: When True (default) return only the main readable text;
	            when False return the response body unchanged.

	    Returns:
	        The extracted text or raw HTML. Failures are reported in the returned
	        string rather than raised: request problems start with
	        "Request error:", and a failed fallback extraction starts with
	        "Extraction fallback failed:" followed by the raw HTML.
	    """
	    # NOTE(review): the URL is user-supplied and fetched from the server side;
	    # nothing here restricts private/internal hosts. Confirm that is acceptable
	    # for the deployment.
	    url = (url or "").strip()
	    if not url:
	        return "Request error: no URL provided"

	    try:
	        # Make the request with a friendly browser-like header and a timeout
	        resp = requests.get(
	            url,
	            headers={"User-Agent": "Mozilla/5.0 (compatible; FetchMCP/1.0)"},
	            timeout=15,
	            allow_redirects=True,
	        )
	        resp.raise_for_status()  # If site returns 4xx/5xx, this will raise an error
	    except requests.exceptions.RequestException as e:
	        # Layman's terms: If anything goes wrong with the request, report it nicely.
	        return f"Request error: {e}"

	    # When a text response declares no charset, requests falls back to
	    # ISO-8859-1, which garbles UTF-8 pages. Use the encoding detected from the
	    # body instead in that case.
	    if "charset" not in resp.headers.get("Content-Type", "").lower():
	        resp.encoding = resp.apparent_encoding or "utf-8"
	    page_html = resp.text

	    # If the user wants full HTML, hand back the body untouched
	    if not extract_text:
	        return page_html

	    # Try readability first (usually best for articles/blog posts). This step is
	    # deliberately best-effort: any failure simply routes to the fallback below.
	    try:
	        article_text = _readability_text(page_html)
	    except Exception:
	        article_text = ""
	    if len(article_text.split()) >= _MIN_READABLE_WORDS:
	        return article_text

	    # Simpler fallback: strip tags from the whole page but ignore noisy areas
	    try:
	        page_text = _whole_page_text(page_html)
	    except Exception as e:
	        # Last resort: give raw HTML if even fallback parsing fails
	        return f"Extraction fallback failed: {e}\n\n--- Raw HTML below ---\n{page_html}"

	    # A short readability result still beats returning nothing at all.
	    return page_text or article_text

	# ---- Gradio UI ---------------------------------------------------------------
	# Layman's terms: This is the app window. You paste a URL and choose whether to
	# extract readable text or keep full HTML. Then click "Fetch".
	with gr.Blocks(title="Fetch MCP", theme="Nymbo/Nymbo_Theme") as demo:
	    # Intro text shown at the top of the page.
	    gr.Markdown(
	        """
	# Fetch MCP
	Small utility that fetches a web page and returns just the readable text by default
	(toggle off to get the full HTML like before).
	"""
	    )

	    # Inputs: the page address, then the "readable text vs. full HTML" switch,
	    # each on its own row.
	    with gr.Row():
	        url_input = gr.Textbox(label="URL", lines=1, placeholder="https://example.com/article")
	    with gr.Row():
	        extract_toggle = gr.Checkbox(label="Extract only the main readable text (recommended)", value=True)

	    fetch_btn = gr.Button(value="Fetch", variant="primary")

	    # Result box: plain, read-only text so it is easy to copy or pipe elsewhere.
	    output = gr.Textbox(
	        label="Output",
	        placeholder="Fetched content will appear here…",
	        interactive=False,
	        lines=20,
	    )

	    # Clicking the button runs fetch_page with both inputs and shows its result.
	    fetch_btn.click(
	        fn=fetch_page,
	        inputs=[url_input, extract_toggle],
	        outputs=output,
	    )


	# Start the app only when run as a script; the MCP server stays enabled.
	if __name__ == "__main__":
	    demo.launch(mcp_server=True)