|
|
|
|
|
|
|
|
|
|
|
import gradio as gr |
|
import requests |
|
from bs4 import BeautifulSoup |
|
from readability import Document |
|
import html |
|
import re |
|
|
|
|
|
def _normalize_text(text: str) -> str: |
|
""" |
|
Layman's terms: This tidies up the text we extracted so it looks nice. |
|
- Converts & things back to normal characters |
|
- Collapses too many blank lines |
|
- Trims leading/trailing whitespace |
|
""" |
|
text = html.unescape(text) |
|
|
|
text = text.replace("\r\n", "\n").replace("\r", "\n") |
|
|
|
text = re.sub(r"\n{3,}", "\n\n", text) |
|
return text.strip() |
|
|
|
|
|
def _readability_extract(raw_html: str) -> str:
    """Extract the main article text from *raw_html* using readability.

    Raises:
        ValueError: when the extracted text is suspiciously short (under 40
            words), which usually means readability latched onto boilerplate
            (cookie banner, nav stub) instead of the real article. Callers
            should fall back to :func:`_heuristic_extract`.
    """
    doc = Document(raw_html)
    main_html = doc.summary(html_partial=True)

    soup = BeautifulSoup(main_html, "lxml")

    # Readability can leave non-content tags behind; drop them entirely.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    main_text = _normalize_text(soup.get_text(separator="\n"))

    if len(main_text.split()) < 40:
        raise ValueError("Readability extraction too short; falling back")

    return main_text


def _heuristic_extract(raw_html: str) -> str:
    """Best-effort text extraction used when readability fails.

    Strips obvious page chrome (scripts, headers, footers, navigation, forms,
    sidebars), then prefers an explicit main-content container
    (<main>, <article>, or div[role=main]) before falling back to the whole
    document's text.
    """
    soup = BeautifulSoup(raw_html, "lxml")

    for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
        tag.decompose()

    candidate = (
        soup.find("main")
        or soup.find("article")
        or soup.find("div", attrs={"role": "main"})
    )
    if candidate:
        text = candidate.get_text(separator="\n")
    else:
        text = soup.get_text(separator="\n")

    return _normalize_text(text)


def fetch_page(url: str, extract_text: bool = True) -> str:
    """Download a web page and return its content as a string.

    Layman's terms: We download the web page. If 'extract_text' is True,
    we try to grab only the main article/important text. Otherwise we
    return the raw HTML (like your original app).

    Args:
        url: The page to fetch. Redirects are followed.
        extract_text: When True (default), return only the main readable
            text; when False, return the raw HTML body.

    Returns:
        The extracted text or raw HTML on success; a human-readable error
        message string on failure (this function never raises — errors are
        reported in-band so the UI can display them).
    """
    try:
        resp = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0 (compatible; FetchMCP/1.0)"},
            timeout=15,
            allow_redirects=True,
        )
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts, bad schemes, and HTTP 4xx/5xx.
        return f"Request error: {e}"

    if not extract_text:
        return resp.text

    # First choice: readability's article extraction. Broad except is
    # deliberate — any failure (parse error, too-short result) should fall
    # through to the heuristic scrape rather than crash the UI.
    try:
        return _readability_extract(resp.text)
    except Exception:
        try:
            return _heuristic_extract(resp.text)
        except Exception as e:
            # Last resort: report the failure but still hand back the raw
            # HTML so the user gets *something* to work with.
            return f"Extraction fallback failed: {e}\n\n--- Raw HTML below ---\n{resp.text}"
|
|
|
|
|
|
|
|
|
# Build the Gradio UI. Component placement follows statement order inside
# the Blocks context, so the layout below reads top-to-bottom as rendered.
with gr.Blocks(theme="Nymbo/Nymbo_Theme", title="Fetch MCP") as demo:

    # Page header / short usage note.
    gr.Markdown(

        """

        # Fetch MCP

        Small utility that fetches a web page and returns **just the readable text** by default

        *(toggle off to get the full HTML like before)*.

        """

    )



    # Row 1: the URL to fetch.
    with gr.Row():

        url_input = gr.Textbox(

            label="URL",

            placeholder="https://example.com/article",

            lines=1,

        )

    # Row 2: extraction mode toggle (defaults to readable-text mode,
    # matching fetch_page's extract_text default).
    with gr.Row():

        extract_toggle = gr.Checkbox(

            value=True,

            label="Extract only the main readable text (recommended)",

        )



    fetch_btn = gr.Button("Fetch", variant="primary")




    # Read-only output area for the fetched text / HTML / error message.
    output = gr.Textbox(

        label="Output",

        lines=20,

        interactive=False,

        placeholder="Fetched content will appear here…",

    )




    # Wire the button to the fetcher: (url, toggle) -> output text.
    fetch_btn.click(fn=fetch_page, inputs=[url_input, extract_toggle], outputs=output)
|
|
|
|
|
# Script entry point: start the Gradio app. mcp_server=True additionally
# exposes the app's functions over the Model Context Protocol.
if __name__ == "__main__":

    demo.launch(mcp_server=True)
|
|