|
import gradio as gr |
|
import requests |
|
from bs4 import BeautifulSoup |
|
|
|
def fetch_content(url):
    """Download a web page and return its visible text as a plain string.

    The page is fetched with ``requests``, parsed with BeautifulSoup's
    ``html.parser``, and every ``<script>`` and ``<style>`` element is
    removed before the text is extracted. Whitespace is then normalised:
    each line is stripped, lines are split wherever two consecutive spaces
    occur, and the non-empty pieces are joined with newlines.

    Args:
        url: Address of the page to download.

    Returns:
        The cleaned text, one phrase per line. If the request fails with any
        ``requests.exceptions.RequestException`` (including an HTTP error
        status raised by ``raise_for_status``), the string
        ``"An error occurred: <details>"`` is returned instead of raising.
    """
    # Only the network calls can raise RequestException, so only they are
    # guarded; parsing happens after the request has fully succeeded.
    try:
        # A browser-like User-Agent is sent with the request, and timeout=10
        # makes requests give up on a server that stays silent for 10 seconds.
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    soup = BeautifulSoup(response.text, 'html.parser')

    # Script and style bodies are not visible text; drop them from the tree
    # so get_text() does not return their source code.
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()

    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    # Split on a DOUBLE space so that only phrases separated by wide gaps are
    # broken apart. Splitting on a single space would put every individual
    # word on its own line once the chunks are joined with newlines.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Empty chunks (blank lines, runs of spaces) are discarded.
    return '\n'.join(chunk for chunk in chunks if chunk)
|
|
|
|
|
# Input widget: a single-line textbox where the user types the page address.
_url_box = gr.Textbox(label="URL", placeholder="Enter a webpage URL...")

# Output widget: a 20-line textbox that shows whatever fetch_content returns.
_text_box = gr.Textbox(label="Clean Text Content", lines=20)

# Wire fetch_content to the two widgets as a simple one-in / one-out interface.
demo = gr.Interface(
    fn=fetch_content,
    inputs=_url_box,
    outputs=_text_box,
    title="Webpage Text Extractor",
    description="Enter a URL to fetch the clean text content from the web page, stripped of all HTML, scripts, and styles.",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)


# Start the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    demo.launch()