Spaces:

Nymbo
/

Fetch

Running

Fetch / app.py

Update app.py

dca43df verified 18 days ago

1.75 kB

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup

	def fetch_content(url):
	    """Fetch a web page and return its readable text content.

	    The page is downloaded with ``requests``, parsed with BeautifulSoup's
	    ``html.parser``, stripped of ``<script>`` and ``<style>`` elements, and
	    reduced to non-empty text fragments joined by newlines.

	    Args:
	        url: Address of the page to fetch; passed unchanged to
	            ``requests.get``.

	    Returns:
	        The extracted text, one fragment per line. If the request fails
	        (connection problem, invalid URL, timeout, or a 4xx/5xx status),
	        the string ``"An error occurred: <details>"`` is returned instead
	        of raising.
	    """
	    # Step 1: Fetch the page. Only the network call sits inside the try so
	    # that parsing problems are not mistaken for request failures.
	    try:
	        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
	        response.raise_for_status()  # Raises an HTTPError for bad responses (4xx or 5xx)
	    except requests.exceptions.RequestException as e:
	        return f"An error occurred: {e}"

	    # Step 2: Parse the raw bytes rather than response.text. When the server
	    # sends no charset, requests falls back to ISO-8859-1 for text/* bodies,
	    # which garbles UTF-8 pages. Handing BeautifulSoup the bytes lets it use
	    # the document's own <meta> declaration; the header charset is forwarded
	    # only when the server actually declared one.
	    content_type = response.headers.get('Content-Type', '')
	    declared_encoding = response.encoding if 'charset' in content_type.lower() else None
	    soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared_encoding)

	    # Step 3: Remove script and style tags, as they don't contain readable content
	    for script_or_style in soup(['script', 'style']):
	        script_or_style.decompose()

	    # Step 4: Get the text and clean it up
	    text = soup.get_text()
	    lines = (line.strip() for line in text.splitlines())
	    # Split on a double space, not a single one: a single space would put
	    # every word on its own line, whereas a double space only separates
	    # fragments that were laid out side by side on the same source line.
	    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
	    # Drop empty fragments so blank lines do not pile up in the output.
	    return '\n'.join(chunk for chunk in chunks if chunk)

	# Build the Gradio UI: one textbox for the URL, one 20-line textbox for the
	# extracted text, both wired to fetch_content.
	demo = gr.Interface(
	    title="Webpage Text Extractor",
	    description="Enter a URL to fetch the clean text content from the web page, stripped of all HTML, scripts, and styles.",
	    fn=fetch_content,
	    inputs=gr.Textbox(label="URL", placeholder="Enter a webpage URL..."),
	    outputs=gr.Textbox(label="Clean Text Content", lines=20),
	    theme="Nymbo/Nymbo_Theme",
	    # NOTE(review): recent Gradio releases appear to favor `flagging_mode`
	    # over `allow_flagging` - confirm against the installed version.
	    allow_flagging="never",
	)

	# Launch the app only when this file is executed directly, not on import.
	if __name__ == "__main__":
	    demo.launch()