import gradio as gr
import requests
from bs4 import BeautifulSoup


def _html_to_text(markup):
    """Return the readable text of an HTML document, one phrase per line."""
    soup = BeautifulSoup(markup, 'html.parser')

    # <script> and <style> bodies are code, not readable content.
    for tag in soup(['script', 'style']):
        tag.decompose()

    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    # Split on a *double* space so that only widely separated phrases
    # (e.g. adjacent headlines) are broken apart; splitting on a single
    # space would put every word on its own line.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Drop the empty chunks left behind by blank lines and padding.
    return '\n'.join(chunk for chunk in chunks if chunk)


def fetch_content(url):
    """Fetch a web page and return its visible text.

    Args:
        url: Address of the page to download. Surrounding whitespace is
            ignored.

    Returns:
        The page text with script/style content removed and blank lines
        dropped, or a string of the form "An error occurred: ..." if the
        request fails (invalid URL, connection problem, timeout, or a
        4xx/5xx response).
    """
    # Tolerate None and pasted URLs with stray whitespace; an empty value
    # is still rejected by requests and reported through the except below.
    url = (url or "").strip()

    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=10,
        )
        # Turn 4xx/5xx responses into an HTTPError (a RequestException).
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    return _html_to_text(response.text)


# Define the Gradio interface
demo = gr.Interface(
    fn=fetch_content,
    inputs=gr.Textbox(label="URL", placeholder="Enter a webpage URL..."),
    outputs=gr.Textbox(label="Clean Text Content", lines=20),
    title="Webpage Text Extractor",
    description="Enter a URL to fetch the clean text content from the web page, stripped of all HTML, scripts, and styles.",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme"
)

if __name__ == "__main__":
    demo.launch()