File size: 1,753 Bytes
e7c6d66
 
ed27cf5
 
5832786
e7c6d66
dca43df
 
 
e7c6d66
 
dca43df
 
 
 
 
5832786
dca43df
 
5832786
 
dca43df
 
5832786
 
 
 
dca43df
5832786
dca43df
321422d
5832786
 
dca43df
5832786
 
dca43df
 
5832786
dca43df
5832786
 
 
e7c6d66
321422d
dca43df
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import gradio as gr
import requests
from bs4 import BeautifulSoup

def fetch_content(url: str) -> str:
    """
    Fetch a web page and return its readable text content.

    The page is downloaded with ``requests``, parsed with BeautifulSoup,
    stripped of ``<script>`` and ``<style>`` elements, and the remaining
    text is collapsed so that each non-empty chunk sits on its own line.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored.

    Returns:
        The cleaned text of the page, or a message starting with
        ``"An error occurred: "`` when no URL was supplied or the HTTP
        request failed (connection error, timeout, 4xx/5xx status, ...).
    """
    # Pasted URLs often carry stray spaces or newlines, and an empty field
    # should yield a clear message instead of a cryptic "No scheme" error.
    url = (url or "").strip()
    if not url:
        return "An error occurred: no URL was provided."

    # Step 1: Fetch the HTML content. Only the network call lives in the
    # try block, since it is the only part that raises RequestException.
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        response.raise_for_status()  # Raises an HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    # Step 2: Parse the HTML with BeautifulSoup.
    # requests decodes text/* responses without an explicit charset as
    # ISO-8859-1, which garbles UTF-8 pages. Hand BeautifulSoup the raw bytes
    # so it can detect the encoding (BOM / <meta charset>), and only forward
    # the HTTP encoding when the server actually declared one.
    content_type = response.headers.get('Content-Type', '')
    declared_encoding = response.encoding if 'charset' in content_type.lower() else None
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared_encoding)

    # Step 3: Remove script and style tags, as they don't contain readable content
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()

    # Step 4: Get the text and clean it up: trim every line, break lines on
    # double spaces, and drop the chunks that end up empty.
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    return '\n'.join(chunk for chunk in chunks if chunk)

# Build the Gradio UI: a single URL textbox wired to fetch_content, with the
# extracted text shown in a 20-line output box. Flagging is switched off.
demo = gr.Interface(
    fn=fetch_content,
    title="Webpage Text Extractor",
    description=(
        "Enter a URL to fetch the clean text content from the web page, "
        "stripped of all HTML, scripts, and styles."
    ),
    inputs=gr.Textbox(
        label="URL",
        placeholder="Enter a webpage URL...",
    ),
    outputs=gr.Textbox(
        label="Clean Text Content",
        lines=20,
    ),
    theme="Nymbo/Nymbo_Theme",
    allow_flagging="never",
)

# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()