File size: 1,523 Bytes
b6e91ad
9f38b3a
 
b6e91ad
 
9f38b3a
b6e91ad
 
 
9f38b3a
b6e91ad
 
 
 
9f38b3a
b6e91ad
 
9f38b3a
b6e91ad
9f38b3a
 
 
 
 
984bb90
9f38b3a
b6e91ad
9f38b3a
b6e91ad
 
 
 
 
9f38b3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6e91ad
 
9f38b3a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import streamlit as st
import requests
from bs4 import BeautifulSoup

def main():
    """Render the app UI: title, URL input box, and a button that triggers extraction."""
    st.title("Website Content Extractor")

    # Prompt the user for the target URL
    url = st.text_input("Enter a URL:", "")

    if not st.button("Proceed"):
        return
    # Blank input gets a warning instead of a request
    if url:
        extract_text(url)
    else:
        st.warning("URL is empty.")
  

def extract_text(url):
    """Fetch *url*, extract its visible text, and render the result in the UI."""
    try:
        # Show a spinner while the page is downloaded and parsed
        with st.spinner("Loading website data..."):
            content = get_website_text(url)

            st.subheader("Website Content:")
            # get_website_text returns None (falsy) when the fetch fails
            if not content:
                st.error("Error: Could not extract content.")
            else:
                st.write(content)

    except Exception as e:
        # Top-level UI boundary: surface any unexpected error to the user
        st.error(f"Error: {e}")


def get_website_text(url):
    """Download *url* and return its visible text, or None on a fetch error.

    Args:
        url: The address to fetch via HTTP GET.

    Returns:
        str | None: newline-joined text fragments of the page, or None if
        the HTTP request failed (an error message is shown in the UI).
    """
    try:
        # Bounded timeout so the app cannot hang forever on an unresponsive
        # host (the original call had no timeout at all).
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Will raise an exception for bad responses (4xx, 5xx)

        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # stripped_strings yields every text node with surrounding
        # whitespace removed; join them one fragment per line.
        texts = soup.stripped_strings
        return '\n'.join(texts)

    except requests.exceptions.RequestException as e:
        # Timeouts raise requests.Timeout, a RequestException subclass,
        # so they are reported here rather than crashing the app.
        st.error(f"Error fetching URL: {e}")
        return None


# Run the Streamlit app when this file is executed directly.
if __name__ == "__main__":
    main()