File size: 1,523 Bytes
b6e91ad
9f38b3a
 
b6e91ad
 
9f38b3a
b6e91ad
 
 
9f38b3a
b6e91ad
 
 
 
9f38b3a
b6e91ad
 
9f38b3a
b6e91ad
9f38b3a
 
 
 
 
984bb90
9f38b3a
b6e91ad
9f38b3a
b6e91ad
 
 
 
 
9f38b3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6e91ad
 
9f38b3a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import streamlit as st
import requests
from bs4 import BeautifulSoup

def main():
    """Render the app UI: title, URL input box, and a button that triggers extraction."""
    st.title("Website Content Extractor")

    # Prompt the user for the target URL
    url = st.text_input("Enter a URL:", "")

    if not st.button("Proceed"):
        return
    # Blank input gets a warning instead of a request
    if url:
        extract_text(url)
    else:
        st.warning("URL is empty.")
  

def extract_text(url):
    """Fetch *url*, extract its visible text, and render the result in the UI."""
    try:
        # Show a spinner while the page is downloaded and parsed
        with st.spinner("Loading website data..."):
            content = get_website_text(url)

            st.subheader("Website Content:")
            # get_website_text returns None (falsy) when the fetch fails
            if not content:
                st.error("Error: Could not extract content.")
            else:
                st.write(content)

    except Exception as e:
        # Top-level UI boundary: surface any unexpected error to the user
        st.error(f"Error: {e}")


def get_website_text(url):
    """Download *url* and return its visible text, or None on a fetch error.

    Args:
        url: The address to fetch via HTTP GET.

    Returns:
        str | None: newline-joined text fragments of the page, or None if
        the HTTP request failed (an error message is shown in the UI).
    """
    try:
        # Bounded timeout so the app cannot hang forever on an unresponsive
        # host (the original call had no timeout at all).
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Will raise an exception for bad responses (4xx, 5xx)

        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # stripped_strings yields every text node with surrounding
        # whitespace removed; join them one fragment per line.
        texts = soup.stripped_strings
        return '\n'.join(texts)

    except requests.exceptions.RequestException as e:
        # Timeouts raise requests.Timeout, a RequestException subclass,
        # so they are reported here rather than crashing the app.
        st.error(f"Error fetching URL: {e}")
        return None


# Run the Streamlit app when this file is executed directly.
if __name__ == "__main__":
    main()