Files changed (1) hide show
  1. app.py +31 -20
app.py CHANGED
@@ -1,42 +1,53 @@
1
  import streamlit as st
2
- from extract import take_webdata
3
- from PIL import Image
4
- from io import BytesIO
5
 
6
def main():
    """Render the app page: a URL input and a button that runs the visualizer."""
    # Fix: the page title was misspelled "Exctractor".
    st.title("Website Content Extractor")

    # Get website URL from user input
    url = st.text_input("Enter a URL:", "")

    if st.button("Proceed"):
        if not url:
            st.warning("URL is empty.")
        else:
            visualize(url)
16
 
17
 
18
def visualize(url):
    """Fetch a rendered preview of *url* via take_webdata and display it.

    Shows the page title text and a preview image; any failure is reported
    inline with st.error rather than raised to the caller.
    """
    try:
        # Fetch and display the website content.
        # NOTE(review): indentation was lost in extraction — assumes only the
        # fetch happens inside the spinner; confirm against the original file.
        with st.spinner("loading website data ..."):
            # innerHTML = get_innerHTML(url)
            html_image, html_content = take_webdata(url)
        st.subheader("Website title:")
        if html_content:
            st.info(html_content)
        else:
            st.error("Error: empty html content")
        st.subheader("Website preview:")
        if html_image:
            st.image(html_image)
        else:
            st.error("Error: empty html preview")
    except Exception as e:
        # UI boundary: surface the error to the user instead of crashing.
        st.error(f"Error: {e}")
38
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
# Run the Streamlit app when executed as a script (not when imported).
if __name__ == "__main__":
    main()
 
1
  import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
 
4
 
5
def main():
    """Render the app page: a URL input plus a button that triggers extraction."""
    st.title("Website Content Extractor")

    # Ask the visitor which page they want scraped.
    target = st.text_input("Enter a URL:", "")

    # Nothing to do until the button is pressed.
    if not st.button("Proceed"):
        return

    if target:
        extract_text(target)
    else:
        st.warning("URL is empty.")
16
 
17
 
18
def extract_text(url):
    """Fetch *url*, pull its visible text, and render it on the page.

    Any unexpected failure is reported inline via st.error rather than raised.
    """
    try:
        # Show a spinner only while the network fetch is in flight.
        with st.spinner("Loading website data..."):
            page_text = get_website_text(url)

        st.subheader("Website Content:")
        if not page_text:
            st.error("Error: Could not extract content.")
        else:
            st.write(page_text)
    except Exception as exc:
        # UI boundary handler: surface the problem instead of crashing the app.
        st.error(f"Error: {exc}")
32
 
33
 
34
def get_website_text(url):
    """Download *url* and return its visible text, or None on request failure.

    Returns:
        The page's text fragments joined with newlines, or None when the
        HTTP request fails (the error is reported inline via st.error).
    """
    try:
        # Send GET request to the URL. A timeout is essential: requests has
        # no default timeout, so an unresponsive host would hang the app.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Will raise an exception for bad responses (4xx, 5xx)

        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # stripped_strings yields every text fragment with surrounding
        # whitespace removed; join them into one displayable string.
        return '\n'.join(soup.stripped_strings)

    except requests.exceptions.RequestException as e:
        # Timeout and HTTPError are RequestException subclasses, so this
        # covers connection errors, timeouts, and bad status codes alike.
        st.error(f"Error fetching URL: {e}")
        return None
50
+
51
 
52
# Run the Streamlit app when executed as a script (not when imported).
if __name__ == "__main__":
    main()