Files changed (1) hide show
  1. app.py +31 -20
app.py CHANGED
@@ -1,42 +1,53 @@
1
  import streamlit as st
2
- from extract import take_webdata
3
- from PIL import Image
4
- from io import BytesIO
5
 
6
def main():
    """Render the app page: a URL input and a button that runs the visualizer."""
    # Fix: the page title was misspelled "Exctractor".
    st.title("Website Content Extractor")

    # Get website URL from user input
    url = st.text_input("Enter a URL:", "")

    if st.button("Proceed"):
        if not url:
            st.warning("URL is empty.")
        else:
            visualize(url)
16
 
17
 
18
def visualize(url):
    """Fetch a rendered preview of *url* via take_webdata and display it.

    Shows the page title text and a preview image; any failure is reported
    inline with st.error rather than raised to the caller.
    """
    try:
        # Fetch and display the website content.
        # NOTE(review): indentation was lost in extraction — assumes only the
        # fetch happens inside the spinner; confirm against the original file.
        with st.spinner("loading website data ..."):
            # innerHTML = get_innerHTML(url)
            html_image, html_content = take_webdata(url)
        st.subheader("Website title:")
        if html_content:
            st.info(html_content)
        else:
            st.error("Error: empty html content")
        st.subheader("Website preview:")
        if html_image:
            st.image(html_image)
        else:
            st.error("Error: empty html preview")
    except Exception as e:
        # UI boundary: surface the error to the user instead of crashing.
        st.error(f"Error: {e}")
38
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
# Run the Streamlit app when executed as a script (not when imported).
if __name__ == "__main__":
    main()
 
1
  import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
 
4
 
5
def main():
    """Render the app page: a URL input plus a button that triggers extraction."""
    st.title("Website Content Extractor")

    # Ask the visitor which page they want scraped.
    target = st.text_input("Enter a URL:", "")

    # Nothing to do until the button is pressed.
    if not st.button("Proceed"):
        return

    if target:
        extract_text(target)
    else:
        st.warning("URL is empty.")
16
 
17
 
18
def extract_text(url):
    """Fetch *url*, pull its visible text, and render it on the page.

    Any unexpected failure is reported inline via st.error rather than raised.
    """
    try:
        # Show a spinner only while the network fetch is in flight.
        with st.spinner("Loading website data..."):
            page_text = get_website_text(url)

        st.subheader("Website Content:")
        if not page_text:
            st.error("Error: Could not extract content.")
        else:
            st.write(page_text)
    except Exception as exc:
        # UI boundary handler: surface the problem instead of crashing the app.
        st.error(f"Error: {exc}")
32
 
33
 
34
def get_website_text(url):
    """Download *url* and return its visible text, or None on request failure.

    Returns:
        The page's text fragments joined with newlines, or None when the
        HTTP request fails (the error is reported inline via st.error).
    """
    try:
        # Send GET request to the URL. A timeout is essential: requests has
        # no default timeout, so an unresponsive host would hang the app.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Will raise an exception for bad responses (4xx, 5xx)

        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # stripped_strings yields every text fragment with surrounding
        # whitespace removed; join them into one displayable string.
        return '\n'.join(soup.stripped_strings)

    except requests.exceptions.RequestException as e:
        # Timeout and HTTPError are RequestException subclasses, so this
        # covers connection errors, timeouts, and bad status codes alike.
        st.error(f"Error fetching URL: {e}")
        return None
50
+
51
 
52
# Run the Streamlit app when executed as a script (not when imported).
if __name__ == "__main__":
    main()