Spaces:
Running
Running
from enum import Enum, auto | |
from typing import Set | |
from urllib.parse import urljoin, urlparse | |
import requests | |
from bs4 import BeautifulSoup | |
class LinkType(Enum): | |
ALL = auto() | |
INTERNAL = auto() | |
EXTERNAL = auto() | |
class LinkExtractor: | |
def scrape_url( | |
url: str, link_type: LinkType = LinkType.ALL, **kwargs | |
) -> Set[str]: | |
""" | |
Scrape a given URL for unique links within a specified element, with an option to choose between internal, external, or all links. | |
Converts relative URLs to absolute URLs. | |
:param url: The URL of the website to scrape. | |
:param link_type: The type of links to scrape (LinkType.ALL, LinkType.INTERNAL, LinkType.EXTERNAL). | |
:param kwargs: Keyword arguments to specify element id and element type. | |
:return: A set of unique link URLs found within the specified element. | |
""" | |
element_id = kwargs.get("element_id") | |
element_type = kwargs.get("element_type", "nav") | |
base_url = "{uri.scheme}://{uri.netloc}".format(uri=urlparse(url)) | |
try: | |
response = requests.get(url) | |
response.raise_for_status() | |
soup = BeautifulSoup(response.text, "html.parser") | |
if element_id: | |
fetched_element = soup.find_all(element_type, id=element_id) | |
else: | |
fetched_element = soup.find_all(element_type) | |
links = set() | |
# Iterate over all found elements and extract links | |
for element in fetched_element: | |
for a_tag in element.find_all("a", href=True): | |
href = a_tag["href"] | |
absolute_url = urljoin(url, href) | |
domain = urlparse(absolute_url).netloc | |
if ( | |
link_type == LinkType.INTERNAL | |
and domain == urlparse(base_url).netloc | |
): | |
links.add(absolute_url) | |
elif ( | |
link_type == LinkType.EXTERNAL | |
and domain != urlparse(base_url).netloc | |
): | |
links.add(absolute_url) | |
elif link_type == LinkType.ALL: | |
links.add(absolute_url) | |
return links | |
except requests.RequestException as e: | |
print(f"Request failed for {url}: {e}") | |
return set() | |
except Exception as e: | |
print(f"An error occurred: {e}") | |
return set() | |