# RAG-Scraper: rag_scraper/link_extractor.py
from enum import Enum, auto
from typing import Set
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup


class LinkType(Enum):
    ALL = auto()
    INTERNAL = auto()
    EXTERNAL = auto()


class LinkExtractor:
    @staticmethod
    def scrape_url(
        url: str, link_type: LinkType = LinkType.ALL, **kwargs
    ) -> Set[str]:
        """
        Scrape a URL for unique links within a specified element, choosing
        between internal, external, or all links. Relative URLs are
        converted to absolute URLs.

        :param url: The URL of the website to scrape.
        :param link_type: The type of links to scrape (LinkType.ALL,
            LinkType.INTERNAL, LinkType.EXTERNAL).
        :param kwargs: Keyword arguments specifying the element id
            (``element_id``) and element type (``element_type``,
            default ``"nav"``).
        :return: A set of unique link URLs found within the specified element.
        """
        element_id = kwargs.get("element_id")
        element_type = kwargs.get("element_type", "nav")
        # The page's domain is used to classify links as internal or external.
        base_domain = urlparse(url).netloc
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            # Restrict the search to the requested element(s), e.g. a nav bar.
            if element_id:
                fetched_elements = soup.find_all(element_type, id=element_id)
            else:
                fetched_elements = soup.find_all(element_type)
            links = set()
            # Iterate over all found elements and extract their anchor tags.
            for element in fetched_elements:
                for a_tag in element.find_all("a", href=True):
                    href = a_tag["href"]
                    # Resolve relative hrefs against the page URL.
                    absolute_url = urljoin(url, href)
                    domain = urlparse(absolute_url).netloc
                    if link_type == LinkType.INTERNAL and domain == base_domain:
                        links.add(absolute_url)
                    elif link_type == LinkType.EXTERNAL and domain != base_domain:
                        links.add(absolute_url)
                    elif link_type == LinkType.ALL:
                        links.add(absolute_url)
            return links
        except requests.RequestException as e:
            print(f"Request failed for {url}: {e}")
            return set()
        except Exception as e:
            print(f"An error occurred: {e}")
            return set()
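

# A minimal usage sketch (not part of the original module): scrape a page's
# navigation element and print its internal links. The URL below is a
# placeholder, and network access plus the requests/bs4 dependencies are
# assumed to be available.
if __name__ == "__main__":
    internal_links = LinkExtractor.scrape_url(
        "https://example.com",
        link_type=LinkType.INTERNAL,
        element_type="nav",
    )
    for link in sorted(internal_links):
        print(link)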