# rag_scraper/link_extractor.py
from enum import Enum, auto
from typing import Optional, Set
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup


class LinkType(Enum):
    ALL = auto()
    INTERNAL = auto()
    EXTERNAL = auto()


class LinkExtractor:
    @staticmethod
    def scrape_url(
        url: str,
        link_type: LinkType = LinkType.ALL,
        depth: int = 0,
        visited_urls: Optional[Set[str]] = None,
    ) -> Set[str]:
        """
        Scrape a given URL for unique links, optionally following the links
        it finds to a given recursion depth. See the usage sketch at the end
        of this file for a typical call.

        :param url: The URL of the page to scrape.
        :param link_type: The type of links to collect (ALL, INTERNAL, EXTERNAL).
        :param depth: How many levels of links to follow recursively
            (0 scrapes only the given page).
        :param visited_urls: A set tracking already-visited URLs across
            recursive calls; callers normally leave this as None.
        :return: A set of unique absolute link URLs found.
        """
        if visited_urls is None:
            visited_urls = set()
        # Skip pages already seen and stop once the depth budget is exhausted.
        if url in visited_urls or depth < 0:
            return set()
        visited_urls.add(url)

        base_domain = urlparse(url).netloc
        extracted_links = set()
        try:
            response = requests.get(url, timeout=5)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            for a_tag in soup.find_all("a", href=True):
                # Resolve relative hrefs against the current page URL.
                absolute_url = urljoin(url, a_tag["href"])
                domain = urlparse(absolute_url).netloc
                if link_type == LinkType.INTERNAL and domain == base_domain:
                    extracted_links.add(absolute_url)
                elif link_type == LinkType.EXTERNAL and domain != base_domain:
                    extracted_links.add(absolute_url)
                elif link_type == LinkType.ALL:
                    extracted_links.add(absolute_url)
            # Recurse into the collected links while depth remains.
            if depth > 0:
                for link in extracted_links.copy():
                    extracted_links.update(
                        LinkExtractor.scrape_url(
                            link, link_type, depth - 1, visited_urls
                        )
                    )
            return extracted_links
        except requests.RequestException as e:
            print(f"Request failed for {url}: {e}")
            return set()
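

# A minimal usage sketch, not part of the original module: scrape one level
# deep, keeping only links on the same domain. "https://example.com" is a
# placeholder URL assumed for illustration.
if __name__ == "__main__":
    links = LinkExtractor.scrape_url(
        "https://example.com",
        link_type=LinkType.INTERNAL,
        depth=1,
    )
    for link in sorted(links):
        print(link)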