# RAG-Scraper / rag_scraper/converter.py
# CultriX's picture
# First Commit
# 1151f26
import re
from typing import Optional
from urllib.parse import urljoin
import html2text
from bs4 import BeautifulSoup, Tag
class Converter:
    """Convert HTML to Markdown, rewriting ``<img>``/``<video>`` tags first.

    Media tags are replaced in the parsed tree with Markdown image syntax
    (``![label](absolute-url)``) before the document is handed to
    ``html2text``. Every method is a static method; the class only serves as
    a namespace.
    """

    # Any ``data:`` URI, whatever its MIME type or parameters. Inline payloads
    # are dropped rather than copied into the Markdown output.
    _DATA_URI_PATTERN = re.compile(r"data:", re.IGNORECASE)

    @staticmethod
    def html_to_markdown(
        html: str,
        base_url: str,
        parser_features="html.parser",
        **conversion_options,
    ) -> str:
        """Convert an HTML string to Markdown.

        Args:
            html: The HTML markup to convert.
            base_url: URL that relative media references are resolved against.
            parser_features: Passed to ``BeautifulSoup`` as its parser
                selection.
            **conversion_options: Forwarded to ``convert_html_to_markdown``,
                which sets each one as an attribute on the ``HTML2Text``
                instance.

        Returns:
            The Markdown produced by ``html2text``.
        """
        soup = BeautifulSoup(html, parser_features)
        cleaned_soup = Converter.replace_media_with_markdown(soup, base_url)
        return Converter.convert_html_to_markdown(
            str(cleaned_soup), **conversion_options
        )

    @staticmethod
    def _resolve_media_url(base_url: str, media_url: str) -> Optional[str]:
        """Return the absolute form of *media_url*, or ``None`` to drop it.

        ``None`` is returned for empty/blank references and for ``data:``
        URIs. Everything else goes through ``urljoin`` unmodified, so
        root-relative (``/img/a.png``) and protocol-relative
        (``//cdn.example.com/a.png``) references resolve against the host
        rather than the page's directory, and absolute URLs pass through
        unchanged.
        """
        media_url = media_url.strip()
        if not media_url or Converter._DATA_URI_PATTERN.match(media_url):
            return None
        return urljoin(base_url, media_url)

    @staticmethod
    def _media_tag_to_markdown(
        tag: "Tag", media_type: str, base_url: str
    ) -> Optional[str]:
        """Build the Markdown image string for *tag*, or ``None`` if it has no usable URL."""
        raw_url = (tag.get("src") or "").strip()
        if not raw_url:
            # A <video> often has no ``src`` of its own and lists its files in
            # nested <source> children; use the first one that has a URL.
            for source in tag.find_all("source"):
                candidate = (source.get("src") or "").strip()
                if candidate:
                    raw_url = candidate
                    break

        media_url = Converter._resolve_media_url(base_url, raw_url)
        if media_url is None:
            return None

        label = tag.get("alt", "") or tag.get("title", "")
        alt_text = f"{media_type}: {label}".strip()
        return f"![{alt_text}]({media_url})"

    @staticmethod
    def replace_media_with_markdown(
        soup: "BeautifulSoup", base_url: str
    ) -> "BeautifulSoup":
        """Replace ``<img>`` and ``<video>`` tags in *soup* with Markdown text.

        The tree is modified in place and also returned. Each tag becomes
        ``![<Image|Video>: <alt or title>](<absolute url>)``. Tags whose
        source is a ``data:`` URI, or that have no source URL at all, are
        removed from the tree instead.

        Args:
            soup: Parsed document to rewrite.
            base_url: URL that relative ``src`` values are resolved against.

        Returns:
            The same ``soup`` object, after modification.
        """
        # Images are handled before videos, matching the original pass order.
        for tag_name, media_type in (("img", "Image"), ("video", "Video")):
            for tag in soup.find_all(tag_name):
                markdown = Converter._media_tag_to_markdown(
                    tag, media_type, base_url
                )
                if markdown is None:
                    tag.decompose()
                else:
                    tag.replace_with(markdown)
        return soup

    @staticmethod
    def convert_html_to_markdown(cleaned_html: str, **options) -> str:
        """Run ``html2text`` over *cleaned_html* and return the Markdown.

        Args:
            cleaned_html: HTML markup to convert.
            **options: Each key/value pair is set as an attribute on the
                ``html2text.HTML2Text`` instance before conversion. Names are
                not validated here, so a misspelled option is set as a new
                attribute without any error.

        Returns:
            The converted Markdown text.
        """
        converter = html2text.HTML2Text()
        for key, value in options.items():
            setattr(converter, key, value)
        return converter.handle(cleaned_html)