Spaces:

CultriX
/

RAG-Scraper

Running

File size: 1,947 Bytes

1151f26

import re
from typing import Optional
from urllib.parse import urljoin

import html2text
from bs4 import BeautifulSoup, Tag


class Converter:
    """Convert HTML to Markdown, rewriting media tags as Markdown image links.

    All methods are static; the class only serves as a namespace.
    """

    @staticmethod
    def html_to_markdown(
        html: str,
        base_url: str,
        parser_features="html.parser",
        **conversion_options,
    ) -> str:
        """Convert an HTML document to Markdown.

        Args:
            html: Raw HTML markup.
            base_url: URL of the page the markup came from; relative media
                URLs are resolved against it.
            parser_features: Parser selection passed to ``BeautifulSoup``.
            **conversion_options: Attributes to set on the
                ``html2text.HTML2Text`` converter before conversion.

        Returns:
            The Markdown text produced by html2text.
        """
        soup = BeautifulSoup(html, parser_features)
        cleaned_soup = Converter.replace_media_with_markdown(soup, base_url)
        return Converter.convert_html_to_markdown(
            str(cleaned_soup), **conversion_options
        )

    @staticmethod
    def replace_media_with_markdown(
        soup: BeautifulSoup, base_url: str
    ) -> BeautifulSoup:
        """Replace ``<img>`` and ``<video>`` tags with Markdown image syntax.

        Each tag becomes the text ``![<Type>: <alt or title>](<absolute url>)``.
        Tags whose URL is an inline ``data:`` URI are removed entirely so the
        encoded payload never reaches the Markdown output.

        Args:
            soup: Parsed document; it is modified in place.
            base_url: URL used to resolve relative media URLs.

        Returns:
            The same ``soup`` object, after modification.
        """

        def find_media_url(tag: Tag) -> str:
            """Return the tag's media URL, or ``""`` when it has none."""
            src = (tag.get("src") or "").strip()
            if src or tag.name != "video":
                return src
            # <video> frequently carries its URL on nested <source> children
            # instead of its own ``src`` attribute; use the first one found.
            for source in tag.find_all("source"):
                candidate = (source.get("src") or "").strip()
                if candidate:
                    return candidate
            return ""

        def process_media_tag(tag: Tag, media_type: str) -> Optional[str]:
            """Build the Markdown for ``tag``; return None if it was removed."""
            media_url = find_media_url(tag)
            # URI schemes are case-insensitive, and data URIs may carry any
            # media type or parameters (e.g. "video/mp4", ";charset=utf-8"),
            # so a plain prefix check is more reliable than a strict pattern.
            if media_url.lower().startswith("data:"):
                tag.decompose()
                return None
            if not media_url.startswith(("http://", "https://")):
                # Do not strip leading slashes: urljoin needs them to resolve
                # root-relative ("/a.png") and protocol-relative
                # ("//host/a.png") references correctly. An empty URL
                # resolves to base_url itself.
                media_url = urljoin(base_url, media_url)
            description = tag.get("alt", "") or tag.get("title", "")
            alt_text = f"{media_type}: {description}".strip()
            return f"![{alt_text}]({media_url})"

        # Images are handled before videos, each from a fresh find_all()
        # snapshot, so replacing tags does not disturb the iteration.
        for tag_name, media_type in (("img", "Image"), ("video", "Video")):
            for tag in soup.find_all(tag_name):
                markdown = process_media_tag(tag, media_type)
                if markdown:
                    tag.replace_with(markdown)

        return soup

    @staticmethod
    def convert_html_to_markdown(cleaned_html: str, **options) -> str:
        """Run html2text over ``cleaned_html`` and return the Markdown.

        Args:
            cleaned_html: HTML markup to convert.
            **options: Attribute names and values assigned onto the
                ``html2text.HTML2Text`` instance before conversion. Names are
                not validated; an unknown name is simply set as a new
                attribute on the converter.

        Returns:
            The Markdown text returned by ``HTML2Text.handle``.
        """
        converter = html2text.HTML2Text()
        for key, value in options.items():
            setattr(converter, key, value)
        return converter.handle(cleaned_html)