Spaces:
Running
Running
import re | |
from typing import Optional | |
from urllib.parse import urljoin | |
import html2text | |
from bs4 import BeautifulSoup, Tag | |
class Converter:
    """Convert HTML to Markdown, rewriting ``<img>``/``<video>`` tags first.

    The class is a pure namespace: every method is static and is called as
    ``Converter.method(...)``.
    """

    # Any ``data:`` URI, whatever its MIME type or parameters.  A stricter
    # pattern misses legitimate inline payloads such as ``data:video/mp4``
    # (digit in the subtype) or ``data:image/svg+xml;charset=utf-8,...``.
    _DATA_URI_RE = re.compile(r"\s*data:", re.IGNORECASE)

    @staticmethod
    def html_to_markdown(
        html: str,
        base_url: str,
        parser_features: str = "html.parser",
        **conversion_options,
    ) -> str:
        """Convert an HTML document to Markdown.

        Args:
            html: Raw HTML markup.
            base_url: URL used to resolve relative media sources.
            parser_features: Parser name handed to ``BeautifulSoup``.
            **conversion_options: Forwarded to
                :meth:`convert_html_to_markdown`, which sets each one as an
                attribute on the ``html2text.HTML2Text`` instance.

        Returns:
            The Markdown text produced by html2text.
        """
        soup = BeautifulSoup(html, parser_features)
        cleaned_soup = Converter.replace_media_with_markdown(soup, base_url)
        return Converter.convert_html_to_markdown(
            str(cleaned_soup), **conversion_options
        )

    @staticmethod
    def _is_data_uri(url: str) -> bool:
        """Return True when *url* uses the ``data:`` scheme (inline payload)."""
        return Converter._DATA_URI_RE.match(url) is not None

    @staticmethod
    def _resolve_media_url(media_url: str, base_url: str) -> str:
        """Turn a media ``src`` value into an absolute URL.

        Absolute ``http(s)`` URLs are returned as-is, protocol-relative URLs
        (``//host/path``) inherit the scheme of *base_url*, and everything
        else is joined onto *base_url*.
        """
        media_url = media_url.strip()
        if media_url.startswith(("http://", "https://")):
            return media_url
        if media_url.startswith("//"):
            # Stripping the slashes here would turn the host name into a
            # relative path segment, so let urljoin resolve it as-is.
            return urljoin(base_url, media_url)
        # Leading slashes are dropped so the path resolves beneath base_url
        # rather than at the host root.
        # NOTE(review): this keeps the existing behaviour; confirm that
        # root-relative paths are really meant to live under base_url.
        return urljoin(base_url, media_url.lstrip("/"))

    @staticmethod
    def _format_media_markdown(media_type: str, label: str, url: str) -> str:
        """Build the Markdown image syntax ``![<media_type>: <label>](<url>)``."""
        alt_text = f"{media_type}: {label}".strip()
        return f"![{alt_text}]({url})"

    @staticmethod
    def replace_media_with_markdown(
        soup: "BeautifulSoup", base_url: str
    ) -> "BeautifulSoup":
        """Replace ``<img>`` and ``<video>`` tags with Markdown image links.

        The soup is modified in place and also returned.

        * Tags whose source is a ``data:`` URI are removed entirely.
        * Tags without any usable source are left untouched.
        * All other tags are replaced by ``![<Type>: <alt or title>](<url>)``
          with the URL made absolute against *base_url*.

        Args:
            soup: Parsed document to rewrite.
            base_url: URL used to resolve relative media sources.

        Returns:
            The same *soup* object, after modification.
        """

        def process_media_tag(tag: "Tag", media_type: str) -> Optional[str]:
            media_url = tag.get("src") or ""
            if not media_url:
                # <video> commonly carries its URL on a nested <source>.
                nested = tag.find("source", src=True)
                if nested is not None:
                    media_url = nested.get("src") or ""
            if not media_url.strip():
                # Nothing to link to; joining "" onto base_url would yield
                # a misleading link to the page itself.
                return None
            if Converter._is_data_uri(media_url):
                tag.decompose()
                return None
            label = tag.get("alt", "") or tag.get("title", "")
            return Converter._format_media_markdown(
                media_type,
                label,
                Converter._resolve_media_url(media_url, base_url),
            )

        # Images first, then videos.
        for tag_name, media_type in (("img", "Image"), ("video", "Video")):
            for tag in soup.find_all(tag_name):
                markdown = process_media_tag(tag, media_type)
                if markdown:
                    tag.replace_with(markdown)
        return soup

    @staticmethod
    def convert_html_to_markdown(cleaned_html: str, **options) -> str:
        """Run html2text over *cleaned_html*.

        Args:
            cleaned_html: HTML markup to convert.
            **options: Each key/value pair is set as an attribute on the
                ``html2text.HTML2Text`` instance before conversion.

        Returns:
            The result of ``HTML2Text.handle``.
        """
        converter = html2text.HTML2Text()
        for key, value in options.items():
            setattr(converter, key, value)
        return converter.handle(cleaned_html)