"""Content formatting and rendering utilities for the Markit application."""

import base64
import html
import json
import logging

import markdown

from src.core.logging_config import get_logger

logger = get_logger(__name__)

# End-of-message marker that may trail the LaTeX text; it is dropped before
# any further processing.
_IM_END_TOKEN = "<|im_end|>"

# (sized delimiter, plain delimiter) pairs applied, in this order, when the
# number of "\left" and "\right" occurrences differs.
_UNBALANCED_DELIMITER_FIXES = (
    ("\\left(", "("),
    ("\\right)", ")"),
    ("\\left[", "["),
    ("\\right]", "]"),
    ("\\left{", "{"),
    ("\\right}", "}"),
    ("\\left|", "|"),
    ("\\right|", "|"),
    ("\\left.", "."),
    ("\\right.", "."),
)


def format_markdown_content(content):
    """Convert markdown content to HTML.

    Args:
        content: Markdown source. Any truthy value is passed through ``str()``
            before conversion.

    Returns:
        The HTML produced by ``markdown.markdown`` with the ``tables``
        extension enabled, or ``content`` itself, unchanged, when it is falsy
        (``None``, ``""``, ...).
    """
    if not content:
        return content

    return markdown.markdown(str(content), extensions=["tables"])


def render_latex_to_html(latex_content):
    """Convert LaTeX content to HTML using Mathpix Markdown like GOT-OCR demo.

    Args:
        latex_content: LaTeX source as a string. It must support ``.strip()``;
            ``None`` is not handled here (callers guard against empty input).

    Returns:
        The HTML document string.
    """
    content = latex_content.strip()
    if content.endswith(_IM_END_TOKEN):
        content = content[: -len(_IM_END_TOKEN)]

    # Unbalanced \left / \right pairs are downgraded to plain delimiters.
    # These are plain substring counts, so commands such as "\rightarrow" and
    # "\leftarrow" are counted as well.
    if content.count("\\right") != content.count("\\left"):
        for sized, plain in _UNBALANCED_DELIMITER_FIXES:
            content = content.replace(sized, plain)

    # Double quotes become `` so they cannot terminate the JavaScript string
    # literals built below; "$" math delimiters are removed.
    content = content.replace('"', "``").replace("$", "")

    # One JavaScript string literal per input line (backslashes doubled, a
    # literal "\n" appended), concatenated with " + ".
    js_text = " + ".join(
        '"' + line.replace("\\", "\\\\") + '\\n"' for line in content.split("\n")
    )

    # NOTE(review): js_text is not interpolated into the template below.
    # Presumably the original markup embedded it in a <script> block and that
    # markup has been lost -- confirm against version control before relying
    # on this output.
    html_content = f""" LaTeX Content
"""
    return html_content


def format_latex_content(content):
    """Format LaTeX content for display in UI using MathJax rendering like GOT-OCR demo.

    The raw LaTeX source and any error text are HTML-escaped before being
    placed in the output, so characters such as ``<``, ``>`` and ``&`` are
    shown literally instead of being interpreted as markup.

    Args:
        content: LaTeX source to display.

    Returns:
        ``content`` itself, unchanged, when it is falsy. Otherwise the
        rendered view, or a fallback view containing the escaped source and
        the error message if rendering raises.
    """
    if not content:
        return content

    try:
        rendered_html = render_latex_to_html(content)

        # Base64 data URI intended for iframe display of the rendered page.
        encoded_html = base64.b64encode(rendered_html.encode("utf-8")).decode("utf-8")
        # NOTE(review): iframe_src is not referenced by the template below.
        # Presumably the <iframe src=...> markup was lost -- confirm.
        iframe_src = f"data:text/html;base64,{encoded_html}"

        # Escape the raw source exactly as the fallback branch does.
        escaped_source = html.escape(str(content))
        formatted_content = f"""
📄 LaTeX Content (Rendered with MathJax)
💡 LaTeX content rendered with MathJax. Tables and formulas are displayed as they would appear in a LaTeX document.
📝 View Raw LaTeX Source
{escaped_source}
                    
"""
    except Exception as e:
        # Deliberate best-effort boundary: any rendering failure degrades to
        # an escaped raw view instead of propagating to the UI.
        logger.error(f"Error rendering LaTeX content: {e}")
        escaped_content = html.escape(str(content))
        escaped_error = html.escape(str(e))
        formatted_content = f"""
📄 LaTeX Content (Fallback View)
{escaped_content}
                
⚠️ Rendering failed, showing raw LaTeX. Error: {escaped_error}
"""
    return formatted_content