Spaces:
Running
on
Zero
Running
on
Zero
File size: 7,056 Bytes
6ea41ec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
"""Content formatting and rendering utilities for the Markit application."""
import markdown
import json
import base64
import html
import logging
from src.core.logging_config import get_logger
logger = get_logger(__name__)
def format_markdown_content(content):
    """Render markdown text as HTML.

    Falsy input (``None``, ``""``, ...) is handed back unchanged. Anything
    else is coerced with ``str()`` and passed to ``markdown.markdown`` with
    the ``tables`` extension enabled.
    """
    if content:
        return markdown.markdown(str(content), extensions=['tables'])
    # Nothing to render: return the caller's value as-is.
    return content
def _js_string_literal(text):
    """Return *text* as a double-quoted JavaScript string literal that is safe inside <script>."""
    # json.dumps escapes backslashes, quotes and control characters; a raw
    # carriage return inside a JS string literal would be a syntax error.
    literal = json.dumps(text, ensure_ascii=False)
    # U+2028/U+2029 are line terminators for JavaScript engines older than
    # ES2019, so emit them as escapes as well.
    literal = literal.replace("\u2028", "\\u2028").replace("\u2029", "\\u2029")
    # A verbatim "</script>" would close the surrounding script element early;
    # "<\/" is the same two characters to JavaScript but inert to the HTML parser.
    return literal.replace("</", "<\\/")


def render_latex_to_html(latex_content):
    """Build a standalone HTML page that renders LaTeX with Mathpix Markdown.

    The processing mirrors the GOT-OCR demo:

    * surrounding whitespace and a trailing ``<|im_end|>`` marker are removed;
    * when the number of ``\\left`` and ``\\right`` occurrences differs, the
      sized delimiters are replaced by their plain counterparts;
    * ``"`` becomes two backticks and every ``$`` is dropped;
    * each line is embedded as a JavaScript string literal, and the literals
      are concatenated with ``+`` into the ``text`` constant of the page.

    The returned page loads the ``mathpix-markdown-it`` 1.3.6 bundle from
    jsDelivr and, once it is loaded, replaces ``#content-text`` with the
    output of ``window.render(text, options)``.

    Args:
        latex_content: LaTeX source as a string.

    Returns:
        The complete HTML document as a string.

    Raises:
        AttributeError: if ``latex_content`` has no ``strip`` method
            (for example ``None``).
    """
    content = latex_content.strip()

    end_marker = "<|im_end|>"
    if content.endswith(end_marker):
        content = content[:-len(end_marker)]

    # NOTE: these are substring counts, so commands such as \leftarrow or
    # \rightarrow are counted too. Kept as-is to match the GOT-OCR demo.
    if content.count("\\right") != content.count("\\left"):
        delimiter_pairs = (
            ("\\left(", "("),
            ("\\right)", ")"),
            ("\\left[", "["),
            ("\\right]", "]"),
            ("\\left{", "{"),
            ("\\right}", "}"),
            ("\\left|", "|"),
            ("\\right|", "|"),
            ("\\left.", "."),
            ("\\right.", "."),
        )
        for sized, plain in delimiter_pairs:
            content = content.replace(sized, plain)

    # Same clean-up as the GOT-OCR demo: quotes become backticks, $ is removed.
    content = content.replace('"', "``").replace("$", "")

    # One JS string literal per source line, each ending in a newline,
    # joined with "+" so the page reassembles the full text.
    js_text = " + ".join(
        _js_string_literal(line + "\n") for line in content.split("\n")
    )

    return f"""<!DOCTYPE html>
<html lang="en" data-lt-installed="true">
<head>
    <meta charset="UTF-8">
    <title>LaTeX Content</title>
    <script>
        const text = {js_text};
    </script>
    <style>
        #content {{
            max-width: 800px;
            margin: auto;
            padding: 20px;
        }}
        body {{
            font-family: 'Times New Roman', serif;
            line-height: 1.6;
            background-color: #ffffff;
            color: #333;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
        }}
        td, th {{
            border: 1px solid #333;
            padding: 8px 12px;
            text-align: center;
            vertical-align: middle;
        }}
    </style>
    <script>
        let script = document.createElement('script');
        script.src = "https://cdn.jsdelivr.net/npm/mathpix-markdown-it@1.3.6/es5/bundle.js";
        document.head.append(script);
        script.onload = function() {{
            const isLoaded = window.loadMathJax();
            if (isLoaded) {{
                console.log('Styles loaded!')
            }}
            const el = window.document.getElementById('content-text');
            if (el) {{
                const options = {{
                    htmlTags: true
                }};
                const html = window.render(text, options);
                el.outerHTML = html;
            }}
        }};
    </script>
</head>
<body>
    <div id="content">
        <div id="content-text"></div>
    </div>
</body>
</html>"""
def format_latex_content(content):
    """Wrap LaTeX content in an HTML panel for display in the UI.

    The content is rendered to a standalone page by ``render_latex_to_html``,
    base64-encoded into a ``data:`` URI and shown in an iframe. A collapsible
    section below it shows the raw LaTeX source. If rendering or encoding
    raises, the error is logged and a fallback panel with the raw source and
    the error message is returned instead.

    All user-supplied text inserted directly into the panel (the raw source
    and the error message) is HTML-escaped, so characters such as ``<`` and
    ``&`` are displayed literally rather than interpreted as markup.

    Args:
        content: LaTeX source. Falsy values are returned unchanged.

    Returns:
        An HTML fragment as a string, or ``content`` itself when it is falsy.
    """
    if not content:
        return content

    # NOTE(review): the leading glyphs in the panel headers ("π", "π‘",
    # "β οΈ") look like mis-decoded emoji - confirm against the file's
    # intended encoding before changing them.
    try:
        # Only the steps that can actually fail live inside the try block.
        rendered_html = render_latex_to_html(content)
        encoded_html = base64.b64encode(rendered_html.encode("utf-8")).decode("utf-8")
    except Exception as e:
        # Deliberate catch-all: this is a display boundary, and showing the
        # raw source is preferable to failing the whole view.
        logger.error(f"Error rendering LaTeX content: {e}")
        escaped_content = html.escape(str(content))
        escaped_error = html.escape(str(e))
        return f"""
        <div style="background-color: #f8f9fa; border-radius: 8px; border: 1px solid #e9ecef; margin: 10px 0;">
            <div style="background-color: #e9ecef; padding: 10px; border-radius: 8px 8px 0 0; font-weight: bold; color: #495057;">
                π LaTeX Content (Fallback View)
            </div>
            <div style="padding: 15px;">
                <pre style="background-color: transparent; margin: 0; padding: 0;
                font-family: 'Courier New', monospace; font-size: 14px; line-height: 1.4;
                white-space: pre-wrap; word-wrap: break-word; color: #2c3e50;">
{escaped_content}
</pre>
            </div>
            <div style="background-color: #e9ecef; padding: 8px 15px; border-radius: 0 0 8px 8px; font-size: 12px; color: #6c757d;">
                β οΈ Rendering failed, showing raw LaTeX. Error: {escaped_error}
            </div>
        </div>
        """

    # Base64 keeps the rendered page intact inside the iframe's data: URI.
    iframe_src = f"data:text/html;base64,{encoded_html}"
    # Escape the raw source so "<", ">" and "&" show up literally in the <pre>.
    raw_source = html.escape(str(content))

    return f"""
        <div style="background-color: #f8f9fa; border-radius: 8px; border: 1px solid #e9ecef; margin: 10px 0;">
            <div style="background-color: #e9ecef; padding: 10px; border-radius: 8px 8px 0 0; font-weight: bold; color: #495057;">
                π LaTeX Content (Rendered with MathJax)
            </div>
            <div style="padding: 0;">
                <iframe src="{iframe_src}" width="100%" height="500px" style="border: none; border-radius: 0 0 8px 8px;"></iframe>
            </div>
            <div style="background-color: #e9ecef; padding: 8px 15px; border-radius: 0; font-size: 12px; color: #6c757d; border-top: 1px solid #dee2e6;">
                π‘ LaTeX content rendered with MathJax. Tables and formulas are displayed as they would appear in a LaTeX document.
            </div>
            <details style="margin: 0; border-top: 1px solid #dee2e6;">
                <summary style="padding: 8px 15px; background-color: #e9ecef; cursor: pointer; font-size: 12px; color: #6c757d;">
                    π View Raw LaTeX Source
                </summary>
                <div style="padding: 15px; background-color: #f8f9fa;">
                    <pre style="background-color: transparent; margin: 0; padding: 0;
                    font-family: 'Courier New', monospace; font-size: 12px; line-height: 1.4;
                    white-space: pre-wrap; word-wrap: break-word; color: #2c3e50; max-height: 200px; overflow-y: auto;">
{raw_source}
</pre>
                </div>
            </details>
        </div>
        """