File size: 7,056 Bytes
6ea41ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"""Content formatting and rendering utilities for the Markit application."""

import markdown
import json
import base64
import html
import logging

from src.core.logging_config import get_logger

logger = get_logger(__name__)


def format_markdown_content(content):
    """Render markdown text as an HTML string.

    Falsy input (``None``, ``""``, ...) is handed back untouched. Any other
    value is coerced with ``str`` and rendered by the ``markdown`` library
    with the ``tables`` extension enabled.
    """
    if content:
        return markdown.markdown(str(content), extensions=['tables'])
    return content


def render_latex_to_html(latex_content):
    """Build a standalone HTML page that renders LaTeX via Mathpix Markdown.

    The processing mirrors the GOT-OCR demo:

    1. Strip surrounding whitespace and a trailing ``<|im_end|>`` marker.
    2. If the number of ``\\left`` and ``\\right`` occurrences differs, drop
       the sizing commands from the common delimiter pairs so the renderer
       does not choke on an unbalanced expression.
    3. Replace double quotes with two backticks and remove ``$`` signs.
    4. Embed the text in the page as a JavaScript string expression that the
       mathpix-markdown-it bundle (loaded from jsDelivr) renders on load.

    Args:
        latex_content: LaTeX/markdown text as a ``str``.

    Returns:
        A complete HTML document as a string.
    """
    end_marker = "<|im_end|>"
    content = latex_content.strip()
    if content.endswith(end_marker):
        content = content[:-len(end_marker)]

    # NOTE: these are plain substring counts, so commands such as
    # \leftarrow / \rightarrow are counted too (same as the GOT-OCR demo).
    if content.count("\\right") != content.count("\\left"):
        delimiter_fixes = (
            ("\\left(", "("),
            ("\\right)", ")"),
            ("\\left[", "["),
            ("\\right]", "]"),
            ("\\left{", "{"),
            ("\\right}", "}"),
            ("\\left|", "|"),
            ("\\right|", "|"),
            ("\\left.", "."),
            ("\\right.", "."),
        )
        for sized, plain in delimiter_fixes:
            content = content.replace(sized, plain)

    # Process content like GOT-OCR demo: remove $ signs and replace quotes
    content = content.replace('"', "``").replace("$", "")

    # Normalise CRLF / CR line endings: a raw carriage return inside a
    # JavaScript string literal is a syntax error and would leave the page
    # blank.
    content = content.replace("\r\n", "\n").replace("\r", "\n")

    # One JS string literal per line, joined with "+", as in the GOT-OCR demo.
    # json.dumps escapes backslashes, control characters and non-ASCII
    # characters (including U+2028/U+2029) so each literal is always valid
    # JavaScript. "<" is additionally written as \u003c so the text can never
    # close the surrounding <script> element (e.g. via "</script>").
    js_text = " + ".join(
        json.dumps(line + "\n").replace("<", "\\u003c")
        for line in content.split("\n")
    )

    # Create HTML using Mathpix Markdown like GOT-OCR demo
    html_content = f"""<!DOCTYPE html>
<html lang="en" data-lt-installed="true">
<head>
    <meta charset="UTF-8">
    <title>LaTeX Content</title>
    <script>
        const text = {js_text};
    </script>
    <style>
        #content {{
            max-width: 800px;
            margin: auto;
            padding: 20px;
        }}
        body {{
            font-family: 'Times New Roman', serif;
            line-height: 1.6;
            background-color: #ffffff;
            color: #333;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
        }}
        td, th {{
            border: 1px solid #333;
            padding: 8px 12px;
            text-align: center;
            vertical-align: middle;
        }}
    </style>
    <script>
        let script = document.createElement('script');
        script.src = "https://cdn.jsdelivr.net/npm/mathpix-markdown-it@1.3.6/es5/bundle.js";
        document.head.append(script);
        script.onload = function() {{
            const isLoaded = window.loadMathJax();
            if (isLoaded) {{
                console.log('Styles loaded!')
            }}
            const el = window.document.getElementById('content-text');
            if (el) {{
                const options = {{
                    htmlTags: true
                }};
                const html = window.render(text, options);
                el.outerHTML = html;
            }}
        }};
    </script>
</head>
<body>
    <div id="content">
        <div id="content-text"></div>
    </div>
</body>
</html>"""

    return html_content


def format_latex_content(content):
    """Format LaTeX content for display in the UI.

    The content is rendered to a standalone HTML page by
    ``render_latex_to_html``, base64-encoded into a ``data:`` URI and shown in
    an iframe, with a collapsible view of the raw LaTeX source underneath. If
    rendering raises, the error is logged and a fallback panel showing only
    the raw source and the error message is returned instead.

    Args:
        content: LaTeX text to display. Falsy values are returned unchanged.

    Returns:
        An HTML fragment (``str``), or ``content`` itself when it is falsy.
    """
    if not content:
        return content

    # The raw source is shown inside <pre> in both views. Escape it so LaTeX
    # such as "a < b" or "&" is displayed literally instead of being parsed
    # as markup.
    escaped_content = html.escape(str(content))

    try:
        # Generate rendered HTML
        rendered_html = render_latex_to_html(content)

        # Encode for iframe display (similar to GOT-OCR demo)
        encoded_html = base64.b64encode(rendered_html.encode("utf-8")).decode("utf-8")
        iframe_src = f"data:text/html;base64,{encoded_html}"
    except Exception as e:
        # Fallback to simple formatting if rendering fails
        logger.error(f"Error rendering LaTeX content: {e}")
        # The exception text may echo parts of the input, so escape it too.
        escaped_error = html.escape(str(e))
        return f"""
        <div style="background-color: #f8f9fa; border-radius: 8px; border: 1px solid #e9ecef; margin: 10px 0;">
            <div style="background-color: #e9ecef; padding: 10px; border-radius: 8px 8px 0 0; font-weight: bold; color: #495057;">
                📄 LaTeX Content (Fallback View)
            </div>
            <div style="padding: 15px;">
                <pre style="background-color: transparent; margin: 0; padding: 0;
                            font-family: 'Courier New', monospace; font-size: 14px; line-height: 1.4; 
                            white-space: pre-wrap; word-wrap: break-word; color: #2c3e50;">
{escaped_content}
                </pre>
            </div>
            <div style="background-color: #e9ecef; padding: 8px 15px; border-radius: 0 0 8px 8px; font-size: 12px; color: #6c757d;">
                ⚠️ Rendering failed, showing raw LaTeX. Error: {escaped_error}
            </div>
        </div>
        """

    # Create the display with both rendered and raw views
    return f"""
        <div style="background-color: #f8f9fa; border-radius: 8px; border: 1px solid #e9ecef; margin: 10px 0;">
            <div style="background-color: #e9ecef; padding: 10px; border-radius: 8px 8px 0 0; font-weight: bold; color: #495057;">
                📄 LaTeX Content (Rendered with MathJax)
            </div>
            <div style="padding: 0;">
                <iframe src="{iframe_src}" width="100%" height="500px" style="border: none; border-radius: 0 0 8px 8px;"></iframe>
            </div>
            <div style="background-color: #e9ecef; padding: 8px 15px; border-radius: 0; font-size: 12px; color: #6c757d; border-top: 1px solid #dee2e6;">
                💡 LaTeX content rendered with MathJax. Tables and formulas are displayed as they would appear in a LaTeX document.
            </div>
            <details style="margin: 0; border-top: 1px solid #dee2e6;">
                <summary style="padding: 8px 15px; background-color: #e9ecef; cursor: pointer; font-size: 12px; color: #6c757d;">
                    📝 View Raw LaTeX Source
                </summary>
                <div style="padding: 15px; background-color: #f8f9fa;">
                    <pre style="background-color: transparent; margin: 0; padding: 0;
                                font-family: 'Courier New', monospace; font-size: 12px; line-height: 1.4; 
                                white-space: pre-wrap; word-wrap: break-word; color: #2c3e50; max-height: 200px; overflow-y: auto;">
{escaped_content}
                    </pre>
                </div>
            </details>
        </div>
        """