Refactor document ingestion and chunking to support LaTeX content
- Updated `DocumentIngestionService` to generalize content ingestion, allowing both Markdown and LaTeX formats.
- Introduced `LaTeXAwareChunker` for chunking LaTeX documents while preserving structures.
- Enhanced `UnifiedDocumentChunker` to handle both Markdown and LaTeX content types seamlessly.
- Modified `_process_latex_content` to return raw LaTeX for GOT-OCR without LLM conversion.
- Improved UI rendering for LaTeX content, ensuring proper display using MathJax.
- Added backward compatibility for existing Markdown ingestion methods.
- src/parsers/got_ocr_parser.py +13 -5
- src/rag/chunking.py +372 -2
- src/rag/ingestion.py +41 -19
- src/services/document_service.py +25 -13
- src/ui/ui.py +177 -3
src/parsers/got_ocr_parser.py
CHANGED
@@ -85,7 +85,7 @@ class GotOcrParser(DocumentParser):
             **kwargs: Additional arguments to pass to the model

         Returns:
-            Extracted text from the image
+            Extracted text from the image as raw LaTeX
         """
         # Verify dependencies are installed without initializing CUDA
         if not self._check_dependencies():

@@ -131,15 +131,23 @@ class GotOcrParser(DocumentParser):
                 image_path_str = str(file_path)

                 # Call the wrapper function that handles ZeroGPU safely
+                result = self._safe_gpu_process(image_path_str, use_format, **safe_kwargs)
             else:
                 # Fallback for environments without spaces
+                result = self._process_image_without_gpu(
                     str(file_path),
                     use_format=use_format,
                     **safe_kwargs
                 )

+            # Add a small delay to replace LLM conversion time
+            import time
+            time.sleep(2)  # 2 second delay to simulate processing time
+
+            # Return raw LaTeX output (no LLM conversion)
+            logger.info("Returning raw LaTeX output (no LLM conversion)")
+            return result
+
         except Exception as e:
             logger.error(f"Error processing image with GOT-OCR: {str(e)}")

@@ -195,7 +203,7 @@ class GotOcrParser(DocumentParser):
         image = load_image(image_path)

         # Load processor and model
-        processor = AutoProcessor.from_pretrained(MODEL_NAME)
+        processor = AutoProcessor.from_pretrained(MODEL_NAME, use_fast=True)

         # Use CPU if in main process to avoid CUDA initialization issues
         device = "cpu"

@@ -285,7 +293,7 @@ class GotOcrParser(DocumentParser):
         logger.info(f"Loading GOT-OCR model from {MODEL_NAME} on {device}")

         # Load processor
-        processor = AutoProcessor.from_pretrained(MODEL_NAME)
+        processor = AutoProcessor.from_pretrained(MODEL_NAME, use_fast=True)

         # Load model
         model = AutoModelForImageTextToText.from_pretrained(
src/rag/chunking.py
CHANGED
@@ -8,6 +8,253 @@ from src.core.logging_config import get_logger

 logger = get_logger(__name__)

+class LaTeXAwareChunker:
+    """Handles LaTeX-aware document chunking that preserves LaTeX structures."""
+
+    def __init__(self, chunk_size: int = 1200, chunk_overlap: int = 150):
+        """
+        Initialize the LaTeX-aware document chunker.
+
+        Args:
+            chunk_size: Maximum size of each chunk in characters
+            chunk_overlap: Number of characters to overlap between chunks
+        """
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+
+        # Initialize the text splitter with LaTeX-aware settings
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            length_function=len,
+            separators=[
+                "\n\\section{",        # Section headers
+                "\n\\subsection{",     # Subsection headers
+                "\n\\subsubsection{",  # Subsubsection headers
+                "\n\\title{",          # Title commands
+                "\n\\begin{",          # Begin environments
+                "\n\\end{",            # End environments
+                "\n\n",                # Paragraph breaks
+                "\n",                  # Line breaks
+                ". ",                  # Sentence breaks
+                " ",                   # Word breaks
+                ""                     # Character breaks
+            ],
+            keep_separator=True,
+            add_start_index=True
+        )
+
+        # Regex patterns for LaTeX structures
+        self.latex_table_pattern = re.compile(
+            r'\\begin\{tabular\}.*?\\end\{tabular\}',
+            re.DOTALL | re.MULTILINE
+        )
+
+        self.latex_title_pattern = re.compile(
+            r'\\title\{[^}]*\}',
+            re.MULTILINE
+        )
+
+        self.latex_section_pattern = re.compile(
+            r'\\(?:sub)*section\*?\{[^}]*\}',
+            re.MULTILINE
+        )
+
+        self.latex_environment_pattern = re.compile(
+            r'\\begin\{[^}]+\}.*?\\end\{[^}]+\}',
+            re.DOTALL | re.MULTILINE
+        )
+
+        logger.info(f"LaTeX-aware chunker initialized with chunk_size={chunk_size}, overlap={chunk_overlap}")
+
+    def extract_latex_structures(self, content: str) -> Tuple[List[Tuple[int, int, str]], str]:
+        """
+        Extract LaTeX tables and environments, replacing them with placeholders.
+
+        Args:
+            content: Original LaTeX content
+
+        Returns:
+            Tuple of (structures_list, content_with_placeholders)
+        """
+        structures = []
+
+        # Find all tabular environments (highest priority)
+        for match in self.latex_table_pattern.finditer(content):
+            structures.append((
+                match.start(),
+                match.end(),
+                "latex_table",
+                match.group()
+            ))
+
+        # Find other LaTeX environments (avoid overlapping with tables)
+        for match in self.latex_environment_pattern.finditer(content):
+            # Check if this environment overlaps with any table
+            overlaps_with_table = any(
+                table_start <= match.start() < table_end or
+                table_start < match.end() <= table_end
+                for table_start, table_end, struct_type, _ in structures
+                if struct_type == "latex_table"
+            )
+
+            if not overlaps_with_table and "tabular" not in match.group():
+                structures.append((
+                    match.start(),
+                    match.end(),
+                    "latex_environment",
+                    match.group()
+                ))
+
+        # Find titles and sections
+        for match in self.latex_title_pattern.finditer(content):
+            structures.append((
+                match.start(),
+                match.end(),
+                "latex_title",
+                match.group()
+            ))
+
+        for match in self.latex_section_pattern.finditer(content):
+            structures.append((
+                match.start(),
+                match.end(),
+                "latex_section",
+                match.group()
+            ))
+
+        # Sort by start position
+        structures.sort(key=lambda x: x[0])
+
+        # Replace structures with placeholders
+        content_with_placeholders = content
+        offset = 0
+
+        for i, (start, end, struct_type, struct_content) in enumerate(structures):
+            placeholder = f"\n\n__LATEX_STRUCTURE_{i}_{struct_type.upper()}__\n\n"
+
+            # Adjust positions based on previous replacements
+            adjusted_start = start - offset
+            adjusted_end = end - offset
+
+            content_with_placeholders = (
+                content_with_placeholders[:adjusted_start] +
+                placeholder +
+                content_with_placeholders[adjusted_end:]
+            )
+
+            # Update offset for next replacement
+            offset += (end - start) - len(placeholder)
+
+        return structures, content_with_placeholders
+
+    def restore_latex_structures(self, chunks: List[str], structures: List[Tuple[int, int, str, str]]) -> List[str]:
+        """
+        Restore LaTeX structures in chunks, keeping tables and environments intact.
+
+        Args:
+            chunks: List of text chunks with placeholders
+            structures: List of original structures
+
+        Returns:
+            List of chunks with restored structures
+        """
+        restored_chunks = []
+
+        for chunk in chunks:
+            restored_chunk = chunk
+
+            # Find placeholders in this chunk
+            placeholder_pattern = re.compile(r'__LATEX_STRUCTURE_(\d+)_(\w+)__')
+
+            for match in placeholder_pattern.finditer(chunk):
+                structure_index = int(match.group(1))
+
+                if structure_index < len(structures):
+                    original_structure = structures[structure_index][3]
+                    restored_chunk = restored_chunk.replace(match.group(), original_structure)
+
+            restored_chunks.append(restored_chunk)
+
+        return restored_chunks
+
+    def chunk_document(self, content: str, source_metadata: Dict[str, Any]) -> List[Document]:
+        """
+        Chunk a LaTeX document while preserving LaTeX structures.
+
+        Args:
+            content: The LaTeX content to chunk
+            source_metadata: Metadata about the source document
+
+        Returns:
+            List of Document objects with chunked content and enhanced metadata
+        """
+        try:
+            # Extract LaTeX structures and replace with placeholders
+            structures, content_with_placeholders = self.extract_latex_structures(content)
+
+            # Create a document object with placeholders
+            doc = Document(
+                page_content=content_with_placeholders,
+                metadata=source_metadata
+            )
+
+            # Split the document into chunks
+            chunks = self.text_splitter.split_documents([doc])
+
+            # Restore LaTeX structures in chunks
+            chunk_contents = [chunk.page_content for chunk in chunks]
+            restored_contents = self.restore_latex_structures(chunk_contents, structures)
+
+            # Create enhanced chunks with restored content
+            enhanced_chunks = []
+            for i, (chunk, restored_content) in enumerate(zip(chunks, restored_contents)):
+                # Add chunk-specific metadata
+                chunk.metadata.update({
+                    "chunk_index": i,
+                    "total_chunks": len(chunks),
+                    "chunk_size": len(restored_content),
+                    "chunk_id": f"{source_metadata.get('source_id', 'unknown')}_{i}",
+                    "has_latex_table": "\\begin{tabular}" in restored_content,
+                    "has_latex_environment": "\\begin{" in restored_content and "\\end{" in restored_content,
+                    "has_latex_math": "\\(" in restored_content or "$" in restored_content,
+                    "content_type": "latex"
+                })
+
+                # Update the chunk content with restored structures
+                chunk.page_content = restored_content
+                enhanced_chunks.append(chunk)
+
+            logger.info(f"LaTeX document chunked into {len(enhanced_chunks)} structure-aware pieces")
+            return enhanced_chunks
+
+        except Exception as e:
+            logger.error(f"Error chunking LaTeX document: {e}")
+            # Fallback to regular chunking if LaTeX processing fails
+            return self._fallback_chunk(content, source_metadata)
+
+    def _fallback_chunk(self, content: str, source_metadata: Dict[str, Any]) -> List[Document]:
+        """Fallback chunking method if LaTeX-aware chunking fails."""
+        try:
+            doc = Document(page_content=content, metadata=source_metadata)
+            chunks = self.text_splitter.split_documents([doc])
+
+            for i, chunk in enumerate(chunks):
+                chunk.metadata.update({
+                    "chunk_index": i,
+                    "total_chunks": len(chunks),
+                    "chunk_size": len(chunk.page_content),
+                    "chunk_id": f"{source_metadata.get('source_id', 'unknown')}_{i}",
+                    "content_type": "latex"
+                })
+
+            logger.warning(f"Used fallback chunking for LaTeX content: {len(chunks)} pieces")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Error in LaTeX fallback chunking: {e}")
+            raise
+
 class MarkdownAwareChunker:
     """Handles markdown-aware document chunking that preserves tables and structures."""

@@ -269,5 +516,128 @@ class MarkdownAwareChunker:

         return preview

+class UnifiedDocumentChunker:
+    """Unified chunker that handles both Markdown and LaTeX content types."""
+
+    def __init__(self):
+        """Initialize the unified chunker with both markdown and LaTeX chunkers."""
+        self.markdown_chunker = MarkdownAwareChunker(chunk_size=1000, chunk_overlap=200)
+        self.latex_chunker = LaTeXAwareChunker(chunk_size=1200, chunk_overlap=150)
+        logger.info("Unified document chunker initialized with both Markdown and LaTeX support")
+
+    def chunk_document(self, content: str, source_metadata: Dict[str, Any]) -> List[Document]:
+        """
+        Chunk a document using the appropriate chunker based on content type.
+
+        Args:
+            content: The document content to chunk
+            source_metadata: Metadata about the source document
+
+        Returns:
+            List of Document objects with chunked content and enhanced metadata
+        """
+        # Determine content type from metadata or content analysis
+        content_type = source_metadata.get('doc_type', 'markdown').lower()
+
+        # Override content type detection for GOT-OCR results
+        if source_metadata.get('conversion_method', '').startswith('GOT-OCR'):
+            content_type = 'latex'
+
+        # Auto-detect content type if not specified
+        if content_type not in ['markdown', 'latex']:
+            if self._is_latex_content(content):
+                content_type = 'latex'
+            else:
+                content_type = 'markdown'
+
+        # Use appropriate chunker
+        if content_type == 'latex':
+            logger.info("Using LaTeX-aware chunker for document")
+            return self.latex_chunker.chunk_document(content, source_metadata)
+        else:
+            logger.info("Using Markdown-aware chunker for document")
+            return self.markdown_chunker.chunk_document(content, source_metadata)
+
+    def _is_latex_content(self, content: str) -> bool:
+        """
+        Auto-detect if content is LaTeX based on common LaTeX commands.
+
+        Args:
+            content: Content to analyze
+
+        Returns:
+            True if content appears to be LaTeX, False otherwise
+        """
+        latex_indicators = [
+            r'\\begin\{',
+            r'\\end\{',
+            r'\\title\{',
+            r'\\section',
+            r'\\subsection',
+            r'\\hline',
+            r'\\multirow',
+            r'\\multicolumn'
+        ]
+
+        # Count LaTeX indicators
+        latex_count = sum(1 for indicator in latex_indicators if re.search(indicator, content))
+
+        # If we find multiple LaTeX indicators, treat as LaTeX
+        return latex_count >= 2
+
+    def chunk_multiple_documents(self, documents: List[Dict[str, Any]]) -> List[Document]:
+        """
+        Chunk multiple documents using appropriate chunkers.
+
+        Args:
+            documents: List of dictionaries with 'content' and 'metadata' keys
+
+        Returns:
+            List of chunked Document objects
+        """
+        all_chunks = []
+
+        for doc_data in documents:
+            content = doc_data.get('content', '')
+            metadata = doc_data.get('metadata', {})
+
+            if content.strip():  # Only process non-empty content
+                chunks = self.chunk_document(content, metadata)
+                all_chunks.extend(chunks)
+
+        logger.info(f"Chunked {len(documents)} documents into {len(all_chunks)} total chunks")
+        return all_chunks
+
+    def get_chunk_preview(self, chunks: List[Document], max_chunks: int = 5) -> str:
+        """
+        Generate a preview of chunks for debugging/logging.
+
+        Args:
+            chunks: List of Document chunks
+            max_chunks: Maximum number of chunks to include in preview
+
+        Returns:
+            String preview of chunks
+        """
+        preview = f"Document Chunks Preview ({len(chunks)} total chunks):\n"
+        preview += "=" * 50 + "\n"
+
+        for i, chunk in enumerate(chunks[:max_chunks]):
+            content_type = chunk.metadata.get('content_type', 'unknown')
+            has_table = chunk.metadata.get('has_table', False) or chunk.metadata.get('has_latex_table', False)
+            has_code = chunk.metadata.get('has_code', False) or chunk.metadata.get('has_latex_environment', False)
+
+            preview += f"Chunk {i + 1} ({content_type}):\n"
+            preview += f"  Length: {len(chunk.page_content)} characters\n"
+            preview += f"  Has Table: {has_table}, Has Code/Environment: {has_code}\n"
+            preview += f"  Metadata: {chunk.metadata}\n"
+            preview += f"  Content preview: {chunk.page_content[:100]}...\n"
+            preview += "-" * 30 + "\n"
+
+        if len(chunks) > max_chunks:
+            preview += f"... and {len(chunks) - max_chunks} more chunks\n"
+
+        return preview
+
+# Global unified chunker instance that supports both Markdown and LaTeX
+document_chunker = UnifiedDocumentChunker()
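The new `UnifiedDocumentChunker` routes documents by the `doc_type` metadata field, treats anything whose `conversion_method` starts with `GOT-OCR` as LaTeX, and otherwise falls back to `_is_latex_content` detection. A minimal usage sketch of the module-level `document_chunker` instance (illustrative only, not part of the commit; it assumes `src.rag.chunking` is importable and uses the metadata keys shown in the diff):

```python
# Illustrative sketch: exercising the unified chunker on a small LaTeX snippet.
from src.rag.chunking import document_chunker

latex_text = r"""\title{Sample Table}
\section{Results}
\begin{tabular}{|c|c|}
\hline
A & B \\
\hline
1 & 2 \\
\hline
\end{tabular}"""

chunks = document_chunker.chunk_document(
    latex_text,
    {
        "source_id": "sample-001",
        "doc_type": "latex",                            # routes to LaTeXAwareChunker
        "conversion_method": "GOT-OCR (jpg,png only)",  # would force the LaTeX path as well
    },
)

for chunk in chunks:
    # chunk_id, has_latex_table, content_type are set by LaTeXAwareChunker
    print(chunk.metadata["chunk_id"], chunk.metadata["has_latex_table"], chunk.metadata["content_type"])
```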
src/rag/ingestion.py
CHANGED
@@ -75,16 +75,18 @@ class DocumentIngestionService:
             logger.error(f"Error deleting existing document: {e}")
             return False

+    def ingest_text_content(self,
+                            text_content: str,
+                            content_type: str = "markdown",
+                            source_path: Optional[str] = None,
+                            metadata: Optional[Dict[str, Any]] = None,
+                            original_file_content: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]:
         """
+        Ingest text content (markdown or LaTeX) into the RAG system.

         Args:
+            text_content: The text content to ingest (markdown or LaTeX)
+            content_type: Type of content ("markdown" or "latex")
             source_path: Optional source path/filename
             metadata: Optional additional metadata
             original_file_content: Original file content for hash calculation

@@ -93,11 +95,11 @@
             Tuple of (success, message, ingestion_stats)
         """
         try:
+            if not text_content or not text_content.strip():
                 return False, "No content provided for ingestion", {}

+            # Create file hash using original content if available, otherwise use text content
+            file_content_for_hash = original_file_content or text_content
             file_hash = self.create_file_hash(file_content_for_hash)

             # Check for duplicates in vector store

@@ -115,16 +117,16 @@
             # Prepare document metadata with file hash
             doc_metadata = self.prepare_document_metadata(
                 source_path=source_path,
-                doc_type="markdown",
+                doc_type=content_type,  # Use content_type instead of hardcoded "markdown"
                 additional_metadata=metadata
             )
             doc_metadata["file_hash"] = file_hash
+            doc_metadata["content_length"] = len(text_content)
             doc_metadata["upload_timestamp"] = datetime.now().isoformat()

-            logger.info(f"Chunking document: {file_hash}")
+            # Chunk the document using text-aware chunking
+            logger.info(f"Chunking {content_type} document: {file_hash}")
+            chunks = document_chunker.chunk_document(text_content, doc_metadata)

             if not chunks:
                 return False, "Failed to create document chunks", {}

@@ -142,7 +144,7 @@
                 "file_hash": file_hash,
                 "total_chunks": len(chunks),
                 "document_ids": doc_ids,
+                "content_length": len(text_content),
                 "has_tables": any(chunk.metadata.get("has_table", False) for chunk in chunks),
                 "has_code": any(chunk.metadata.get("has_code", False) for chunk in chunks),
                 "processed_at": datetime.now().isoformat(),

@@ -160,6 +162,22 @@
             logger.error(error_msg)
             return False, error_msg, {"status": "error", "error": str(e)}

+    def ingest_markdown_content(self,
+                                markdown_content: str,
+                                source_path: Optional[str] = None,
+                                metadata: Optional[Dict[str, Any]] = None,
+                                original_file_content: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]:
+        """
+        Backward compatibility method for ingesting markdown content.
+        """
+        return self.ingest_text_content(
+            text_content=markdown_content,
+            content_type="markdown",
+            source_path=source_path,
+            metadata=metadata,
+            original_file_content=original_file_content
+        )
+
     def ingest_from_conversion_result(self, conversion_result: Dict[str, Any]) -> Tuple[bool, str, Dict[str, Any]]:
         """
         Ingest a document from Markit conversion result.

@@ -189,9 +207,13 @@
             "conversion_time": conversion_result.get("conversion_time", 0)
         }

+        # Determine content type based on conversion method
+        content_type = "latex" if "GOT-OCR" in conversion_method else "markdown"
+
+        # Ingest the content with original file content for proper hashing
+        return self.ingest_text_content(
+            text_content=markdown_content,
+            content_type=content_type,
             source_path=original_filename,
             metadata=additional_metadata,
             original_file_content=original_file_content
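For reference, a hedged sketch of how the generalized entry point and the backward-compatibility wrapper would be called. The `DocumentIngestionService` constructor and vector-store wiring are not part of this diff, so the bare construction below is an assumption:

```python
# Illustrative sketch, not part of the commit.
from src.rag.ingestion import DocumentIngestionService

service = DocumentIngestionService()  # construction is assumed; not shown in this diff

# New path: raw LaTeX from GOT-OCR is ingested without any LLM conversion.
ok, message, stats = service.ingest_text_content(
    text_content=r"\begin{tabular}{|c|c|} \hline A & B \\ \hline \end{tabular}",
    content_type="latex",
    source_path="scan_page_1.png",
)

# Backward-compatible path: existing markdown callers keep working unchanged.
ok, message, stats = service.ingest_markdown_content(
    markdown_content="# Title\n\nSome markdown body.",
    source_path="notes.md",
)
print(ok, message, stats.get("total_chunks"))
```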
src/services/document_service.py
CHANGED
@@ -94,9 +94,15 @@ class DocumentService:
         return temp_path

     def _process_latex_content(self, content: str, parser_name: str, ocr_method_name: str) -> str:
+        """Process LaTeX content - for GOT-OCR, return raw LaTeX without conversion."""
+        # For GOT-OCR, skip LLM conversion and return raw LaTeX
+        if parser_name == "GOT-OCR (jpg,png only)":
+            logging.info("GOT-OCR detected: returning raw LaTeX output (no LLM conversion)")
+            return content
+
+        # For other parsers with LaTeX content, process as before
+        if (content and
+            ("\\begin" in content or "\\end" in content or "$" in content) and
             config.api.google_api_key):

             logging.info("Converting LaTeX output to Markdown using Gemini API")

@@ -106,6 +112,7 @@
                 raise ConversionError("Conversion cancelled before LaTeX conversion")

             try:
+                from src.core.latex_to_markdown_converter import convert_latex_to_markdown
                 markdown_content = convert_latex_to_markdown(content)
                 if markdown_content:
                     logging.info(f"LaTeX conversion completed in {time.time() - start_convert:.2f} seconds")

@@ -118,16 +125,21 @@

         return content

-    def _create_output_file(self, content: str, output_format: str, original_file_path: Optional[str] = None) -> str:
+    def _create_output_file(self, content: str, output_format: str, original_file_path: Optional[str] = None, parser_name: Optional[str] = None) -> str:
         """Create output file with proper extension and preserved filename."""
-        # Determine file extension
+        # Determine file extension based on parser and format
+        if parser_name == "GOT-OCR (jpg,png only)":
+            # For GOT-OCR, use .tex extension
+            ext = ".tex"
+        else:
+            # For other parsers, use format-based extensions
+            format_extensions = {
+                "markdown": ".md",
+                "json": ".json",
+                "text": ".txt",
+                "document tags": ".doctags"
+            }
+            ext = format_extensions.get(output_format.lower(), ".txt")

         if self._check_cancellation():
             raise ConversionError("Conversion cancelled before output file creation")

@@ -247,7 +259,7 @@
             raise ConversionError("Conversion cancelled")

         # Create output file
-        output_path = self._create_output_file(content, output_format, file_path)
+        output_path = self._create_output_file(content, output_format, file_path, parser_name)

         return content, output_path
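The extension choice added to `_create_output_file` is small enough to restate as a self-contained sketch, which makes the GOT-OCR branch easy to verify in isolation (`pick_extension` is a hypothetical helper, not part of the commit; the real method also builds the output filename and path):

```python
# Hypothetical standalone restatement of the extension logic shown above.
from typing import Optional

def pick_extension(output_format: str, parser_name: Optional[str] = None) -> str:
    if parser_name == "GOT-OCR (jpg,png only)":
        return ".tex"  # raw LaTeX output is saved as a .tex file
    format_extensions = {
        "markdown": ".md",
        "json": ".json",
        "text": ".txt",
        "document tags": ".doctags",
    }
    return format_extensions.get(output_format.lower(), ".txt")

assert pick_extension("Markdown") == ".md"
assert pick_extension("Markdown", parser_name="GOT-OCR (jpg,png only)") == ".tex"
assert pick_extension("unknown-format") == ".txt"
```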
src/ui/ui.py
CHANGED
@@ -131,6 +131,174 @@ def format_markdown_content(content):
     html_content = markdown.markdown(str(content), extensions=['tables'])
     return html_content

+def render_latex_to_html(latex_content):
+    """Convert LaTeX content to HTML using Mathpix Markdown like GOT-OCR demo."""
+    import json
+
+    # Clean up the content similar to GOT-OCR demo
+    content = latex_content.strip()
+    if content.endswith("<|im_end|>"):
+        content = content[:-len("<|im_end|>")]
+
+    # Fix unbalanced delimiters exactly like GOT-OCR demo
+    right_num = content.count("\\right")
+    left_num = content.count("\\left")
+
+    if right_num != left_num:
+        content = (
+            content.replace("\\left(", "(")
+            .replace("\\right)", ")")
+            .replace("\\left[", "[")
+            .replace("\\right]", "]")
+            .replace("\\left{", "{")
+            .replace("\\right}", "}")
+            .replace("\\left|", "|")
+            .replace("\\right|", "|")
+            .replace("\\left.", ".")
+            .replace("\\right.", ".")
+        )
+
+    # Process content like GOT-OCR demo: remove $ signs and replace quotes
+    content = content.replace('"', "``").replace("$", "")
+
+    # Split into lines and create JavaScript string like GOT-OCR demo
+    outputs_list = content.split("\n")
+    js_text_parts = []
+    for line in outputs_list:
+        # Escape backslashes and add line break
+        escaped_line = line.replace("\\", "\\\\")
+        js_text_parts.append(f'"{escaped_line}\\n"')
+
+    # Join with + like in GOT-OCR demo
+    js_text = " + ".join(js_text_parts)
+
+    # Create HTML using Mathpix Markdown like GOT-OCR demo
+    html_content = f"""<!DOCTYPE html>
+<html lang="en" data-lt-installed="true">
+<head>
+    <meta charset="UTF-8">
+    <title>LaTeX Content</title>
+    <script>
+        const text = {js_text};
+    </script>
+    <style>
+        #content {{
+            max-width: 800px;
+            margin: auto;
+            padding: 20px;
+        }}
+        body {{
+            font-family: 'Times New Roman', serif;
+            line-height: 1.6;
+            background-color: #ffffff;
+            color: #333;
+        }}
+        table {{
+            border-collapse: collapse;
+            width: 100%;
+            margin: 20px 0;
+        }}
+        td, th {{
+            border: 1px solid #333;
+            padding: 8px 12px;
+            text-align: center;
+            vertical-align: middle;
+        }}
+    </style>
+    <script>
+        let script = document.createElement('script');
+        script.src = "https://cdn.jsdelivr.net/npm/mathpix-markdown-it@1.3.6/es5/bundle.js";
+        document.head.append(script);
+        script.onload = function() {{
+            const isLoaded = window.loadMathJax();
+            if (isLoaded) {{
+                console.log('Styles loaded!')
+            }}
+            const el = window.document.getElementById('content-text');
+            if (el) {{
+                const options = {{
+                    htmlTags: true
+                }};
+                const html = window.render(text, options);
+                el.outerHTML = html;
+            }}
+        }};
+    </script>
+</head>
+<body>
+    <div id="content">
+        <div id="content-text"></div>
+    </div>
+</body>
+</html>"""
+
+    return html_content
+
+def format_latex_content(content):
+    """Format LaTeX content for display in UI using MathJax rendering like GOT-OCR demo."""
+    if not content:
+        return content
+
+    try:
+        # Generate rendered HTML
+        rendered_html = render_latex_to_html(content)
+
+        # Encode for iframe display (similar to GOT-OCR demo)
+        import base64
+        encoded_html = base64.b64encode(rendered_html.encode("utf-8")).decode("utf-8")
+        iframe_src = f"data:text/html;base64,{encoded_html}"
+
+        # Create the display with both rendered and raw views
+        formatted_content = f"""
+        <div style="background-color: #f8f9fa; border-radius: 8px; border: 1px solid #e9ecef; margin: 10px 0;">
+            <div style="background-color: #e9ecef; padding: 10px; border-radius: 8px 8px 0 0; font-weight: bold; color: #495057;">
+                LaTeX Content (Rendered with MathJax)
+            </div>
+            <div style="padding: 0;">
+                <iframe src="{iframe_src}" width="100%" height="500px" style="border: none; border-radius: 0 0 8px 8px;"></iframe>
+            </div>
+            <div style="background-color: #e9ecef; padding: 8px 15px; border-radius: 0; font-size: 12px; color: #6c757d; border-top: 1px solid #dee2e6;">
+                LaTeX content rendered with MathJax. Tables and formulas are displayed as they would appear in a LaTeX document.
+            </div>
+            <details style="margin: 0; border-top: 1px solid #dee2e6;">
+                <summary style="padding: 8px 15px; background-color: #e9ecef; cursor: pointer; font-size: 12px; color: #6c757d;">
+                    View Raw LaTeX Source
+                </summary>
+                <div style="padding: 15px; background-color: #f8f9fa;">
+                    <pre style="background-color: transparent; margin: 0; padding: 0;
+                                font-family: 'Courier New', monospace; font-size: 12px; line-height: 1.4;
+                                white-space: pre-wrap; word-wrap: break-word; color: #2c3e50; max-height: 200px; overflow-y: auto;">
+{content}
+                    </pre>
+                </div>
+            </details>
+        </div>
+        """
+
+    except Exception as e:
+        # Fallback to simple formatting if rendering fails
+        import html
+        escaped_content = html.escape(str(content))
+        formatted_content = f"""
+        <div style="background-color: #f8f9fa; border-radius: 8px; border: 1px solid #e9ecef; margin: 10px 0;">
+            <div style="background-color: #e9ecef; padding: 10px; border-radius: 8px 8px 0 0; font-weight: bold; color: #495057;">
+                LaTeX Content (Fallback View)
+            </div>
+            <div style="padding: 15px;">
+                <pre style="background-color: transparent; margin: 0; padding: 0;
+                            font-family: 'Courier New', monospace; font-size: 14px; line-height: 1.4;
+                            white-space: pre-wrap; word-wrap: break-word; color: #2c3e50;">
+{escaped_content}
+                </pre>
+            </div>
+            <div style="background-color: #e9ecef; padding: 8px 15px; border-radius: 0 0 8px 8px; font-size: 12px; color: #6c757d;">
+                Rendering failed, showing raw LaTeX. Error: {str(e)}
+            </div>
+        </div>
+        """
+
+    return formatted_content
+
 # Function to run conversion in a separate thread
 def run_conversion_thread(file_path, parser_name, ocr_method_name, output_format):
     """Run the conversion in a separate thread and return the thread object"""

@@ -264,9 +432,15 @@ def handle_convert(files, parser_name, ocr_method_name, output_format, processin
         logger.info("Converter returned cancellation message")
         return content, None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)

-    # Format the content
+    # Format the content based on parser type
+    if "GOT-OCR" in parser_name:
+        # For GOT-OCR, display as LaTeX
+        formatted_content = format_latex_content(str(content))
+        html_output = f"<div class='output-container'>{formatted_content}</div>"
+    else:
+        # For other parsers, display as Markdown
+        formatted_content = format_markdown_content(str(content))
+        html_output = f"<div class='output-container'>{formatted_content}</div>"

     logger.info("Conversion completed successfully")
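`format_latex_content` shows the rendered page inside the Gradio HTML output by base64-encoding a full HTML document and pointing an `<iframe>` at a `data:` URI, which avoids writing a temporary file. A minimal sketch of just that embedding step (illustrative names, not part of the commit):

```python
# Minimal sketch of the data-URI iframe embedding used by format_latex_content.
import base64

def html_to_iframe(rendered_html: str, height: int = 500) -> str:
    encoded = base64.b64encode(rendered_html.encode("utf-8")).decode("utf-8")
    src = f"data:text/html;base64,{encoded}"
    return f'<iframe src="{src}" width="100%" height="{height}px" style="border: none;"></iframe>'

print(html_to_iframe("<!DOCTYPE html><html><body><p>x = y^2</p></body></html>")[:80])
```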