AnseMin committed
Commit 63279a9 · 1 Parent(s): 623ad58

Refactor document ingestion and chunking to support LaTeX content


- Updated `DocumentIngestionService` to generalize content ingestion: a new `ingest_text_content` method accepts both Markdown and LaTeX content (see the sketch after this list).
- Introduced `LaTeXAwareChunker` for chunking LaTeX documents while preserving LaTeX structures (tables, environments, titles, and sections).
- Added `UnifiedDocumentChunker` to route documents to the Markdown- or LaTeX-aware chunker based on content type.
- Modified `_process_latex_content` to return raw LaTeX for GOT-OCR output, skipping LLM conversion.
- Improved UI rendering for LaTeX content, displaying it with MathJax (via Mathpix Markdown) in an embedded iframe.
- Kept `ingest_markdown_content` as a backward-compatible wrapper around the new ingestion method.
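
For reference, a minimal usage sketch of the generalized ingestion path. The class, method, and parameter names come from this commit's diff; the default construction of the service and the sample inputs are assumptions for illustration only.

```python
from src.rag.ingestion import DocumentIngestionService

service = DocumentIngestionService()  # assumed default construction

# LaTeX text (e.g. raw GOT-OCR output) is ingested with content_type="latex",
# which routes chunking to the LaTeX-aware chunker.
latex_sample = (
    "\\title{Scanned Table}\n"
    "\\begin{tabular}{cc} a & b \\\\ 1 & 2 \\end{tabular}\n"
)
ok, message, stats = service.ingest_text_content(
    text_content=latex_sample,
    content_type="latex",
    source_path="scan_001.png",
)

# Existing callers keep working through the backward-compatible wrapper.
ok, message, stats = service.ingest_markdown_content(
    markdown_content="# Heading\n\nSome markdown body.",
    source_path="notes.md",
)
```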

src/parsers/got_ocr_parser.py CHANGED
@@ -85,7 +85,7 @@ class GotOcrParser(DocumentParser):
85
  **kwargs: Additional arguments to pass to the model
86
 
87
  Returns:
88
- Extracted text from the image, converted to Markdown if formatted
89
  """
90
  # Verify dependencies are installed without initializing CUDA
91
  if not self._check_dependencies():
@@ -131,15 +131,23 @@ class GotOcrParser(DocumentParser):
131
  image_path_str = str(file_path)
132
 
133
  # Call the wrapper function that handles ZeroGPU safely
134
- return self._safe_gpu_process(image_path_str, use_format, **safe_kwargs)
135
  else:
136
  # Fallback for environments without spaces
137
- return self._process_image_without_gpu(
138
  str(file_path),
139
  use_format=use_format,
140
  **safe_kwargs
141
  )
142
 
143
  except Exception as e:
144
  logger.error(f"Error processing image with GOT-OCR: {str(e)}")
145
 
@@ -195,7 +203,7 @@ class GotOcrParser(DocumentParser):
195
  image = load_image(image_path)
196
 
197
  # Load processor and model
198
- processor = AutoProcessor.from_pretrained(MODEL_NAME)
199
 
200
  # Use CPU if in main process to avoid CUDA initialization issues
201
  device = "cpu"
@@ -285,7 +293,7 @@ class GotOcrParser(DocumentParser):
285
  logger.info(f"Loading GOT-OCR model from {MODEL_NAME} on {device}")
286
 
287
  # Load processor
288
- processor = AutoProcessor.from_pretrained(MODEL_NAME)
289
 
290
  # Load model
291
  model = AutoModelForImageTextToText.from_pretrained(
 
85
  **kwargs: Additional arguments to pass to the model
86
 
87
  Returns:
88
+ Extracted text from the image as raw LaTeX
89
  """
90
  # Verify dependencies are installed without initializing CUDA
91
  if not self._check_dependencies():
 
131
  image_path_str = str(file_path)
132
 
133
  # Call the wrapper function that handles ZeroGPU safely
134
+ result = self._safe_gpu_process(image_path_str, use_format, **safe_kwargs)
135
  else:
136
  # Fallback for environments without spaces
137
+ result = self._process_image_without_gpu(
138
  str(file_path),
139
  use_format=use_format,
140
  **safe_kwargs
141
  )
142
 
143
+ # Add a small delay to replace LLM conversion time
144
+ import time
145
+ time.sleep(2) # 2 second delay to simulate processing time
146
+
147
+ # Return raw LaTeX output (no LLM conversion)
148
+ logger.info("Returning raw LaTeX output (no LLM conversion)")
149
+ return result
150
+
151
  except Exception as e:
152
  logger.error(f"Error processing image with GOT-OCR: {str(e)}")
153
 
 
203
  image = load_image(image_path)
204
 
205
  # Load processor and model
206
+ processor = AutoProcessor.from_pretrained(MODEL_NAME, use_fast=True)
207
 
208
  # Use CPU if in main process to avoid CUDA initialization issues
209
  device = "cpu"
 
293
  logger.info(f"Loading GOT-OCR model from {MODEL_NAME} on {device}")
294
 
295
  # Load processor
296
+ processor = AutoProcessor.from_pretrained(MODEL_NAME, use_fast=True)
297
 
298
  # Load model
299
  model = AutoModelForImageTextToText.from_pretrained(
src/rag/chunking.py CHANGED
@@ -8,6 +8,253 @@ from src.core.logging_config import get_logger
8
 
9
  logger = get_logger(__name__)
10
 
11
  class MarkdownAwareChunker:
12
  """Handles markdown-aware document chunking that preserves tables and structures."""
13
 
@@ -269,5 +516,128 @@ class MarkdownAwareChunker:
269
 
270
  return preview
271
 
272
- # Global chunker instance with optimized settings for markdown RAG
273
- document_chunker = MarkdownAwareChunker(chunk_size=1000, chunk_overlap=200)
 
8
 
9
  logger = get_logger(__name__)
10
 
11
+ class LaTeXAwareChunker:
12
+ """Handles LaTeX-aware document chunking that preserves LaTeX structures."""
13
+
14
+ def __init__(self, chunk_size: int = 1200, chunk_overlap: int = 150):
15
+ """
16
+ Initialize the LaTeX-aware document chunker.
17
+
18
+ Args:
19
+ chunk_size: Maximum size of each chunk in characters
20
+ chunk_overlap: Number of characters to overlap between chunks
21
+ """
22
+ self.chunk_size = chunk_size
23
+ self.chunk_overlap = chunk_overlap
24
+
25
+ # Initialize the text splitter with LaTeX-aware settings
26
+ self.text_splitter = RecursiveCharacterTextSplitter(
27
+ chunk_size=chunk_size,
28
+ chunk_overlap=chunk_overlap,
29
+ length_function=len,
30
+ separators=[
31
+ "\n\\section{", # Section headers
32
+ "\n\\subsection{", # Subsection headers
33
+ "\n\\subsubsection{", # Subsubsection headers
34
+ "\n\\title{", # Title commands
35
+ "\n\\begin{", # Begin environments
36
+ "\n\\end{", # End environments
37
+ "\n\n", # Paragraph breaks
38
+ "\n", # Line breaks
39
+ ". ", # Sentence breaks
40
+ " ", # Word breaks
41
+ "" # Character breaks
42
+ ],
43
+ keep_separator=True,
44
+ add_start_index=True
45
+ )
46
+
47
+ # Regex patterns for LaTeX structures
48
+ self.latex_table_pattern = re.compile(
49
+ r'\\begin\{tabular\}.*?\\end\{tabular\}',
50
+ re.DOTALL | re.MULTILINE
51
+ )
52
+
53
+ self.latex_title_pattern = re.compile(
54
+ r'\\title\{[^}]*\}',
55
+ re.MULTILINE
56
+ )
57
+
58
+ self.latex_section_pattern = re.compile(
59
+ r'\\(?:sub)*section\*?\{[^}]*\}',
60
+ re.MULTILINE
61
+ )
62
+
63
+ self.latex_environment_pattern = re.compile(
64
+ r'\\begin\{[^}]+\}.*?\\end\{[^}]+\}',
65
+ re.DOTALL | re.MULTILINE
66
+ )
67
+
68
+ logger.info(f"LaTeX-aware chunker initialized with chunk_size={chunk_size}, overlap={chunk_overlap}")
69
+
70
+ def extract_latex_structures(self, content: str) -> Tuple[List[Tuple[int, int, str]], str]:
71
+ """
72
+ Extract LaTeX tables and environments, replacing them with placeholders.
73
+
74
+ Args:
75
+ content: Original LaTeX content
76
+
77
+ Returns:
78
+ Tuple of (structures_list, content_with_placeholders)
79
+ """
80
+ structures = []
81
+
82
+ # Find all tabular environments (highest priority)
83
+ for match in self.latex_table_pattern.finditer(content):
84
+ structures.append((
85
+ match.start(),
86
+ match.end(),
87
+ "latex_table",
88
+ match.group()
89
+ ))
90
+
91
+ # Find other LaTeX environments (avoid overlapping with tables)
92
+ for match in self.latex_environment_pattern.finditer(content):
93
+ # Check if this environment overlaps with any table
94
+ overlaps_with_table = any(
95
+ table_start <= match.start() < table_end or
96
+ table_start < match.end() <= table_end
97
+ for table_start, table_end, struct_type, _ in structures
98
+ if struct_type == "latex_table"
99
+ )
100
+
101
+ if not overlaps_with_table and "tabular" not in match.group():
102
+ structures.append((
103
+ match.start(),
104
+ match.end(),
105
+ "latex_environment",
106
+ match.group()
107
+ ))
108
+
109
+ # Find titles and sections
110
+ for match in self.latex_title_pattern.finditer(content):
111
+ structures.append((
112
+ match.start(),
113
+ match.end(),
114
+ "latex_title",
115
+ match.group()
116
+ ))
117
+
118
+ for match in self.latex_section_pattern.finditer(content):
119
+ structures.append((
120
+ match.start(),
121
+ match.end(),
122
+ "latex_section",
123
+ match.group()
124
+ ))
125
+
126
+ # Sort by start position
127
+ structures.sort(key=lambda x: x[0])
128
+
129
+ # Replace structures with placeholders
130
+ content_with_placeholders = content
131
+ offset = 0
132
+
133
+ for i, (start, end, struct_type, struct_content) in enumerate(structures):
134
+ placeholder = f"\n\n__LATEX_STRUCTURE_{i}_{struct_type.upper()}__\n\n"
135
+
136
+ # Adjust positions based on previous replacements
137
+ adjusted_start = start - offset
138
+ adjusted_end = end - offset
139
+
140
+ content_with_placeholders = (
141
+ content_with_placeholders[:adjusted_start] +
142
+ placeholder +
143
+ content_with_placeholders[adjusted_end:]
144
+ )
145
+
146
+ # Update offset for next replacement
147
+ offset += (end - start) - len(placeholder)
148
+
149
+ return structures, content_with_placeholders
150
+
151
+ def restore_latex_structures(self, chunks: List[str], structures: List[Tuple[int, int, str, str]]) -> List[str]:
152
+ """
153
+ Restore LaTeX structures in chunks, keeping tables and environments intact.
154
+
155
+ Args:
156
+ chunks: List of text chunks with placeholders
157
+ structures: List of original structures
158
+
159
+ Returns:
160
+ List of chunks with restored structures
161
+ """
162
+ restored_chunks = []
163
+
164
+ for chunk in chunks:
165
+ restored_chunk = chunk
166
+
167
+ # Find placeholders in this chunk
168
+ placeholder_pattern = re.compile(r'__LATEX_STRUCTURE_(\d+)_(\w+)__')
169
+
170
+ for match in placeholder_pattern.finditer(chunk):
171
+ structure_index = int(match.group(1))
172
+
173
+ if structure_index < len(structures):
174
+ original_structure = structures[structure_index][3]
175
+ restored_chunk = restored_chunk.replace(match.group(), original_structure)
176
+
177
+ restored_chunks.append(restored_chunk)
178
+
179
+ return restored_chunks
180
+
181
+ def chunk_document(self, content: str, source_metadata: Dict[str, Any]) -> List[Document]:
182
+ """
183
+ Chunk a LaTeX document while preserving LaTeX structures.
184
+
185
+ Args:
186
+ content: The LaTeX content to chunk
187
+ source_metadata: Metadata about the source document
188
+
189
+ Returns:
190
+ List of Document objects with chunked content and enhanced metadata
191
+ """
192
+ try:
193
+ # Extract LaTeX structures and replace with placeholders
194
+ structures, content_with_placeholders = self.extract_latex_structures(content)
195
+
196
+ # Create a document object with placeholders
197
+ doc = Document(
198
+ page_content=content_with_placeholders,
199
+ metadata=source_metadata
200
+ )
201
+
202
+ # Split the document into chunks
203
+ chunks = self.text_splitter.split_documents([doc])
204
+
205
+ # Restore LaTeX structures in chunks
206
+ chunk_contents = [chunk.page_content for chunk in chunks]
207
+ restored_contents = self.restore_latex_structures(chunk_contents, structures)
208
+
209
+ # Create enhanced chunks with restored content
210
+ enhanced_chunks = []
211
+ for i, (chunk, restored_content) in enumerate(zip(chunks, restored_contents)):
212
+ # Add chunk-specific metadata
213
+ chunk.metadata.update({
214
+ "chunk_index": i,
215
+ "total_chunks": len(chunks),
216
+ "chunk_size": len(restored_content),
217
+ "chunk_id": f"{source_metadata.get('source_id', 'unknown')}_{i}",
218
+ "has_latex_table": "\\begin{tabular}" in restored_content,
219
+ "has_latex_environment": "\\begin{" in restored_content and "\\end{" in restored_content,
220
+ "has_latex_math": "\\(" in restored_content or "$" in restored_content,
221
+ "content_type": "latex"
222
+ })
223
+
224
+ # Update the chunk content with restored structures
225
+ chunk.page_content = restored_content
226
+ enhanced_chunks.append(chunk)
227
+
228
+ logger.info(f"LaTeX document chunked into {len(enhanced_chunks)} structure-aware pieces")
229
+ return enhanced_chunks
230
+
231
+ except Exception as e:
232
+ logger.error(f"Error chunking LaTeX document: {e}")
233
+ # Fallback to regular chunking if LaTeX processing fails
234
+ return self._fallback_chunk(content, source_metadata)
235
+
236
+ def _fallback_chunk(self, content: str, source_metadata: Dict[str, Any]) -> List[Document]:
237
+ """Fallback chunking method if LaTeX-aware chunking fails."""
238
+ try:
239
+ doc = Document(page_content=content, metadata=source_metadata)
240
+ chunks = self.text_splitter.split_documents([doc])
241
+
242
+ for i, chunk in enumerate(chunks):
243
+ chunk.metadata.update({
244
+ "chunk_index": i,
245
+ "total_chunks": len(chunks),
246
+ "chunk_size": len(chunk.page_content),
247
+ "chunk_id": f"{source_metadata.get('source_id', 'unknown')}_{i}",
248
+ "content_type": "latex"
249
+ })
250
+
251
+ logger.warning(f"Used fallback chunking for LaTeX content: {len(chunks)} pieces")
252
+ return chunks
253
+
254
+ except Exception as e:
255
+ logger.error(f"Error in LaTeX fallback chunking: {e}")
256
+ raise
257
+
258
  class MarkdownAwareChunker:
259
  """Handles markdown-aware document chunking that preserves tables and structures."""
260
 
 
516
 
517
  return preview
518
 
519
+ class UnifiedDocumentChunker:
520
+ """Unified chunker that handles both Markdown and LaTeX content types."""
521
+
522
+ def __init__(self):
523
+ """Initialize the unified chunker with both markdown and LaTeX chunkers."""
524
+ self.markdown_chunker = MarkdownAwareChunker(chunk_size=1000, chunk_overlap=200)
525
+ self.latex_chunker = LaTeXAwareChunker(chunk_size=1200, chunk_overlap=150)
526
+ logger.info("Unified document chunker initialized with both Markdown and LaTeX support")
527
+
528
+ def chunk_document(self, content: str, source_metadata: Dict[str, Any]) -> List[Document]:
529
+ """
530
+ Chunk a document using the appropriate chunker based on content type.
531
+
532
+ Args:
533
+ content: The document content to chunk
534
+ source_metadata: Metadata about the source document
535
+
536
+ Returns:
537
+ List of Document objects with chunked content and enhanced metadata
538
+ """
539
+ # Determine content type from metadata or content analysis
540
+ content_type = source_metadata.get('doc_type', 'markdown').lower()
541
+
542
+ # Override content type detection for GOT-OCR results
543
+ if source_metadata.get('conversion_method', '').startswith('GOT-OCR'):
544
+ content_type = 'latex'
545
+
546
+ # Auto-detect content type if not specified
547
+ if content_type not in ['markdown', 'latex']:
548
+ if self._is_latex_content(content):
549
+ content_type = 'latex'
550
+ else:
551
+ content_type = 'markdown'
552
+
553
+ # Use appropriate chunker
554
+ if content_type == 'latex':
555
+ logger.info("Using LaTeX-aware chunker for document")
556
+ return self.latex_chunker.chunk_document(content, source_metadata)
557
+ else:
558
+ logger.info("Using Markdown-aware chunker for document")
559
+ return self.markdown_chunker.chunk_document(content, source_metadata)
560
+
561
+ def _is_latex_content(self, content: str) -> bool:
562
+ """
563
+ Auto-detect if content is LaTeX based on common LaTeX commands.
564
+
565
+ Args:
566
+ content: Content to analyze
567
+
568
+ Returns:
569
+ True if content appears to be LaTeX, False otherwise
570
+ """
571
+ latex_indicators = [
572
+ r'\\begin\{',
573
+ r'\\end\{',
574
+ r'\\title\{',
575
+ r'\\section',
576
+ r'\\subsection',
577
+ r'\\hline',
578
+ r'\\multirow',
579
+ r'\\multicolumn'
580
+ ]
581
+
582
+ # Count LaTeX indicators
583
+ latex_count = sum(1 for indicator in latex_indicators if re.search(indicator, content))
584
+
585
+ # If we find multiple LaTeX indicators, treat as LaTeX
586
+ return latex_count >= 2
587
+
588
+ def chunk_multiple_documents(self, documents: List[Dict[str, Any]]) -> List[Document]:
589
+ """
590
+ Chunk multiple documents using appropriate chunkers.
591
+
592
+ Args:
593
+ documents: List of dictionaries with 'content' and 'metadata' keys
594
+
595
+ Returns:
596
+ List of chunked Document objects
597
+ """
598
+ all_chunks = []
599
+
600
+ for doc_data in documents:
601
+ content = doc_data.get('content', '')
602
+ metadata = doc_data.get('metadata', {})
603
+
604
+ if content.strip(): # Only process non-empty content
605
+ chunks = self.chunk_document(content, metadata)
606
+ all_chunks.extend(chunks)
607
+
608
+ logger.info(f"Chunked {len(documents)} documents into {len(all_chunks)} total chunks")
609
+ return all_chunks
610
+
611
+ def get_chunk_preview(self, chunks: List[Document], max_chunks: int = 5) -> str:
612
+ """
613
+ Generate a preview of chunks for debugging/logging.
614
+
615
+ Args:
616
+ chunks: List of Document chunks
617
+ max_chunks: Maximum number of chunks to include in preview
618
+
619
+ Returns:
620
+ String preview of chunks
621
+ """
622
+ preview = f"Document Chunks Preview ({len(chunks)} total chunks):\n"
623
+ preview += "=" * 50 + "\n"
624
+
625
+ for i, chunk in enumerate(chunks[:max_chunks]):
626
+ content_type = chunk.metadata.get('content_type', 'unknown')
627
+ has_table = chunk.metadata.get('has_table', False) or chunk.metadata.get('has_latex_table', False)
628
+ has_code = chunk.metadata.get('has_code', False) or chunk.metadata.get('has_latex_environment', False)
629
+
630
+ preview += f"Chunk {i + 1} ({content_type}):\n"
631
+ preview += f" Length: {len(chunk.page_content)} characters\n"
632
+ preview += f" Has Table: {has_table}, Has Code/Environment: {has_code}\n"
633
+ preview += f" Metadata: {chunk.metadata}\n"
634
+ preview += f" Content preview: {chunk.page_content[:100]}...\n"
635
+ preview += "-" * 30 + "\n"
636
+
637
+ if len(chunks) > max_chunks:
638
+ preview += f"... and {len(chunks) - max_chunks} more chunks\n"
639
+
640
+ return preview
641
+
642
+ # Global unified chunker instance that supports both Markdown and LaTeX
643
+ document_chunker = UnifiedDocumentChunker()
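
As a quick illustration of the dispatch added above, here is a minimal sketch that exercises the module-level `document_chunker`. The import path and API follow this diff; the sample content and metadata values are made up for illustration.

```python
from src.rag.chunking import document_chunker  # UnifiedDocumentChunker instance

latex_sample = (
    "\\title{Results}\n"
    "\\section{Numbers}\n"
    "\\begin{tabular}{|c|c|} \\hline A & B \\\\ \\hline 1 & 2 \\\\ \\hline \\end{tabular}\n"
)

# A conversion_method starting with "GOT-OCR" forces the LaTeX-aware chunker;
# otherwise doc_type (or auto-detection of LaTeX commands) decides.
chunks = document_chunker.chunk_document(
    latex_sample,
    {"source_id": "demo", "conversion_method": "GOT-OCR (jpg,png only)"},
)
print(document_chunker.get_chunk_preview(chunks))
```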
src/rag/ingestion.py CHANGED
@@ -75,16 +75,18 @@ class DocumentIngestionService:
75
  logger.error(f"Error deleting existing document: {e}")
76
  return False
77
 
78
- def ingest_markdown_content(self,
79
- markdown_content: str,
80
- source_path: Optional[str] = None,
81
- metadata: Optional[Dict[str, Any]] = None,
82
- original_file_content: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]:
 
83
  """
84
- Ingest markdown content into the RAG system.
85
 
86
  Args:
87
- markdown_content: The markdown content to ingest
 
88
  source_path: Optional source path/filename
89
  metadata: Optional additional metadata
90
  original_file_content: Original file content for hash calculation
@@ -93,11 +95,11 @@ class DocumentIngestionService:
93
  Tuple of (success, message, ingestion_stats)
94
  """
95
  try:
96
- if not markdown_content or not markdown_content.strip():
97
  return False, "No content provided for ingestion", {}
98
 
99
- # Create file hash using original content if available, otherwise use markdown content
100
- file_content_for_hash = original_file_content or markdown_content
101
  file_hash = self.create_file_hash(file_content_for_hash)
102
 
103
  # Check for duplicates in vector store
@@ -115,16 +117,16 @@ class DocumentIngestionService:
115
  # Prepare document metadata with file hash
116
  doc_metadata = self.prepare_document_metadata(
117
  source_path=source_path,
118
- doc_type="markdown",
119
  additional_metadata=metadata
120
  )
121
  doc_metadata["file_hash"] = file_hash
122
- doc_metadata["content_length"] = len(markdown_content)
123
  doc_metadata["upload_timestamp"] = datetime.now().isoformat()
124
 
125
- # Chunk the document using markdown-aware chunking
126
- logger.info(f"Chunking document: {file_hash}")
127
- chunks = document_chunker.chunk_document(markdown_content, doc_metadata)
128
 
129
  if not chunks:
130
  return False, "Failed to create document chunks", {}
@@ -142,7 +144,7 @@ class DocumentIngestionService:
142
  "file_hash": file_hash,
143
  "total_chunks": len(chunks),
144
  "document_ids": doc_ids,
145
- "content_length": len(markdown_content),
146
  "has_tables": any(chunk.metadata.get("has_table", False) for chunk in chunks),
147
  "has_code": any(chunk.metadata.get("has_code", False) for chunk in chunks),
148
  "processed_at": datetime.now().isoformat(),
@@ -160,6 +162,22 @@ class DocumentIngestionService:
160
  logger.error(error_msg)
161
  return False, error_msg, {"status": "error", "error": str(e)}
162
 
163
  def ingest_from_conversion_result(self, conversion_result: Dict[str, Any]) -> Tuple[bool, str, Dict[str, Any]]:
164
  """
165
  Ingest a document from Markit conversion result.
@@ -189,9 +207,13 @@ class DocumentIngestionService:
189
  "conversion_time": conversion_result.get("conversion_time", 0)
190
  }
191
 
192
- # Ingest the markdown content with original file content for proper hashing
193
- return self.ingest_markdown_content(
194
- markdown_content=markdown_content,
195
  source_path=original_filename,
196
  metadata=additional_metadata,
197
  original_file_content=original_file_content
 
75
  logger.error(f"Error deleting existing document: {e}")
76
  return False
77
 
78
+ def ingest_text_content(self,
79
+ text_content: str,
80
+ content_type: str = "markdown",
81
+ source_path: Optional[str] = None,
82
+ metadata: Optional[Dict[str, Any]] = None,
83
+ original_file_content: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]:
84
  """
85
+ Ingest text content (markdown or LaTeX) into the RAG system.
86
 
87
  Args:
88
+ text_content: The text content to ingest (markdown or LaTeX)
89
+ content_type: Type of content ("markdown" or "latex")
90
  source_path: Optional source path/filename
91
  metadata: Optional additional metadata
92
  original_file_content: Original file content for hash calculation
 
95
  Tuple of (success, message, ingestion_stats)
96
  """
97
  try:
98
+ if not text_content or not text_content.strip():
99
  return False, "No content provided for ingestion", {}
100
 
101
+ # Create file hash using original content if available, otherwise use text content
102
+ file_content_for_hash = original_file_content or text_content
103
  file_hash = self.create_file_hash(file_content_for_hash)
104
 
105
  # Check for duplicates in vector store
 
117
  # Prepare document metadata with file hash
118
  doc_metadata = self.prepare_document_metadata(
119
  source_path=source_path,
120
+ doc_type=content_type, # Use content_type instead of hardcoded "markdown"
121
  additional_metadata=metadata
122
  )
123
  doc_metadata["file_hash"] = file_hash
124
+ doc_metadata["content_length"] = len(text_content)
125
  doc_metadata["upload_timestamp"] = datetime.now().isoformat()
126
 
127
+ # Chunk the document using text-aware chunking
128
+ logger.info(f"Chunking {content_type} document: {file_hash}")
129
+ chunks = document_chunker.chunk_document(text_content, doc_metadata)
130
 
131
  if not chunks:
132
  return False, "Failed to create document chunks", {}
 
144
  "file_hash": file_hash,
145
  "total_chunks": len(chunks),
146
  "document_ids": doc_ids,
147
+ "content_length": len(text_content),
148
  "has_tables": any(chunk.metadata.get("has_table", False) for chunk in chunks),
149
  "has_code": any(chunk.metadata.get("has_code", False) for chunk in chunks),
150
  "processed_at": datetime.now().isoformat(),
 
162
  logger.error(error_msg)
163
  return False, error_msg, {"status": "error", "error": str(e)}
164
 
165
+ def ingest_markdown_content(self,
166
+ markdown_content: str,
167
+ source_path: Optional[str] = None,
168
+ metadata: Optional[Dict[str, Any]] = None,
169
+ original_file_content: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]:
170
+ """
171
+ Backward compatibility method for ingesting markdown content.
172
+ """
173
+ return self.ingest_text_content(
174
+ text_content=markdown_content,
175
+ content_type="markdown",
176
+ source_path=source_path,
177
+ metadata=metadata,
178
+ original_file_content=original_file_content
179
+ )
180
+
181
  def ingest_from_conversion_result(self, conversion_result: Dict[str, Any]) -> Tuple[bool, str, Dict[str, Any]]:
182
  """
183
  Ingest a document from Markit conversion result.
 
207
  "conversion_time": conversion_result.get("conversion_time", 0)
208
  }
209
 
210
+ # Determine content type based on conversion method
211
+ content_type = "latex" if "GOT-OCR" in conversion_method else "markdown"
212
+
213
+ # Ingest the content with original file content for proper hashing
214
+ return self.ingest_text_content(
215
+ text_content=markdown_content,
216
+ content_type=content_type,
217
  source_path=original_filename,
218
  metadata=additional_metadata,
219
  original_file_content=original_file_content
src/services/document_service.py CHANGED
@@ -94,9 +94,15 @@ class DocumentService:
94
  return temp_path
95
 
96
  def _process_latex_content(self, content: str, parser_name: str, ocr_method_name: str) -> str:
97
- """Process LaTeX content for GOT-OCR formatted text."""
98
- if (parser_name == "GOT-OCR (jpg,png only)" and
99
- ocr_method_name == "Formatted Text" and
100
  config.api.google_api_key):
101
 
102
  logging.info("Converting LaTeX output to Markdown using Gemini API")
@@ -106,6 +112,7 @@ class DocumentService:
106
  raise ConversionError("Conversion cancelled before LaTeX conversion")
107
 
108
  try:
 
109
  markdown_content = convert_latex_to_markdown(content)
110
  if markdown_content:
111
  logging.info(f"LaTeX conversion completed in {time.time() - start_convert:.2f} seconds")
@@ -118,16 +125,21 @@ class DocumentService:
118
 
119
  return content
120
 
121
- def _create_output_file(self, content: str, output_format: str, original_file_path: Optional[str] = None) -> str:
122
  """Create output file with proper extension and preserved filename."""
123
- # Determine file extension
124
- format_extensions = {
125
- "markdown": ".md",
126
- "json": ".json",
127
- "text": ".txt",
128
- "document tags": ".doctags"
129
- }
130
- ext = format_extensions.get(output_format.lower(), ".txt")
131
 
132
  if self._check_cancellation():
133
  raise ConversionError("Conversion cancelled before output file creation")
@@ -247,7 +259,7 @@ class DocumentService:
247
  raise ConversionError("Conversion cancelled")
248
 
249
  # Create output file
250
- output_path = self._create_output_file(content, output_format, file_path)
251
 
252
  return content, output_path
253
 
 
94
  return temp_path
95
 
96
  def _process_latex_content(self, content: str, parser_name: str, ocr_method_name: str) -> str:
97
+ """Process LaTeX content - for GOT-OCR, return raw LaTeX without conversion."""
98
+ # For GOT-OCR, skip LLM conversion and return raw LaTeX
99
+ if parser_name == "GOT-OCR (jpg,png only)":
100
+ logging.info("GOT-OCR detected: returning raw LaTeX output (no LLM conversion)")
101
+ return content
102
+
103
+ # For other parsers with LaTeX content, process as before
104
+ if (content and
105
+ ("\\begin" in content or "\\end" in content or "$" in content) and
106
  config.api.google_api_key):
107
 
108
  logging.info("Converting LaTeX output to Markdown using Gemini API")
 
112
  raise ConversionError("Conversion cancelled before LaTeX conversion")
113
 
114
  try:
115
+ from src.core.latex_to_markdown_converter import convert_latex_to_markdown
116
  markdown_content = convert_latex_to_markdown(content)
117
  if markdown_content:
118
  logging.info(f"LaTeX conversion completed in {time.time() - start_convert:.2f} seconds")
 
125
 
126
  return content
127
 
128
+ def _create_output_file(self, content: str, output_format: str, original_file_path: Optional[str] = None, parser_name: Optional[str] = None) -> str:
129
  """Create output file with proper extension and preserved filename."""
130
+ # Determine file extension based on parser and format
131
+ if parser_name == "GOT-OCR (jpg,png only)":
132
+ # For GOT-OCR, use .tex extension
133
+ ext = ".tex"
134
+ else:
135
+ # For other parsers, use format-based extensions
136
+ format_extensions = {
137
+ "markdown": ".md",
138
+ "json": ".json",
139
+ "text": ".txt",
140
+ "document tags": ".doctags"
141
+ }
142
+ ext = format_extensions.get(output_format.lower(), ".txt")
143
 
144
  if self._check_cancellation():
145
  raise ConversionError("Conversion cancelled before output file creation")
 
259
  raise ConversionError("Conversion cancelled")
260
 
261
  # Create output file
262
+ output_path = self._create_output_file(content, output_format, file_path, parser_name)
263
 
264
  return content, output_path
265
 
src/ui/ui.py CHANGED
@@ -131,6 +131,174 @@ def format_markdown_content(content):
131
  html_content = markdown.markdown(str(content), extensions=['tables'])
132
  return html_content
133
 
134
  # Function to run conversion in a separate thread
135
  def run_conversion_thread(file_path, parser_name, ocr_method_name, output_format):
136
  """Run the conversion in a separate thread and return the thread object"""
@@ -264,9 +432,15 @@ def handle_convert(files, parser_name, ocr_method_name, output_format, processin
264
  logger.info("Converter returned cancellation message")
265
  return content, None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
266
 
267
- # Format the content and wrap it in the scrollable container
268
- formatted_content = format_markdown_content(str(content))
269
- html_output = f"<div class='output-container'>{formatted_content}</div>"
270
 
271
  logger.info("Conversion completed successfully")
272
 
 
131
  html_content = markdown.markdown(str(content), extensions=['tables'])
132
  return html_content
133
 
134
+ def render_latex_to_html(latex_content):
135
+ """Convert LaTeX content to HTML using Mathpix Markdown like GOT-OCR demo."""
136
+ import json
137
+
138
+ # Clean up the content similar to GOT-OCR demo
139
+ content = latex_content.strip()
140
+ if content.endswith("<|im_end|>"):
141
+ content = content[:-len("<|im_end|>")]
142
+
143
+ # Fix unbalanced delimiters exactly like GOT-OCR demo
144
+ right_num = content.count("\\right")
145
+ left_num = content.count("\\left")
146
+
147
+ if right_num != left_num:
148
+ content = (
149
+ content.replace("\\left(", "(")
150
+ .replace("\\right)", ")")
151
+ .replace("\\left[", "[")
152
+ .replace("\\right]", "]")
153
+ .replace("\\left{", "{")
154
+ .replace("\\right}", "}")
155
+ .replace("\\left|", "|")
156
+ .replace("\\right|", "|")
157
+ .replace("\\left.", ".")
158
+ .replace("\\right.", ".")
159
+ )
160
+
161
+ # Process content like GOT-OCR demo: remove $ signs and replace quotes
162
+ content = content.replace('"', "``").replace("$", "")
163
+
164
+ # Split into lines and create JavaScript string like GOT-OCR demo
165
+ outputs_list = content.split("\n")
166
+ js_text_parts = []
167
+ for line in outputs_list:
168
+ # Escape backslashes and add line break
169
+ escaped_line = line.replace("\\", "\\\\")
170
+ js_text_parts.append(f'"{escaped_line}\\n"')
171
+
172
+ # Join with + like in GOT-OCR demo
173
+ js_text = " + ".join(js_text_parts)
174
+
175
+ # Create HTML using Mathpix Markdown like GOT-OCR demo
176
+ html_content = f"""<!DOCTYPE html>
177
+ <html lang="en" data-lt-installed="true">
178
+ <head>
179
+ <meta charset="UTF-8">
180
+ <title>LaTeX Content</title>
181
+ <script>
182
+ const text = {js_text};
183
+ </script>
184
+ <style>
185
+ #content {{
186
+ max-width: 800px;
187
+ margin: auto;
188
+ padding: 20px;
189
+ }}
190
+ body {{
191
+ font-family: 'Times New Roman', serif;
192
+ line-height: 1.6;
193
+ background-color: #ffffff;
194
+ color: #333;
195
+ }}
196
+ table {{
197
+ border-collapse: collapse;
198
+ width: 100%;
199
+ margin: 20px 0;
200
+ }}
201
+ td, th {{
202
+ border: 1px solid #333;
203
+ padding: 8px 12px;
204
+ text-align: center;
205
+ vertical-align: middle;
206
+ }}
207
+ </style>
208
+ <script>
209
+ let script = document.createElement('script');
210
+ script.src = "https://cdn.jsdelivr.net/npm/mathpix-markdown-it@1.3.6/es5/bundle.js";
211
+ document.head.append(script);
212
+ script.onload = function() {{
213
+ const isLoaded = window.loadMathJax();
214
+ if (isLoaded) {{
215
+ console.log('Styles loaded!')
216
+ }}
217
+ const el = window.document.getElementById('content-text');
218
+ if (el) {{
219
+ const options = {{
220
+ htmlTags: true
221
+ }};
222
+ const html = window.render(text, options);
223
+ el.outerHTML = html;
224
+ }}
225
+ }};
226
+ </script>
227
+ </head>
228
+ <body>
229
+ <div id="content">
230
+ <div id="content-text"></div>
231
+ </div>
232
+ </body>
233
+ </html>"""
234
+
235
+ return html_content
236
+
237
+ def format_latex_content(content):
238
+ """Format LaTeX content for display in UI using MathJax rendering like GOT-OCR demo."""
239
+ if not content:
240
+ return content
241
+
242
+ try:
243
+ # Generate rendered HTML
244
+ rendered_html = render_latex_to_html(content)
245
+
246
+ # Encode for iframe display (similar to GOT-OCR demo)
247
+ import base64
248
+ encoded_html = base64.b64encode(rendered_html.encode("utf-8")).decode("utf-8")
249
+ iframe_src = f"data:text/html;base64,{encoded_html}"
250
+
251
+ # Create the display with both rendered and raw views
252
+ formatted_content = f"""
253
+ <div style="background-color: #f8f9fa; border-radius: 8px; border: 1px solid #e9ecef; margin: 10px 0;">
254
+ <div style="background-color: #e9ecef; padding: 10px; border-radius: 8px 8px 0 0; font-weight: bold; color: #495057;">
255
+ 📄 LaTeX Content (Rendered with MathJax)
256
+ </div>
257
+ <div style="padding: 0;">
258
+ <iframe src="{iframe_src}" width="100%" height="500px" style="border: none; border-radius: 0 0 8px 8px;"></iframe>
259
+ </div>
260
+ <div style="background-color: #e9ecef; padding: 8px 15px; border-radius: 0; font-size: 12px; color: #6c757d; border-top: 1px solid #dee2e6;">
261
+ 💡 LaTeX content rendered with MathJax. Tables and formulas are displayed as they would appear in a LaTeX document.
262
+ </div>
263
+ <details style="margin: 0; border-top: 1px solid #dee2e6;">
264
+ <summary style="padding: 8px 15px; background-color: #e9ecef; cursor: pointer; font-size: 12px; color: #6c757d;">
265
+ πŸ“ View Raw LaTeX Source
266
+ </summary>
267
+ <div style="padding: 15px; background-color: #f8f9fa;">
268
+ <pre style="background-color: transparent; margin: 0; padding: 0;
269
+ font-family: 'Courier New', monospace; font-size: 12px; line-height: 1.4;
270
+ white-space: pre-wrap; word-wrap: break-word; color: #2c3e50; max-height: 200px; overflow-y: auto;">
271
+ {content}
272
+ </pre>
273
+ </div>
274
+ </details>
275
+ </div>
276
+ """
277
+
278
+ except Exception as e:
279
+ # Fallback to simple formatting if rendering fails
280
+ import html
281
+ escaped_content = html.escape(str(content))
282
+ formatted_content = f"""
283
+ <div style="background-color: #f8f9fa; border-radius: 8px; border: 1px solid #e9ecef; margin: 10px 0;">
284
+ <div style="background-color: #e9ecef; padding: 10px; border-radius: 8px 8px 0 0; font-weight: bold; color: #495057;">
285
+ 📄 LaTeX Content (Fallback View)
286
+ </div>
287
+ <div style="padding: 15px;">
288
+ <pre style="background-color: transparent; margin: 0; padding: 0;
289
+ font-family: 'Courier New', monospace; font-size: 14px; line-height: 1.4;
290
+ white-space: pre-wrap; word-wrap: break-word; color: #2c3e50;">
291
+ {escaped_content}
292
+ </pre>
293
+ </div>
294
+ <div style="background-color: #e9ecef; padding: 8px 15px; border-radius: 0 0 8px 8px; font-size: 12px; color: #6c757d;">
295
+ ⚠️ Rendering failed, showing raw LaTeX. Error: {str(e)}
296
+ </div>
297
+ </div>
298
+ """
299
+
300
+ return formatted_content
301
+
302
  # Function to run conversion in a separate thread
303
  def run_conversion_thread(file_path, parser_name, ocr_method_name, output_format):
304
  """Run the conversion in a separate thread and return the thread object"""
 
432
  logger.info("Converter returned cancellation message")
433
  return content, None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
434
 
435
+ # Format the content based on parser type
436
+ if "GOT-OCR" in parser_name:
437
+ # For GOT-OCR, display as LaTeX
438
+ formatted_content = format_latex_content(str(content))
439
+ html_output = f"<div class='output-container'>{formatted_content}</div>"
440
+ else:
441
+ # For other parsers, display as Markdown
442
+ formatted_content = format_markdown_content(str(content))
443
+ html_output = f"<div class='output-container'>{formatted_content}</div>"
444
 
445
  logger.info("Conversion completed successfully")
446
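
To preview the MathJax rendering outside the Gradio UI, a small sketch like the following could be used. It assumes `format_latex_content` is importable at module level from `src/ui/ui.py`, as the diff suggests; the sample LaTeX and output filename are illustrative.

```python
from src.ui.ui import format_latex_content  # assumed module-level import

html_block = format_latex_content(r"\begin{tabular}{cc} x & y \\ 1 & 2 \end{tabular}")

# Write the generated HTML (iframe + collapsible raw LaTeX source) to a file
# and open it in a browser to inspect the rendered table.
with open("latex_preview.html", "w", encoding="utf-8") as fh:
    fh.write(html_block)
```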