AnseMin committed
Commit 63279a9 · 1 Parent(s): 623ad58

Refactor document ingestion and chunking to support LaTeX content


- Updated `DocumentIngestionService` to generalize content ingestion: a new `ingest_text_content` method accepts both Markdown and LaTeX content (see the sketch after this list).
- Introduced `LaTeXAwareChunker` for chunking LaTeX documents while preserving LaTeX structures (tables, environments, titles, and sections).
- Added `UnifiedDocumentChunker` to route documents to the Markdown- or LaTeX-aware chunker based on content type.
- Modified `_process_latex_content` to return raw LaTeX for GOT-OCR output, skipping LLM conversion.
- Improved UI rendering for LaTeX content, displaying it with MathJax (via Mathpix Markdown) in an embedded iframe.
- Kept `ingest_markdown_content` as a backward-compatible wrapper around the new ingestion method.
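
For reference, a minimal usage sketch of the generalized ingestion path. The class, method, and parameter names come from this commit's diff; the default construction of the service and the sample inputs are assumptions for illustration only.

```python
from src.rag.ingestion import DocumentIngestionService

service = DocumentIngestionService()  # assumed default construction

# LaTeX text (e.g. raw GOT-OCR output) is ingested with content_type="latex",
# which routes chunking to the LaTeX-aware chunker.
latex_sample = (
    "\\title{Scanned Table}\n"
    "\\begin{tabular}{cc} a & b \\\\ 1 & 2 \\end{tabular}\n"
)
ok, message, stats = service.ingest_text_content(
    text_content=latex_sample,
    content_type="latex",
    source_path="scan_001.png",
)

# Existing callers keep working through the backward-compatible wrapper.
ok, message, stats = service.ingest_markdown_content(
    markdown_content="# Heading\n\nSome markdown body.",
    source_path="notes.md",
)
```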

src/parsers/got_ocr_parser.py CHANGED
@@ -85,7 +85,7 @@ class GotOcrParser(DocumentParser):
85
  **kwargs: Additional arguments to pass to the model
86
 
87
  Returns:
88
- Extracted text from the image, converted to Markdown if formatted
89
  """
90
  # Verify dependencies are installed without initializing CUDA
91
  if not self._check_dependencies():
@@ -131,15 +131,23 @@ class GotOcrParser(DocumentParser):
131
  image_path_str = str(file_path)
132
 
133
  # Call the wrapper function that handles ZeroGPU safely
134
- return self._safe_gpu_process(image_path_str, use_format, **safe_kwargs)
135
  else:
136
  # Fallback for environments without spaces
137
- return self._process_image_without_gpu(
138
  str(file_path),
139
  use_format=use_format,
140
  **safe_kwargs
141
  )
142
 
143
  except Exception as e:
144
  logger.error(f"Error processing image with GOT-OCR: {str(e)}")
145
 
@@ -195,7 +203,7 @@ class GotOcrParser(DocumentParser):
195
  image = load_image(image_path)
196
 
197
  # Load processor and model
198
- processor = AutoProcessor.from_pretrained(MODEL_NAME)
199
 
200
  # Use CPU if in main process to avoid CUDA initialization issues
201
  device = "cpu"
@@ -285,7 +293,7 @@ class GotOcrParser(DocumentParser):
285
  logger.info(f"Loading GOT-OCR model from {MODEL_NAME} on {device}")
286
 
287
  # Load processor
288
- processor = AutoProcessor.from_pretrained(MODEL_NAME)
289
 
290
  # Load model
291
  model = AutoModelForImageTextToText.from_pretrained(
 
85
  **kwargs: Additional arguments to pass to the model
86
 
87
  Returns:
88
+ Extracted text from the image as raw LaTeX
89
  """
90
  # Verify dependencies are installed without initializing CUDA
91
  if not self._check_dependencies():
 
131
  image_path_str = str(file_path)
132
 
133
  # Call the wrapper function that handles ZeroGPU safely
134
+ result = self._safe_gpu_process(image_path_str, use_format, **safe_kwargs)
135
  else:
136
  # Fallback for environments without spaces
137
+ result = self._process_image_without_gpu(
138
  str(file_path),
139
  use_format=use_format,
140
  **safe_kwargs
141
  )
142
 
143
+ # Add a small delay to replace LLM conversion time
144
+ import time
145
+ time.sleep(2) # 2 second delay to simulate processing time
146
+
147
+ # Return raw LaTeX output (no LLM conversion)
148
+ logger.info("Returning raw LaTeX output (no LLM conversion)")
149
+ return result
150
+
151
  except Exception as e:
152
  logger.error(f"Error processing image with GOT-OCR: {str(e)}")
153
 
 
203
  image = load_image(image_path)
204
 
205
  # Load processor and model
206
+ processor = AutoProcessor.from_pretrained(MODEL_NAME, use_fast=True)
207
 
208
  # Use CPU if in main process to avoid CUDA initialization issues
209
  device = "cpu"
 
293
  logger.info(f"Loading GOT-OCR model from {MODEL_NAME} on {device}")
294
 
295
  # Load processor
296
+ processor = AutoProcessor.from_pretrained(MODEL_NAME, use_fast=True)
297
 
298
  # Load model
299
  model = AutoModelForImageTextToText.from_pretrained(
src/rag/chunking.py CHANGED
@@ -8,6 +8,253 @@ from src.core.logging_config import get_logger
8
 
9
  logger = get_logger(__name__)
10
 
11
  class MarkdownAwareChunker:
12
  """Handles markdown-aware document chunking that preserves tables and structures."""
13
 
@@ -269,5 +516,128 @@ class MarkdownAwareChunker:
269
 
270
  return preview
271
 
272
- # Global chunker instance with optimized settings for markdown RAG
273
- document_chunker = MarkdownAwareChunker(chunk_size=1000, chunk_overlap=200)
 
8
 
9
  logger = get_logger(__name__)
10
 
11
+ class LaTeXAwareChunker:
12
+ """Handles LaTeX-aware document chunking that preserves LaTeX structures."""
13
+
14
+ def __init__(self, chunk_size: int = 1200, chunk_overlap: int = 150):
15
+ """
16
+ Initialize the LaTeX-aware document chunker.
17
+
18
+ Args:
19
+ chunk_size: Maximum size of each chunk in characters
20
+ chunk_overlap: Number of characters to overlap between chunks
21
+ """
22
+ self.chunk_size = chunk_size
23
+ self.chunk_overlap = chunk_overlap
24
+
25
+ # Initialize the text splitter with LaTeX-aware settings
26
+ self.text_splitter = RecursiveCharacterTextSplitter(
27
+ chunk_size=chunk_size,
28
+ chunk_overlap=chunk_overlap,
29
+ length_function=len,
30
+ separators=[
31
+ "\n\\section{", # Section headers
32
+ "\n\\subsection{", # Subsection headers
33
+ "\n\\subsubsection{", # Subsubsection headers
34
+ "\n\\title{", # Title commands
35
+ "\n\\begin{", # Begin environments
36
+ "\n\\end{", # End environments
37
+ "\n\n", # Paragraph breaks
38
+ "\n", # Line breaks
39
+ ". ", # Sentence breaks
40
+ " ", # Word breaks
41
+ "" # Character breaks
42
+ ],
43
+ keep_separator=True,
44
+ add_start_index=True
45
+ )
46
+
47
+ # Regex patterns for LaTeX structures
48
+ self.latex_table_pattern = re.compile(
49
+ r'\\begin\{tabular\}.*?\\end\{tabular\}',
50
+ re.DOTALL | re.MULTILINE
51
+ )
52
+
53
+ self.latex_title_pattern = re.compile(
54
+ r'\\title\{[^}]*\}',
55
+ re.MULTILINE
56
+ )
57
+
58
+ self.latex_section_pattern = re.compile(
59
+ r'\\(?:sub)*section\*?\{[^}]*\}',
60
+ re.MULTILINE
61
+ )
62
+
63
+ self.latex_environment_pattern = re.compile(
64
+ r'\\begin\{[^}]+\}.*?\\end\{[^}]+\}',
65
+ re.DOTALL | re.MULTILINE
66
+ )
67
+
68
+ logger.info(f"LaTeX-aware chunker initialized with chunk_size={chunk_size}, overlap={chunk_overlap}")
69
+
70
+ def extract_latex_structures(self, content: str) -> Tuple[List[Tuple[int, int, str]], str]:
71
+ """
72
+ Extract LaTeX tables and environments, replacing them with placeholders.
73
+
74
+ Args:
75
+ content: Original LaTeX content
76
+
77
+ Returns:
78
+ Tuple of (structures_list, content_with_placeholders)
79
+ """
80
+ structures = []
81
+
82
+ # Find all tabular environments (highest priority)
83
+ for match in self.latex_table_pattern.finditer(content):
84
+ structures.append((
85
+ match.start(),
86
+ match.end(),
87
+ "latex_table",
88
+ match.group()
89
+ ))
90
+
91
+ # Find other LaTeX environments (avoid overlapping with tables)
92
+ for match in self.latex_environment_pattern.finditer(content):
93
+ # Check if this environment overlaps with any table
94
+ overlaps_with_table = any(
95
+ table_start <= match.start() < table_end or
96
+ table_start < match.end() <= table_end
97
+ for table_start, table_end, struct_type, _ in structures
98
+ if struct_type == "latex_table"
99
+ )
100
+
101
+ if not overlaps_with_table and "tabular" not in match.group():
102
+ structures.append((
103
+ match.start(),
104
+ match.end(),
105
+ "latex_environment",
106
+ match.group()
107
+ ))
108
+
109
+ # Find titles and sections
110
+ for match in self.latex_title_pattern.finditer(content):
111
+ structures.append((
112
+ match.start(),
113
+ match.end(),
114
+ "latex_title",
115
+ match.group()
116
+ ))
117
+
118
+ for match in self.latex_section_pattern.finditer(content):
119
+ structures.append((
120
+ match.start(),
121
+ match.end(),
122
+ "latex_section",
123
+ match.group()
124
+ ))
125
+
126
+ # Sort by start position
127
+ structures.sort(key=lambda x: x[0])
128
+
129
+ # Replace structures with placeholders
130
+ content_with_placeholders = content
131
+ offset = 0
132
+
133
+ for i, (start, end, struct_type, struct_content) in enumerate(structures):
134
+ placeholder = f"\n\n__LATEX_STRUCTURE_{i}_{struct_type.upper()}__\n\n"
135
+
136
+ # Adjust positions based on previous replacements
137
+ adjusted_start = start - offset
138
+ adjusted_end = end - offset
139
+
140
+ content_with_placeholders = (
141
+ content_with_placeholders[:adjusted_start] +
142
+ placeholder +
143
+ content_with_placeholders[adjusted_end:]
144
+ )
145
+
146
+ # Update offset for next replacement
147
+ offset += (end - start) - len(placeholder)
148
+
149
+ return structures, content_with_placeholders
150
+
151
+ def restore_latex_structures(self, chunks: List[str], structures: List[Tuple[int, int, str, str]]) -> List[str]:
152
+ """
153
+ Restore LaTeX structures in chunks, keeping tables and environments intact.
154
+
155
+ Args:
156
+ chunks: List of text chunks with placeholders
157
+ structures: List of original structures
158
+
159
+ Returns:
160
+ List of chunks with restored structures
161
+ """
162
+ restored_chunks = []
163
+
164
+ for chunk in chunks:
165
+ restored_chunk = chunk
166
+
167
+ # Find placeholders in this chunk
168
+ placeholder_pattern = re.compile(r'__LATEX_STRUCTURE_(\d+)_(\w+)__')
169
+
170
+ for match in placeholder_pattern.finditer(chunk):
171
+ structure_index = int(match.group(1))
172
+
173
+ if structure_index < len(structures):
174
+ original_structure = structures[structure_index][3]
175
+ restored_chunk = restored_chunk.replace(match.group(), original_structure)
176
+
177
+ restored_chunks.append(restored_chunk)
178
+
179
+ return restored_chunks
180
+
181
+ def chunk_document(self, content: str, source_metadata: Dict[str, Any]) -> List[Document]:
182
+ """
183
+ Chunk a LaTeX document while preserving LaTeX structures.
184
+
185
+ Args:
186
+ content: The LaTeX content to chunk
187
+ source_metadata: Metadata about the source document
188
+
189
+ Returns:
190
+ List of Document objects with chunked content and enhanced metadata
191
+ """
192
+ try:
193
+ # Extract LaTeX structures and replace with placeholders
194
+ structures, content_with_placeholders = self.extract_latex_structures(content)
195
+
196
+ # Create a document object with placeholders
197
+ doc = Document(
198
+ page_content=content_with_placeholders,
199
+ metadata=source_metadata
200
+ )
201
+
202
+ # Split the document into chunks
203
+ chunks = self.text_splitter.split_documents([doc])
204
+
205
+ # Restore LaTeX structures in chunks
206
+ chunk_contents = [chunk.page_content for chunk in chunks]
207
+ restored_contents = self.restore_latex_structures(chunk_contents, structures)
208
+
209
+ # Create enhanced chunks with restored content
210
+ enhanced_chunks = []
211
+ for i, (chunk, restored_content) in enumerate(zip(chunks, restored_contents)):
212
+ # Add chunk-specific metadata
213
+ chunk.metadata.update({
214
+ "chunk_index": i,
215
+ "total_chunks": len(chunks),
216
+ "chunk_size": len(restored_content),
217
+ "chunk_id": f"{source_metadata.get('source_id', 'unknown')}_{i}",
218
+ "has_latex_table": "\\begin{tabular}" in restored_content,
219
+ "has_latex_environment": "\\begin{" in restored_content and "\\end{" in restored_content,
220
+ "has_latex_math": "\\(" in restored_content or "$" in restored_content,
221
+ "content_type": "latex"
222
+ })
223
+
224
+ # Update the chunk content with restored structures
225
+ chunk.page_content = restored_content
226
+ enhanced_chunks.append(chunk)
227
+
228
+ logger.info(f"LaTeX document chunked into {len(enhanced_chunks)} structure-aware pieces")
229
+ return enhanced_chunks
230
+
231
+ except Exception as e:
232
+ logger.error(f"Error chunking LaTeX document: {e}")
233
+ # Fallback to regular chunking if LaTeX processing fails
234
+ return self._fallback_chunk(content, source_metadata)
235
+
236
+ def _fallback_chunk(self, content: str, source_metadata: Dict[str, Any]) -> List[Document]:
237
+ """Fallback chunking method if LaTeX-aware chunking fails."""
238
+ try:
239
+ doc = Document(page_content=content, metadata=source_metadata)
240
+ chunks = self.text_splitter.split_documents([doc])
241
+
242
+ for i, chunk in enumerate(chunks):
243
+ chunk.metadata.update({
244
+ "chunk_index": i,
245
+ "total_chunks": len(chunks),
246
+ "chunk_size": len(chunk.page_content),
247
+ "chunk_id": f"{source_metadata.get('source_id', 'unknown')}_{i}",
248
+ "content_type": "latex"
249
+ })
250
+
251
+ logger.warning(f"Used fallback chunking for LaTeX content: {len(chunks)} pieces")
252
+ return chunks
253
+
254
+ except Exception as e:
255
+ logger.error(f"Error in LaTeX fallback chunking: {e}")
256
+ raise
257
+
258
  class MarkdownAwareChunker:
259
  """Handles markdown-aware document chunking that preserves tables and structures."""
260
 
 
516
 
517
  return preview
518
 
519
+ class UnifiedDocumentChunker:
520
+ """Unified chunker that handles both Markdown and LaTeX content types."""
521
+
522
+ def __init__(self):
523
+ """Initialize the unified chunker with both markdown and LaTeX chunkers."""
524
+ self.markdown_chunker = MarkdownAwareChunker(chunk_size=1000, chunk_overlap=200)
525
+ self.latex_chunker = LaTeXAwareChunker(chunk_size=1200, chunk_overlap=150)
526
+ logger.info("Unified document chunker initialized with both Markdown and LaTeX support")
527
+
528
+ def chunk_document(self, content: str, source_metadata: Dict[str, Any]) -> List[Document]:
529
+ """
530
+ Chunk a document using the appropriate chunker based on content type.
531
+
532
+ Args:
533
+ content: The document content to chunk
534
+ source_metadata: Metadata about the source document
535
+
536
+ Returns:
537
+ List of Document objects with chunked content and enhanced metadata
538
+ """
539
+ # Determine content type from metadata or content analysis
540
+ content_type = source_metadata.get('doc_type', 'markdown').lower()
541
+
542
+ # Override content type detection for GOT-OCR results
543
+ if source_metadata.get('conversion_method', '').startswith('GOT-OCR'):
544
+ content_type = 'latex'
545
+
546
+ # Auto-detect content type if not specified
547
+ if content_type not in ['markdown', 'latex']:
548
+ if self._is_latex_content(content):
549
+ content_type = 'latex'
550
+ else:
551
+ content_type = 'markdown'
552
+
553
+ # Use appropriate chunker
554
+ if content_type == 'latex':
555
+ logger.info("Using LaTeX-aware chunker for document")
556
+ return self.latex_chunker.chunk_document(content, source_metadata)
557
+ else:
558
+ logger.info("Using Markdown-aware chunker for document")
559
+ return self.markdown_chunker.chunk_document(content, source_metadata)
560
+
561
+ def _is_latex_content(self, content: str) -> bool:
562
+ """
563
+ Auto-detect if content is LaTeX based on common LaTeX commands.
564
+
565
+ Args:
566
+ content: Content to analyze
567
+
568
+ Returns:
569
+ True if content appears to be LaTeX, False otherwise
570
+ """
571
+ latex_indicators = [
572
+ r'\\begin\{',
573
+ r'\\end\{',
574
+ r'\\title\{',
575
+ r'\\section',
576
+ r'\\subsection',
577
+ r'\\hline',
578
+ r'\\multirow',
579
+ r'\\multicolumn'
580
+ ]
581
+
582
+ # Count LaTeX indicators
583
+ latex_count = sum(1 for indicator in latex_indicators if re.search(indicator, content))
584
+
585
+ # If we find multiple LaTeX indicators, treat as LaTeX
586
+ return latex_count >= 2
587
+
588
+ def chunk_multiple_documents(self, documents: List[Dict[str, Any]]) -> List[Document]:
589
+ """
590
+ Chunk multiple documents using appropriate chunkers.
591
+
592
+ Args:
593
+ documents: List of dictionaries with 'content' and 'metadata' keys
594
+
595
+ Returns:
596
+ List of chunked Document objects
597
+ """
598
+ all_chunks = []
599
+
600
+ for doc_data in documents:
601
+ content = doc_data.get('content', '')
602
+ metadata = doc_data.get('metadata', {})
603
+
604
+ if content.strip(): # Only process non-empty content
605
+ chunks = self.chunk_document(content, metadata)
606
+ all_chunks.extend(chunks)
607
+
608
+ logger.info(f"Chunked {len(documents)} documents into {len(all_chunks)} total chunks")
609
+ return all_chunks
610
+
611
+ def get_chunk_preview(self, chunks: List[Document], max_chunks: int = 5) -> str:
612
+ """
613
+ Generate a preview of chunks for debugging/logging.
614
+
615
+ Args:
616
+ chunks: List of Document chunks
617
+ max_chunks: Maximum number of chunks to include in preview
618
+
619
+ Returns:
620
+ String preview of chunks
621
+ """
622
+ preview = f"Document Chunks Preview ({len(chunks)} total chunks):\n"
623
+ preview += "=" * 50 + "\n"
624
+
625
+ for i, chunk in enumerate(chunks[:max_chunks]):
626
+ content_type = chunk.metadata.get('content_type', 'unknown')
627
+ has_table = chunk.metadata.get('has_table', False) or chunk.metadata.get('has_latex_table', False)
628
+ has_code = chunk.metadata.get('has_code', False) or chunk.metadata.get('has_latex_environment', False)
629
+
630
+ preview += f"Chunk {i + 1} ({content_type}):\n"
631
+ preview += f" Length: {len(chunk.page_content)} characters\n"
632
+ preview += f" Has Table: {has_table}, Has Code/Environment: {has_code}\n"
633
+ preview += f" Metadata: {chunk.metadata}\n"
634
+ preview += f" Content preview: {chunk.page_content[:100]}...\n"
635
+ preview += "-" * 30 + "\n"
636
+
637
+ if len(chunks) > max_chunks:
638
+ preview += f"... and {len(chunks) - max_chunks} more chunks\n"
639
+
640
+ return preview
641
+
642
+ # Global unified chunker instance that supports both Markdown and LaTeX
643
+ document_chunker = UnifiedDocumentChunker()
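
As a quick illustration of the dispatch added above, here is a minimal sketch that exercises the module-level `document_chunker`. The import path and API follow this diff; the sample content and metadata values are made up for illustration.

```python
from src.rag.chunking import document_chunker  # UnifiedDocumentChunker instance

latex_sample = (
    "\\title{Results}\n"
    "\\section{Numbers}\n"
    "\\begin{tabular}{|c|c|} \\hline A & B \\\\ \\hline 1 & 2 \\\\ \\hline \\end{tabular}\n"
)

# A conversion_method starting with "GOT-OCR" forces the LaTeX-aware chunker;
# otherwise doc_type (or auto-detection of LaTeX commands) decides.
chunks = document_chunker.chunk_document(
    latex_sample,
    {"source_id": "demo", "conversion_method": "GOT-OCR (jpg,png only)"},
)
print(document_chunker.get_chunk_preview(chunks))
```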
src/rag/ingestion.py CHANGED
@@ -75,16 +75,18 @@ class DocumentIngestionService:
75
  logger.error(f"Error deleting existing document: {e}")
76
  return False
77
 
78
- def ingest_markdown_content(self,
79
- markdown_content: str,
80
- source_path: Optional[str] = None,
81
- metadata: Optional[Dict[str, Any]] = None,
82
- original_file_content: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]:
 
83
  """
84
- Ingest markdown content into the RAG system.
85
 
86
  Args:
87
- markdown_content: The markdown content to ingest
 
88
  source_path: Optional source path/filename
89
  metadata: Optional additional metadata
90
  original_file_content: Original file content for hash calculation
@@ -93,11 +95,11 @@ class DocumentIngestionService:
93
  Tuple of (success, message, ingestion_stats)
94
  """
95
  try:
96
- if not markdown_content or not markdown_content.strip():
97
  return False, "No content provided for ingestion", {}
98
 
99
- # Create file hash using original content if available, otherwise use markdown content
100
- file_content_for_hash = original_file_content or markdown_content
101
  file_hash = self.create_file_hash(file_content_for_hash)
102
 
103
  # Check for duplicates in vector store
@@ -115,16 +117,16 @@ class DocumentIngestionService:
115
  # Prepare document metadata with file hash
116
  doc_metadata = self.prepare_document_metadata(
117
  source_path=source_path,
118
- doc_type="markdown",
119
  additional_metadata=metadata
120
  )
121
  doc_metadata["file_hash"] = file_hash
122
- doc_metadata["content_length"] = len(markdown_content)
123
  doc_metadata["upload_timestamp"] = datetime.now().isoformat()
124
 
125
- # Chunk the document using markdown-aware chunking
126
- logger.info(f"Chunking document: {file_hash}")
127
- chunks = document_chunker.chunk_document(markdown_content, doc_metadata)
128
 
129
  if not chunks:
130
  return False, "Failed to create document chunks", {}
@@ -142,7 +144,7 @@ class DocumentIngestionService:
142
  "file_hash": file_hash,
143
  "total_chunks": len(chunks),
144
  "document_ids": doc_ids,
145
- "content_length": len(markdown_content),
146
  "has_tables": any(chunk.metadata.get("has_table", False) for chunk in chunks),
147
  "has_code": any(chunk.metadata.get("has_code", False) for chunk in chunks),
148
  "processed_at": datetime.now().isoformat(),
@@ -160,6 +162,22 @@ class DocumentIngestionService:
160
  logger.error(error_msg)
161
  return False, error_msg, {"status": "error", "error": str(e)}
162
 
163
  def ingest_from_conversion_result(self, conversion_result: Dict[str, Any]) -> Tuple[bool, str, Dict[str, Any]]:
164
  """
165
  Ingest a document from Markit conversion result.
@@ -189,9 +207,13 @@ class DocumentIngestionService:
189
  "conversion_time": conversion_result.get("conversion_time", 0)
190
  }
191
 
192
- # Ingest the markdown content with original file content for proper hashing
193
- return self.ingest_markdown_content(
194
- markdown_content=markdown_content,
195
  source_path=original_filename,
196
  metadata=additional_metadata,
197
  original_file_content=original_file_content
 
75
  logger.error(f"Error deleting existing document: {e}")
76
  return False
77
 
78
+ def ingest_text_content(self,
79
+ text_content: str,
80
+ content_type: str = "markdown",
81
+ source_path: Optional[str] = None,
82
+ metadata: Optional[Dict[str, Any]] = None,
83
+ original_file_content: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]:
84
  """
85
+ Ingest text content (markdown or LaTeX) into the RAG system.
86
 
87
  Args:
88
+ text_content: The text content to ingest (markdown or LaTeX)
89
+ content_type: Type of content ("markdown" or "latex")
90
  source_path: Optional source path/filename
91
  metadata: Optional additional metadata
92
  original_file_content: Original file content for hash calculation
 
95
  Tuple of (success, message, ingestion_stats)
96
  """
97
  try:
98
+ if not text_content or not text_content.strip():
99
  return False, "No content provided for ingestion", {}
100
 
101
+ # Create file hash using original content if available, otherwise use text content
102
+ file_content_for_hash = original_file_content or text_content
103
  file_hash = self.create_file_hash(file_content_for_hash)
104
 
105
  # Check for duplicates in vector store
 
117
  # Prepare document metadata with file hash
118
  doc_metadata = self.prepare_document_metadata(
119
  source_path=source_path,
120
+ doc_type=content_type, # Use content_type instead of hardcoded "markdown"
121
  additional_metadata=metadata
122
  )
123
  doc_metadata["file_hash"] = file_hash
124
+ doc_metadata["content_length"] = len(text_content)
125
  doc_metadata["upload_timestamp"] = datetime.now().isoformat()
126
 
127
+ # Chunk the document using text-aware chunking
128
+ logger.info(f"Chunking {content_type} document: {file_hash}")
129
+ chunks = document_chunker.chunk_document(text_content, doc_metadata)
130
 
131
  if not chunks:
132
  return False, "Failed to create document chunks", {}
 
144
  "file_hash": file_hash,
145
  "total_chunks": len(chunks),
146
  "document_ids": doc_ids,
147
+ "content_length": len(text_content),
148
  "has_tables": any(chunk.metadata.get("has_table", False) for chunk in chunks),
149
  "has_code": any(chunk.metadata.get("has_code", False) for chunk in chunks),
150
  "processed_at": datetime.now().isoformat(),
 
162
  logger.error(error_msg)
163
  return False, error_msg, {"status": "error", "error": str(e)}
164
 
165
+ def ingest_markdown_content(self,
166
+ markdown_content: str,
167
+ source_path: Optional[str] = None,
168
+ metadata: Optional[Dict[str, Any]] = None,
169
+ original_file_content: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]:
170
+ """
171
+ Backward compatibility method for ingesting markdown content.
172
+ """
173
+ return self.ingest_text_content(
174
+ text_content=markdown_content,
175
+ content_type="markdown",
176
+ source_path=source_path,
177
+ metadata=metadata,
178
+ original_file_content=original_file_content
179
+ )
180
+
181
  def ingest_from_conversion_result(self, conversion_result: Dict[str, Any]) -> Tuple[bool, str, Dict[str, Any]]:
182
  """
183
  Ingest a document from Markit conversion result.
 
207
  "conversion_time": conversion_result.get("conversion_time", 0)
208
  }
209
 
210
+ # Determine content type based on conversion method
211
+ content_type = "latex" if "GOT-OCR" in conversion_method else "markdown"
212
+
213
+ # Ingest the content with original file content for proper hashing
214
+ return self.ingest_text_content(
215
+ text_content=markdown_content,
216
+ content_type=content_type,
217
  source_path=original_filename,
218
  metadata=additional_metadata,
219
  original_file_content=original_file_content
src/services/document_service.py CHANGED
@@ -94,9 +94,15 @@ class DocumentService:
94
  return temp_path
95
 
96
  def _process_latex_content(self, content: str, parser_name: str, ocr_method_name: str) -> str:
97
- """Process LaTeX content for GOT-OCR formatted text."""
98
- if (parser_name == "GOT-OCR (jpg,png only)" and
99
- ocr_method_name == "Formatted Text" and
100
  config.api.google_api_key):
101
 
102
  logging.info("Converting LaTeX output to Markdown using Gemini API")
@@ -106,6 +112,7 @@ class DocumentService:
106
  raise ConversionError("Conversion cancelled before LaTeX conversion")
107
 
108
  try:
 
109
  markdown_content = convert_latex_to_markdown(content)
110
  if markdown_content:
111
  logging.info(f"LaTeX conversion completed in {time.time() - start_convert:.2f} seconds")
@@ -118,16 +125,21 @@ class DocumentService:
118
 
119
  return content
120
 
121
- def _create_output_file(self, content: str, output_format: str, original_file_path: Optional[str] = None) -> str:
122
  """Create output file with proper extension and preserved filename."""
123
- # Determine file extension
124
- format_extensions = {
125
- "markdown": ".md",
126
- "json": ".json",
127
- "text": ".txt",
128
- "document tags": ".doctags"
129
- }
130
- ext = format_extensions.get(output_format.lower(), ".txt")
131
 
132
  if self._check_cancellation():
133
  raise ConversionError("Conversion cancelled before output file creation")
@@ -247,7 +259,7 @@ class DocumentService:
247
  raise ConversionError("Conversion cancelled")
248
 
249
  # Create output file
250
- output_path = self._create_output_file(content, output_format, file_path)
251
 
252
  return content, output_path
253
 
 
94
  return temp_path
95
 
96
  def _process_latex_content(self, content: str, parser_name: str, ocr_method_name: str) -> str:
97
+ """Process LaTeX content - for GOT-OCR, return raw LaTeX without conversion."""
98
+ # For GOT-OCR, skip LLM conversion and return raw LaTeX
99
+ if parser_name == "GOT-OCR (jpg,png only)":
100
+ logging.info("GOT-OCR detected: returning raw LaTeX output (no LLM conversion)")
101
+ return content
102
+
103
+ # For other parsers with LaTeX content, process as before
104
+ if (content and
105
+ ("\\begin" in content or "\\end" in content or "$" in content) and
106
  config.api.google_api_key):
107
 
108
  logging.info("Converting LaTeX output to Markdown using Gemini API")
 
112
  raise ConversionError("Conversion cancelled before LaTeX conversion")
113
 
114
  try:
115
+ from src.core.latex_to_markdown_converter import convert_latex_to_markdown
116
  markdown_content = convert_latex_to_markdown(content)
117
  if markdown_content:
118
  logging.info(f"LaTeX conversion completed in {time.time() - start_convert:.2f} seconds")
 
125
 
126
  return content
127
 
128
+ def _create_output_file(self, content: str, output_format: str, original_file_path: Optional[str] = None, parser_name: Optional[str] = None) -> str:
129
  """Create output file with proper extension and preserved filename."""
130
+ # Determine file extension based on parser and format
131
+ if parser_name == "GOT-OCR (jpg,png only)":
132
+ # For GOT-OCR, use .tex extension
133
+ ext = ".tex"
134
+ else:
135
+ # For other parsers, use format-based extensions
136
+ format_extensions = {
137
+ "markdown": ".md",
138
+ "json": ".json",
139
+ "text": ".txt",
140
+ "document tags": ".doctags"
141
+ }
142
+ ext = format_extensions.get(output_format.lower(), ".txt")
143
 
144
  if self._check_cancellation():
145
  raise ConversionError("Conversion cancelled before output file creation")
 
259
  raise ConversionError("Conversion cancelled")
260
 
261
  # Create output file
262
+ output_path = self._create_output_file(content, output_format, file_path, parser_name)
263
 
264
  return content, output_path
265
 
src/ui/ui.py CHANGED
@@ -131,6 +131,174 @@ def format_markdown_content(content):
131
  html_content = markdown.markdown(str(content), extensions=['tables'])
132
  return html_content
133
 
134
  # Function to run conversion in a separate thread
135
  def run_conversion_thread(file_path, parser_name, ocr_method_name, output_format):
136
  """Run the conversion in a separate thread and return the thread object"""
@@ -264,9 +432,15 @@ def handle_convert(files, parser_name, ocr_method_name, output_format, processin
264
  logger.info("Converter returned cancellation message")
265
  return content, None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
266
 
267
- # Format the content and wrap it in the scrollable container
268
- formatted_content = format_markdown_content(str(content))
269
- html_output = f"<div class='output-container'>{formatted_content}</div>"
270
 
271
  logger.info("Conversion completed successfully")
272
 
 
131
  html_content = markdown.markdown(str(content), extensions=['tables'])
132
  return html_content
133
 
134
+ def render_latex_to_html(latex_content):
135
+ """Convert LaTeX content to HTML using Mathpix Markdown like GOT-OCR demo."""
136
+ import json
137
+
138
+ # Clean up the content similar to GOT-OCR demo
139
+ content = latex_content.strip()
140
+ if content.endswith("<|im_end|>"):
141
+ content = content[:-len("<|im_end|>")]
142
+
143
+ # Fix unbalanced delimiters exactly like GOT-OCR demo
144
+ right_num = content.count("\\right")
145
+ left_num = content.count("\\left")
146
+
147
+ if right_num != left_num:
148
+ content = (
149
+ content.replace("\\left(", "(")
150
+ .replace("\\right)", ")")
151
+ .replace("\\left[", "[")
152
+ .replace("\\right]", "]")
153
+ .replace("\\left{", "{")
154
+ .replace("\\right}", "}")
155
+ .replace("\\left|", "|")
156
+ .replace("\\right|", "|")
157
+ .replace("\\left.", ".")
158
+ .replace("\\right.", ".")
159
+ )
160
+
161
+ # Process content like GOT-OCR demo: remove $ signs and replace quotes
162
+ content = content.replace('"', "``").replace("$", "")
163
+
164
+ # Split into lines and create JavaScript string like GOT-OCR demo
165
+ outputs_list = content.split("\n")
166
+ js_text_parts = []
167
+ for line in outputs_list:
168
+ # Escape backslashes and add line break
169
+ escaped_line = line.replace("\\", "\\\\")
170
+ js_text_parts.append(f'"{escaped_line}\\n"')
171
+
172
+ # Join with + like in GOT-OCR demo
173
+ js_text = " + ".join(js_text_parts)
174
+
175
+ # Create HTML using Mathpix Markdown like GOT-OCR demo
176
+ html_content = f"""<!DOCTYPE html>
177
+ <html lang="en" data-lt-installed="true">
178
+ <head>
179
+ <meta charset="UTF-8">
180
+ <title>LaTeX Content</title>
181
+ <script>
182
+ const text = {js_text};
183
+ </script>
184
+ <style>
185
+ #content {{
186
+ max-width: 800px;
187
+ margin: auto;
188
+ padding: 20px;
189
+ }}
190
+ body {{
191
+ font-family: 'Times New Roman', serif;
192
+ line-height: 1.6;
193
+ background-color: #ffffff;
194
+ color: #333;
195
+ }}
196
+ table {{
197
+ border-collapse: collapse;
198
+ width: 100%;
199
+ margin: 20px 0;
200
+ }}
201
+ td, th {{
202
+ border: 1px solid #333;
203
+ padding: 8px 12px;
204
+ text-align: center;
205
+ vertical-align: middle;
206
+ }}
207
+ </style>
208
+ <script>
209
+ let script = document.createElement('script');
210
+ script.src = "https://cdn.jsdelivr.net/npm/mathpix-markdown-it@1.3.6/es5/bundle.js";
211
+ document.head.append(script);
212
+ script.onload = function() {{
213
+ const isLoaded = window.loadMathJax();
214
+ if (isLoaded) {{
215
+ console.log('Styles loaded!')
216
+ }}
217
+ const el = window.document.getElementById('content-text');
218
+ if (el) {{
219
+ const options = {{
220
+ htmlTags: true
221
+ }};
222
+ const html = window.render(text, options);
223
+ el.outerHTML = html;
224
+ }}
225
+ }};
226
+ </script>
227
+ </head>
228
+ <body>
229
+ <div id="content">
230
+ <div id="content-text"></div>
231
+ </div>
232
+ </body>
233
+ </html>"""
234
+
235
+ return html_content
236
+
237
+ def format_latex_content(content):
238
+ """Format LaTeX content for display in UI using MathJax rendering like GOT-OCR demo."""
239
+ if not content:
240
+ return content
241
+
242
+ try:
243
+ # Generate rendered HTML
244
+ rendered_html = render_latex_to_html(content)
245
+
246
+ # Encode for iframe display (similar to GOT-OCR demo)
247
+ import base64
248
+ encoded_html = base64.b64encode(rendered_html.encode("utf-8")).decode("utf-8")
249
+ iframe_src = f"data:text/html;base64,{encoded_html}"
250
+
251
+ # Create the display with both rendered and raw views
252
+ formatted_content = f"""
253
+ <div style="background-color: #f8f9fa; border-radius: 8px; border: 1px solid #e9ecef; margin: 10px 0;">
254
+ <div style="background-color: #e9ecef; padding: 10px; border-radius: 8px 8px 0 0; font-weight: bold; color: #495057;">
255
+ 📄 LaTeX Content (Rendered with MathJax)
256
+ </div>
257
+ <div style="padding: 0;">
258
+ <iframe src="{iframe_src}" width="100%" height="500px" style="border: none; border-radius: 0 0 8px 8px;"></iframe>
259
+ </div>
260
+ <div style="background-color: #e9ecef; padding: 8px 15px; border-radius: 0; font-size: 12px; color: #6c757d; border-top: 1px solid #dee2e6;">
261
+ 💡 LaTeX content rendered with MathJax. Tables and formulas are displayed as they would appear in a LaTeX document.
262
+ </div>
263
+ <details style="margin: 0; border-top: 1px solid #dee2e6;">
264
+ <summary style="padding: 8px 15px; background-color: #e9ecef; cursor: pointer; font-size: 12px; color: #6c757d;">
265
+ πŸ“ View Raw LaTeX Source
266
+ </summary>
267
+ <div style="padding: 15px; background-color: #f8f9fa;">
268
+ <pre style="background-color: transparent; margin: 0; padding: 0;
269
+ font-family: 'Courier New', monospace; font-size: 12px; line-height: 1.4;
270
+ white-space: pre-wrap; word-wrap: break-word; color: #2c3e50; max-height: 200px; overflow-y: auto;">
271
+ {content}
272
+ </pre>
273
+ </div>
274
+ </details>
275
+ </div>
276
+ """
277
+
278
+ except Exception as e:
279
+ # Fallback to simple formatting if rendering fails
280
+ import html
281
+ escaped_content = html.escape(str(content))
282
+ formatted_content = f"""
283
+ <div style="background-color: #f8f9fa; border-radius: 8px; border: 1px solid #e9ecef; margin: 10px 0;">
284
+ <div style="background-color: #e9ecef; padding: 10px; border-radius: 8px 8px 0 0; font-weight: bold; color: #495057;">
285
+ 📄 LaTeX Content (Fallback View)
286
+ </div>
287
+ <div style="padding: 15px;">
288
+ <pre style="background-color: transparent; margin: 0; padding: 0;
289
+ font-family: 'Courier New', monospace; font-size: 14px; line-height: 1.4;
290
+ white-space: pre-wrap; word-wrap: break-word; color: #2c3e50;">
291
+ {escaped_content}
292
+ </pre>
293
+ </div>
294
+ <div style="background-color: #e9ecef; padding: 8px 15px; border-radius: 0 0 8px 8px; font-size: 12px; color: #6c757d;">
295
+ ⚠️ Rendering failed, showing raw LaTeX. Error: {str(e)}
296
+ </div>
297
+ </div>
298
+ """
299
+
300
+ return formatted_content
301
+
302
  # Function to run conversion in a separate thread
303
  def run_conversion_thread(file_path, parser_name, ocr_method_name, output_format):
304
  """Run the conversion in a separate thread and return the thread object"""
 
432
  logger.info("Converter returned cancellation message")
433
  return content, None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
434
 
435
+ # Format the content based on parser type
436
+ if "GOT-OCR" in parser_name:
437
+ # For GOT-OCR, display as LaTeX
438
+ formatted_content = format_latex_content(str(content))
439
+ html_output = f"<div class='output-container'>{formatted_content}</div>"
440
+ else:
441
+ # For other parsers, display as Markdown
442
+ formatted_content = format_markdown_content(str(content))
443
+ html_output = f"<div class='output-container'>{formatted_content}</div>"
444
 
445
  logger.info("Conversion completed successfully")
446
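
To preview the MathJax rendering outside the Gradio UI, a small sketch like the following could be used. It assumes `format_latex_content` is importable at module level from `src/ui/ui.py`, as the diff suggests; the sample LaTeX and output filename are illustrative.

```python
from src.ui.ui import format_latex_content  # assumed module-level import

html_block = format_latex_content(r"\begin{tabular}{cc} x & y \\ 1 & 2 \end{tabular}")

# Write the generated HTML (iframe + collapsible raw LaTeX source) to a file
# and open it in a browser to inspect the rendered table.
with open("latex_preview.html", "w", encoding="utf-8") as fh:
    fh.write(html_block)
```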