File size: 15,101 Bytes
6ea41ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
033e4ba
6ea41ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
"""Document converter UI component and logic."""

import threading
import time
import gradio as gr
import logging
from pathlib import Path

from src.core.converter import convert_file
from src.core.logging_config import get_logger
from src.services.document_service import DocumentService
from src.rag import document_ingestion_service
from src.ui.utils.file_validation import validate_file_for_parser
from src.ui.utils.threading_utils import (
    conversion_cancelled, 
    monitor_cancellation, 
    reset_cancellation,
    set_cancellation
)
from src.ui.formatters.content_formatters import format_markdown_content, format_latex_content

logger = get_logger(__name__)


def run_conversion_thread(file_path, parser_name, ocr_method_name, output_format):
    """Launch a single-file conversion on a background daemon thread.

    The shared cancellation flag is reset first, then ``convert_file`` is run
    by the worker.

    Returns:
        A ``(thread, results)`` pair. ``results`` is a dict with the keys
        ``"content"``, ``"download_file"`` and ``"error"``; all start as
        ``None`` and the worker fills them in once it is done.
    """
    reset_cancellation()

    # Shared mailbox: written by the worker, read by whoever holds the dict.
    outcome = dict.fromkeys(("content", "download_file", "error"))

    def conversion_worker():
        try:
            converted, output_path = convert_file(
                file_path, parser_name, ocr_method_name, output_format
            )
        except Exception as exc:
            message = str(exc)
            logger.error(f"Error during conversion: {message}")
            outcome["error"] = message
        else:
            outcome["content"] = converted
            outcome["download_file"] = output_path

    # Daemon thread so a stuck conversion never blocks interpreter shutdown.
    worker_thread = threading.Thread(target=conversion_worker, daemon=True)
    worker_thread.start()

    return worker_thread, outcome


def run_conversion_thread_multi(file_paths, parser_name, ocr_method_name, output_format, processing_type):
    """Launch a conversion of one or more files on a background daemon thread.

    The worker builds a ``DocumentService``, hands it the shared
    ``conversion_cancelled`` flag and calls ``convert_documents``.

    Returns:
        A ``(thread, results)`` pair. ``results`` is a dict with the keys
        ``"content"``, ``"download_file"`` and ``"error"``; all start as
        ``None`` and the worker fills them in once it is done.
    """
    # Shared mailbox: written by the worker, read by the caller.
    outcome = {"content": None, "download_file": None, "error": None}

    def conversion_worker():
        try:
            logger.info(f"Starting multi-file conversion thread for {len(file_paths)} files")

            # The service receives the shared cancellation flag before converting.
            service = DocumentService()
            service.set_cancellation_flag(conversion_cancelled)

            converted, output_path = service.convert_documents(
                file_paths=file_paths,
                parser_name=parser_name,
                ocr_method_name=ocr_method_name,
                output_format=output_format,
                processing_type=processing_type,
            )

            logger.info(f"Multi-file conversion completed successfully for {len(file_paths)} files")
            outcome.update(content=converted, download_file=output_path)
        except Exception as exc:
            message = str(exc)
            logger.error(f"Error during multi-file conversion: {message}")
            outcome["error"] = message

    # Daemon thread so a stuck conversion never blocks interpreter shutdown.
    worker_thread = threading.Thread(target=conversion_worker, daemon=True)
    worker_thread.start()

    return worker_thread, outcome


def _conversion_outputs(display, download_file=None):
    """Assemble the five values ``handle_convert`` returns to Gradio.

    The positions match the ``outputs`` list the convert event is wired to:
    rendered display, download file, Convert-button update, Cancel-button
    update and the conversion-thread state.

    Every call marks the end of a conversion attempt, so the Convert button is
    shown again, the Cancel button is hidden and the thread state is cleared.
    This is the same idle state the cancel handler restores.
    """
    return display, download_file, gr.update(visible=True), gr.update(visible=False), None


def _auto_ingest_for_rag(file_paths, content, parser_name):
    """Best-effort ingestion of a finished conversion for RAG.

    Only the first entry of ``file_paths`` supplies the file metadata. Any
    failure is logged and swallowed so that a successful conversion is never
    turned into an error by the ingestion step.
    """
    try:
        # For multi-file conversion, use the first file for metadata
        file_path = file_paths[0] if file_paths else None
        source = Path(file_path) if file_path else None
        source_exists = source is not None and source.exists()

        # Read original file content for proper deduplication hashing.
        # Bytes are decoded leniently so binary inputs do not raise.
        original_file_content = None
        if source_exists:
            try:
                original_file_content = source.read_bytes().decode('utf-8', errors='ignore')
            except Exception as e:
                logger.warning(f"Could not read original file content: {e}")

        conversion_result = {
            "markdown_content": content,
            "original_filename": source.name if source is not None else "unknown",
            "conversion_method": parser_name,
            "file_size": source.stat().st_size if source_exists else 0,
            "conversion_time": 0,  # Could be tracked if needed
            "original_file_content": original_file_content,
        }

        success, ingestion_msg, _stats = document_ingestion_service.ingest_from_conversion_result(conversion_result)
        if success:
            logger.info(f"Document auto-ingested for RAG: {ingestion_msg}")
        else:
            logger.warning(f"Document ingestion failed: {ingestion_msg}")
    except Exception as e:
        logger.error(f"Error during auto-ingestion: {e}")


def handle_convert(files, parser_name, ocr_method_name, output_format, processing_type, is_cancelled):
    """Handle file conversion for single or multiple files.

    Validates the upload, runs the conversion on a worker thread while polling
    the shared cancellation flag, renders the result as HTML and finally tries
    to auto-ingest the converted document for RAG.

    Args:
        files: Uploaded files; each item is either an object with a ``name``
            attribute holding the path, or something convertible with ``str``.
        parser_name: Name of the selected parser/provider.
        ocr_method_name: Name of the selected OCR option.
        output_format: Requested output format.
        processing_type: How multiple documents are processed together.
        is_cancelled: ``True`` when cancellation was requested before start.

    Returns:
        A 5-tuple ``(display, download_file, convert_button_update,
        cancel_button_update, thread_state)``. ``display`` is the rendered
        HTML on success, or a plain ``"Error: ..."`` / ``"Conversion
        cancelled."`` message. On every path the Convert button is made
        visible again, the Cancel button is hidden and the thread state is
        reset to ``None``.
    """
    # Check if we should cancel before starting
    if is_cancelled:
        logger.info("Conversion cancelled before starting")
        return _conversion_outputs("Conversion cancelled.")

    # Validate files input
    if not files:
        error_msg = "No files uploaded. Please upload at least one document."
        logger.error(error_msg)
        return _conversion_outputs(f"Error: {error_msg}")

    # Convert Gradio file objects to file paths
    file_paths = [
        uploaded.name if hasattr(uploaded, 'name') else str(uploaded)
        for uploaded in files
    ]

    # Validate file types for the selected parser; stop at the first rejection
    for file_path in file_paths:
        is_valid, error_msg = validate_file_for_parser(file_path, parser_name)
        if not is_valid:
            logger.error(f"File validation error: {error_msg}")
            return _conversion_outputs(f"Error: {error_msg}")

    logger.info(f"Starting conversion of {len(file_paths)} file(s) with cancellation flag cleared")

    # Start the conversion in a separate thread
    thread, results = run_conversion_thread_multi(
        file_paths, parser_name, ocr_method_name, output_format, processing_type
    )

    # Start the monitoring thread
    monitor_thread = threading.Thread(target=monitor_cancellation, daemon=True)
    monitor_thread.start()

    # Wait for the worker to complete or be cancelled
    while thread.is_alive():
        if conversion_cancelled.is_set():
            logger.info("Cancellation detected, waiting for thread to finish")
            # Give the thread a chance to clean up
            thread.join(timeout=0.5)
            if thread.is_alive():
                logger.warning("Thread did not finish within timeout")
            return _conversion_outputs("Conversion cancelled.")

        # Wait on the worker instead of sleeping: same 0.1 s polling interval
        # for the cancellation flag, but returns as soon as the worker ends.
        thread.join(timeout=0.1)

    # Compare against None: an exception with an empty message stores "" and
    # must still be reported as a failure rather than rendered as content.
    if results["error"] is not None:
        return _conversion_outputs(f"Error: {results['error']}")

    content = results["content"]
    download_file = results["download_file"]

    # If conversion returned a cancellation message
    if content == "Conversion cancelled.":
        logger.info("Converter returned cancellation message")
        return _conversion_outputs(content)

    # GOT-OCR output is displayed as LaTeX, everything else as Markdown
    formatter = format_latex_content if "GOT-OCR" in parser_name else format_markdown_content
    html_output = f"<div class='output-container'>{formatter(str(content))}</div>"

    logger.info("Conversion completed successfully")

    # Auto-ingest the converted document for RAG (never fails the conversion)
    _auto_ingest_for_rag(file_paths, content, parser_name)

    return _conversion_outputs(html_output, download_file)




def create_document_converter_tab():
    """Create the document converter tab UI.

    Builds a ``gr.TabItem`` containing the file upload, the provider and OCR
    dropdowns, the Convert/Cancel buttons and the output widgets, then wires
    their events:

    * uploading files updates the processing-type selector and status text;
    * changing the provider refreshes the OCR dropdown choices;
    * Convert clears the cancellation flag, swaps the buttons and then runs
      ``handle_convert``;
    * Cancel sets the cancellation flag and restores the buttons.

    The function returns nothing; the components are created as a side effect
    (presumably inside an enclosing ``gr.Blocks``/``gr.Tabs`` context set up by
    the caller -- confirm against the call site).
    """
    # Kept function-local, as in the original layout code.
    from src.parsers.parser_registry import ParserRegistry
    from src.ui.utils.file_validation import update_ui_for_file_count

    with gr.TabItem("📄 Document Converter"):
        with gr.Column(elem_classes=["chat-tab-container"]):
            # Modern header matching other tabs
            gr.HTML("""
            <div class="chat-header">
                <h2>📄 Document Converter</h2>
                <p>Convert documents to Markdown format with advanced OCR and AI processing</p>
            </div>
            """)

            # State to track if cancellation is requested
            cancel_requested = gr.State(False)
            # State to store the conversion thread
            conversion_thread = gr.State(None)
            # State to store the output format (fixed to Markdown)
            output_format_state = gr.State("Markdown")

            # Multi-file input (supports single and multiple files)
            files_input = gr.Files(
                label="Upload Document(s) - Single file or up to 5 files (20MB max combined)",
                file_count="multiple",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp", ".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls", ".txt", ".md", ".html", ".htm", ".csv"]
            )

            # Processing type selector (visible only for multiple files)
            processing_type_selector = gr.Radio(
                choices=["combined", "individual", "summary", "comparison"],
                value="combined",
                label="Multi-Document Processing Type",
                info="How to process multiple documents together",
                visible=False
            )

            # Status text to show file count and processing mode
            file_status_text = gr.HTML(
                value="<div style='color: #666; font-style: italic;'>Upload documents to begin</div>",
                label=""
            )

            # Provider and OCR options below the file input
            with gr.Row(elem_classes=["provider-options-row"]):
                with gr.Column(scale=1):
                    parser_names = ParserRegistry.get_parser_names()

                    # Make MarkItDown the default parser if available; otherwise
                    # fall back to the first registered parser, then "PyPdfium".
                    default_parser = next((p for p in parser_names if p == "MarkItDown"), parser_names[0] if parser_names else "PyPdfium")

                    provider_dropdown = gr.Dropdown(
                        label="Provider",
                        choices=parser_names,
                        value=default_parser,
                        interactive=True
                    )
                with gr.Column(scale=1):
                    default_ocr_options = ParserRegistry.get_ocr_options(default_parser)
                    default_ocr = default_ocr_options[0] if default_ocr_options else "No OCR"

                    ocr_dropdown = gr.Dropdown(
                        label="OCR Options",
                        choices=default_ocr_options,
                        value=default_ocr,
                        interactive=True
                    )

            # Processing controls row with consistent styling
            with gr.Row(elem_classes=["control-buttons"]):
                convert_button = gr.Button("🚀 Convert", elem_classes=["control-btn", "btn-primary"])
                cancel_button = gr.Button("⏹️ Cancel", elem_classes=["control-btn", "btn-clear-data"], visible=False)

            # Simple output container with just one scrollbar
            file_display = gr.HTML(
                value="<div class='output-container'></div>",
                label="Converted Content"
            )

            file_download = gr.File(label="Download File")

            # ---- Event handlers ----

            # Update UI when files are uploaded
            files_input.change(
                fn=update_ui_for_file_count,
                inputs=[files_input],
                outputs=[processing_type_selector, file_status_text]
            )

            def refresh_ocr_options(provider):
                """Return the OCR dropdown matching the newly selected provider."""
                if "GOT-OCR" in provider:
                    # GOT-OCR uses its own fixed pair of output modes.
                    return gr.Dropdown(choices=["Plain Text", "Formatted Text"], value="Plain Text")
                # Look the options up once and reuse them for choices and value.
                options = ParserRegistry.get_ocr_options(provider)
                return gr.Dropdown(choices=options, value=options[0] if options else None)

            provider_dropdown.change(
                refresh_ocr_options,
                inputs=[provider_dropdown],
                outputs=[ocr_dropdown]
            )

            # Reset cancel flag when starting conversion
            def start_conversion():
                """Clear the cancellation flag and switch the buttons to 'running'."""
                conversion_cancelled.clear()
                logger.info("Starting conversion with cancellation flag cleared")
                # Hide Convert, show Cancel, reset the cancel_requested state.
                return gr.update(visible=False), gr.update(visible=True), False

            # Set cancel flag and wait briefly for the thread when cancel is clicked
            def request_cancellation(thread):
                """Set the cancellation flag and switch the buttons back to 'idle'."""
                conversion_cancelled.set()
                logger.info("Cancel button clicked, cancellation flag set")

                # Only join real threads: the state slot may hold something else
                # (it is also an output of the convert handler).
                if isinstance(thread, threading.Thread):
                    logger.info(f"Attempting to join conversion thread: {thread}")
                    thread.join(timeout=0.5)
                    if thread.is_alive():
                        logger.warning("Thread did not finish within timeout")

                # Immediate feedback: show Convert, hide Cancel, mark the
                # cancellation as requested and clear the stored thread.
                return gr.update(visible=True), gr.update(visible=False), True, None

            # Start conversion sequence
            convert_button.click(
                fn=start_conversion,
                inputs=[],
                outputs=[convert_button, cancel_button, cancel_requested],
                queue=False  # Execute immediately
            ).then(
                fn=handle_convert,
                inputs=[files_input, provider_dropdown, ocr_dropdown, output_format_state, processing_type_selector, cancel_requested],
                outputs=[file_display, file_download, convert_button, cancel_button, conversion_thread]
            )

            # Handle cancel button click
            cancel_button.click(
                fn=request_cancellation,
                inputs=[conversion_thread],
                outputs=[convert_button, cancel_button, cancel_requested, conversion_thread],
                queue=False  # Execute immediately
            )