Spaces:

Ansemin101
/

Markit_v2

Sleeping

AnseMin committed on Jun 27

Commit

033e4ba

1 Parent(s): 6ea41ec

Integrate Gemini API for enhanced image processing in MarkItDown

- Updated `app.py` to reflect the use of Gemini for image processing in MarkItDown.
- Modified `requirements.txt` to add `ffmpeg-python` for audio processing and remove the OpenAI dependency.
- Enhanced `setup.sh` to install Gemini dependencies and updated installation instructions.
- Introduced `gemini_client_wrapper.py`, a wrapper around the Gemini API that mimics OpenAI's client interface for compatibility with MarkItDown.
- Added tests in `test_gemini_wrapper.py` to validate Gemini integration and MarkItDown functionality.
- Refactored `markitdown_parser.py` to utilize Gemini for image files while maintaining standard processing for other formats.
- Updated parser names and descriptions for clarity across various parsers.

Files changed (10) hide show

app.py +1 -1
requirements.txt +2 -1
setup.sh +2 -2
src/core/gemini_client_wrapper.py +198 -0
src/parsers/docling_parser.py +1 -1
src/parsers/got_ocr_parser.py +1 -1
src/parsers/markitdown_parser.py +95 -24
src/parsers/mistral_ocr_parser.py +1 -1
src/ui/components/document_converter.py +1 -1
test_gemini_wrapper.py +94 -0

app.py CHANGED Viewed

@@ -35,7 +35,7 @@ except ImportError as e:
     try:
         from markitdown import MarkItDown
-        print("MarkItDown is available")
     except ImportError:
         print("Installing MarkItDown...")
         subprocess.run([sys.executable, "-m", "pip", "install", "-q", "markitdown[all]"], check=False)

     try:
         from markitdown import MarkItDown
+        print("MarkItDown is available (using Gemini for image processing)")
     except ImportError:
         print("Installing MarkItDown...")
         subprocess.run([sys.executable, "-m", "pip", "install", "-q", "markitdown[all]"], check=False)

requirements.txt CHANGED Viewed

@@ -30,7 +30,8 @@ huggingface_hub[cli]>=0.19.0
 # MarkItDown and its dependencies
 markitdown[all]
-openai>=1.1.0  # For LLM image description support
 # Docling dependencies
 docling

 # MarkItDown and its dependencies
 markitdown[all]
+ffmpeg-python  # For audio processing in MarkItDown
+# Note: Using Gemini Flash 2.5 for LLM image descriptions instead of OpenAI
 # Docling dependencies
 docling

setup.sh CHANGED Viewed

@@ -30,8 +30,7 @@ echo "NumPy installed successfully"
 echo "Installing Python dependencies..."
 pip install -q -U pillow opencv-python
 pip install -q -U google-genai
-pip install -q -U openai>=1.1.0  # For LLM image description support
-# pip install -q -U latex2markdown - removed, now using Gemini API for LaTeX conversion
 echo "Python dependencies installed successfully"
 # Install GOT-OCR transformers dependencies
@@ -50,6 +49,7 @@ echo "Spaces module installed successfully"
 # Install markitdown with all optional dependencies
 echo "Installing MarkItDown with all dependencies..."
 pip install -q -U 'markitdown[all]'
 echo "MarkItDown installed successfully"
 # Install Docling for advanced PDF understanding

 echo "Installing Python dependencies..."
 pip install -q -U pillow opencv-python
 pip install -q -U google-genai
+# Note: Using Gemini Flash 2.5 for LLM image descriptions in MarkItDown instead of OpenAI
 echo "Python dependencies installed successfully"
 # Install GOT-OCR transformers dependencies
 # Install markitdown with all optional dependencies
 echo "Installing MarkItDown with all dependencies..."
 pip install -q -U 'markitdown[all]'
+pip install -q -U ffmpeg-python  # For audio processing
 echo "MarkItDown installed successfully"
 # Install Docling for advanced PDF understanding

src/core/gemini_client_wrapper.py ADDED Viewed

	@@ -0,0 +1,198 @@

+"""
+Gemini client wrapper that mimics OpenAI client interface for MarkItDown compatibility.
+This allows us to use Gemini Flash 2.5 for image processing in MarkItDown.
+"""
+import logging
+import base64
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+try:
+    from google import genai
+    HAS_GEMINI = True
+except ImportError:
+    HAS_GEMINI = False
+from src.core.config import config
+from src.core.logging_config import get_logger
+logger = get_logger(__name__)
+class GeminiChatCompletions:
+    """Chat completions interface that mimics OpenAI's chat.completions API."""
+    def __init__(self, client):
+        self.client = client
+    def create(self, model: str, messages: List[Dict[str, Any]], **kwargs) -> 'GeminiResponse':
+        """Create a chat completion that mimics OpenAI's API."""
+        if not messages:
+            raise ValueError("Messages cannot be empty")
+        # Extract the user message (MarkItDown sends a single user message with text + image)
+        user_message = None
+        for msg in messages:
+            if msg.get("role") == "user":
+                user_message = msg
+                break
+        if not user_message:
+            raise ValueError("No user message found")
+        content = user_message.get("content", [])
+        if not isinstance(content, list):
+            content = [{"type": "text", "text": str(content)}]
+        # Extract text prompt and image
+        text_prompt = ""
+        image_data = None
+        for item in content:
+            if item.get("type") == "text":
+                text_prompt = item.get("text", "")
+            elif item.get("type") == "image_url":
+                image_url = item.get("image_url", {}).get("url", "")
+                if image_url.startswith("data:image/"):
+                    # Extract base64 data from data URI
+                    try:
+                        header, data = image_url.split(",", 1)
+                        image_data = base64.b64decode(data)
+                    except Exception as e:
+                        logger.error(f"Failed to decode image data: {e}")
+                        raise ValueError("Invalid image data URI")
+        if not text_prompt:
+            text_prompt = "Describe this image in detail."
+        if not image_data:
+            raise ValueError("No image data found in request")
+        try:
+            # Use Gemini to process the image
+            response = self.client.models.generate_content(
+                model=config.model.gemini_model,
+                contents=[
+                    {
+                        "parts": [
+                            {"text": text_prompt},
+                            {
+                                "inline_data": {
+                                    "mime_type": "image/jpeg",  # Assume JPEG for now
+                                    "data": base64.b64encode(image_data).decode()
+                                }
+                            }
+                        ]
+                    }
+                ],
+                config={
+                    "temperature": config.model.temperature,
+                    "max_output_tokens": 1024,  # Reasonable limit for image descriptions
+                }
+            )
+            # Extract text from Gemini response
+            response_text = ""
+            if hasattr(response, "text") and response.text:
+                response_text = response.text
+            elif hasattr(response, "candidates") and response.candidates:
+                candidate = response.candidates[0]
+                if hasattr(candidate, "content") and candidate.content:
+                    if hasattr(candidate.content, "parts") and candidate.content.parts:
+                        response_text = candidate.content.parts[0].text
+            if not response_text:
+                logger.warning("Empty response from Gemini, using fallback")
+                response_text = "Image processing completed but no description generated."
+            return GeminiResponse(response_text)
+        except Exception as e:
+            logger.error(f"Gemini API error: {str(e)}")
+            # Return a fallback response to avoid breaking MarkItDown
+            return GeminiResponse(f"Image description unavailable due to processing error: {str(e)}")
+class GeminiChoice:
+    """One completion choice; exposes only ``message``, as read via ``choices[i].message``."""
+    def __init__(self, content: str):
+        # Wrap the text so callers can read ``choice.message.content``.
+        self.message = GeminiMessage(content)
+class GeminiMessage:
+    """Message holder; exposes only ``content``, the generated text."""
+    def __init__(self, content: str):
+        # Stored unchanged; no role or other message fields are modelled.
+        self.content = content
+class GeminiResponse:
+    """Response holder shaped so ``response.choices[0].message.content`` yields the text."""
+    def __init__(self, content: str):
+        # Always exactly one choice, built from the given text.
+        self.choices = [GeminiChoice(content)]
+class GeminiClientWrapper:
+    """
+    Gemini client wrapper that mimics OpenAI client interface for MarkItDown.
+    This allows MarkItDown to use Gemini for image processing while thinking
+    it's using an OpenAI client.
+    """
+    def __init__(self, api_key: Optional[str] = None):
+        if not HAS_GEMINI:
+            raise ImportError("google-genai package is required for Gemini support")
+        api_key = api_key or config.api.google_api_key
+        if not api_key:
+            raise ValueError("Google API key is required for Gemini client")
+        self.client = genai.Client(api_key=api_key)
+        self.chat = GeminiChatCompletions(self.client)
+        logger.info("Gemini client wrapper initialized for MarkItDown compatibility")
+    @property
+    def completions(self):
+        """Alias for chat to match some OpenAI client patterns."""
+        return self.chat
+def create_gemini_client_for_markitdown() -> Optional[GeminiClientWrapper]:
+    """
+    Create a Gemini client wrapper for use with MarkItDown.
+    Returns:
+        GeminiClientWrapper if Gemini is available and configured, None otherwise.
+    """
+    if not HAS_GEMINI:
+        logger.warning("Gemini not available for MarkItDown image processing")
+        return None
+    if not config.api.google_api_key:
+        logger.warning("No Google API key found for MarkItDown image processing")
+        return None
+    try:
+        return GeminiClientWrapper()
+    except Exception as e:
+        logger.error(f"Failed to create Gemini client for MarkItDown: {e}")
+        return None
+# For testing purposes
+if __name__ == "__main__":
+    # Test the wrapper
+    try:
+        client = create_gemini_client_for_markitdown()
+        if client:
+            print("✅ Gemini client wrapper created successfully")
+            print("✅ Ready for MarkItDown integration")
+        else:
+            print("❌ Failed to create Gemini client wrapper")
+    except Exception as e:
+        print(f"❌ Error: {e}")

src/parsers/docling_parser.py CHANGED Viewed

@@ -132,7 +132,7 @@ class DoclingParser(DocumentParser):
     @classmethod
     def get_name(cls) -> str:
-        return "Docling (PDF, Images, DOCX, XLSX - Advanced PDF Understanding)"
     @classmethod
     def get_supported_file_types(cls) -> Set[str]:

     @classmethod
     def get_name(cls) -> str:
+        return "Docling"
     @classmethod
     def get_supported_file_types(cls) -> Set[str]:

src/parsers/got_ocr_parser.py CHANGED Viewed

@@ -41,7 +41,7 @@ class GotOcrParser(DocumentParser):
     @classmethod
     def get_name(cls) -> str:
-        return "GOT-OCR (jpg,png only)"
     @classmethod
     def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:

     @classmethod
     def get_name(cls) -> str:
+        return "GOT-OCR"
     @classmethod
     def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:

src/parsers/markitdown_parser.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import logging
 import os
 from pathlib import Path
 from typing import Dict, List, Optional, Any, Union, Set
 import io
@@ -12,12 +14,18 @@ from src.core.exceptions import DocumentProcessingError, ParserError
 # Check for MarkItDown availability
 try:
     from markitdown import MarkItDown
-    from openai import OpenAI
     HAS_MARKITDOWN = True
 except ImportError:
     HAS_MARKITDOWN = False
     logging.warning("MarkItDown package not installed. Please install with 'pip install markitdown[all]'")
 # Configure logging
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -33,19 +41,10 @@ class MarkItDownParser(DocumentParser):
         # Initialize MarkItDown instance
         if HAS_MARKITDOWN:
             try:
-                # Check for OpenAI API key for LLM-based image descriptions
-                openai_api_key = os.getenv("OPENAI_API_KEY")
-                if openai_api_key:
-                    client = OpenAI()
-                    self.markdown_instance = MarkItDown(
-                        enable_plugins=False,
-                        llm_client=client,
-                        llm_model="gpt-4o"
-                    )
-                    logger.info("MarkItDown initialized with OpenAI support for image descriptions")
-                else:
-                    self.markdown_instance = MarkItDown(enable_plugins=False)
-                    logger.info("MarkItDown initialized without OpenAI support")
             except Exception as e:
                 logger.error(f"Error initializing MarkItDown: {str(e)}")
                 self.markdown_instance = None
@@ -72,23 +71,95 @@ class MarkItDownParser(DocumentParser):
         # Check for cancellation before starting
         if self._check_cancellation():
             raise DocumentProcessingError("Conversion cancelled")
         try:
-            # Convert the file using the standard instance
-            result = self.markdown_instance.convert(str(file_path))
-            # Check for cancellation after processing
-            if self._check_cancellation():
-                raise DocumentProcessingError("Conversion cancelled")
-            return result.text_content
         except Exception as e:
             logger.error(f"Error converting file with MarkItDown: {str(e)}")
             raise DocumentProcessingError(f"MarkItDown conversion failed: {str(e)}")
     @classmethod
     def get_name(cls) -> str:
-        return "MarkItDown (pdf, jpg, png, xlsx --best for xlsx)"
     @classmethod
     def get_supported_file_types(cls) -> Set[str]:
@@ -112,7 +183,7 @@ class MarkItDownParser(DocumentParser):
     @classmethod
     def get_description(cls) -> str:
-        return "MarkItDown parser for converting various file formats to Markdown"
 # Register the parser with the registry if available

 import logging
 import os
+import threading
+import time
 from pathlib import Path
 from typing import Dict, List, Optional, Any, Union, Set
 import io
 # Check for MarkItDown availability
 try:
     from markitdown import MarkItDown
     HAS_MARKITDOWN = True
 except ImportError:
     HAS_MARKITDOWN = False
     logging.warning("MarkItDown package not installed. Please install with 'pip install markitdown[all]'")
+# Import our Gemini wrapper for LLM support
+try:
+    from src.core.gemini_client_wrapper import create_gemini_client_for_markitdown
+    HAS_GEMINI_WRAPPER = True
+except ImportError:
+    HAS_GEMINI_WRAPPER = False
 # Configure logging
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
         # Initialize MarkItDown instance
         if HAS_MARKITDOWN:
             try:
+                # Initialize MarkItDown without LLM client for better performance
+                # LLM client will only be used for image files when needed
+                self.markdown_instance = MarkItDown()
+                logger.info("MarkItDown initialized successfully")
             except Exception as e:
                 logger.error(f"Error initializing MarkItDown: {str(e)}")
                 self.markdown_instance = None
         # Check for cancellation before starting
         if self._check_cancellation():
             raise DocumentProcessingError("Conversion cancelled")
+        file_path_str = str(file_path)
+        file_ext = Path(file_path).suffix.lower()
         try:
+            # Run conversion in a separate thread to support cancellation
+            result_container = {"result": None, "error": None, "completed": False}
+            def conversion_worker():
+                try:
+                    # For image files, potentially use LLM if available
+                    if file_ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']:
+                        if HAS_GEMINI_WRAPPER:
+                            try:
+                                # Create Gemini-enabled instance for image processing
+                                gemini_client = create_gemini_client_for_markitdown()
+                                if gemini_client:
+                                    llm_instance = MarkItDown(llm_client=gemini_client, llm_model="gemini-2.5-flash")
+                                    result = llm_instance.convert(file_path_str)
+                                else:
+                                    # No Gemini client available, use standard conversion
+                                    logger.info("Gemini client not available, using standard conversion for image")
+                                    result = self.markdown_instance.convert(file_path_str)
+                            except Exception as llm_error:
+                                logger.warning(f"Gemini image processing failed, falling back to basic conversion: {llm_error}")
+                                result = self.markdown_instance.convert(file_path_str)
+                        else:
+                            # No Gemini wrapper available, use standard conversion
+                            logger.info("Gemini wrapper not available, using standard conversion for image")
+                            result = self.markdown_instance.convert(file_path_str)
+                    else:
+                        # For non-image files, use standard conversion
+                        result = self.markdown_instance.convert(file_path_str)
+                    result_container["result"] = result
+                    result_container["completed"] = True
+                except Exception as e:
+                    result_container["error"] = e
+                    result_container["completed"] = True
+            # Start conversion in background thread
+            conversion_thread = threading.Thread(target=conversion_worker, daemon=True)
+            conversion_thread.start()
+            # Wait for completion or cancellation
+            while conversion_thread.is_alive():
+                if self._check_cancellation():
+                    logger.info("MarkItDown conversion cancelled by user")
+                    # Give thread a moment to finish cleanly
+                    conversion_thread.join(timeout=0.1)
+                    raise DocumentProcessingError("Conversion cancelled")
+                time.sleep(0.1)  # Check every 100ms
+            # Ensure thread has completed
+            conversion_thread.join()
+            # Check for errors
+            if result_container["error"]:
+                raise result_container["error"]
+            result = result_container["result"]
+            if result is None:
+                raise DocumentProcessingError("MarkItDown conversion returned no result")
+            # Use the correct attribute - MarkItDown returns .text_content
+            if hasattr(result, 'text_content') and result.text_content:
+                return result.text_content
+            elif hasattr(result, 'markdown') and result.markdown:
+                return result.markdown
+            elif hasattr(result, 'content') and result.content:
+                return result.content
+            else:
+                # Fallback - convert result to string
+                content = str(result)
+                if content and content.strip():
+                    return content
+                else:
+                    raise DocumentProcessingError("MarkItDown conversion returned empty content")
+        except DocumentProcessingError:
+            # Re-raise cancellation errors
+            raise
         except Exception as e:
             logger.error(f"Error converting file with MarkItDown: {str(e)}")
             raise DocumentProcessingError(f"MarkItDown conversion failed: {str(e)}")
     @classmethod
     def get_name(cls) -> str:
+        return "MarkItDown"
     @classmethod
     def get_supported_file_types(cls) -> Set[str]:
     @classmethod
     def get_description(cls) -> str:
+        return "MarkItDown parser for converting various file formats to Markdown. Uses Gemini Flash 2.5 for advanced image analysis."
 # Register the parser with the registry if available

src/parsers/mistral_ocr_parser.py CHANGED Viewed

@@ -32,7 +32,7 @@ class MistralOcrParser(DocumentParser):
     @classmethod
     def get_name(cls) -> str:
-        return "Mistral OCR (pdf, jpg, png)"
     @classmethod
     def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:

     @classmethod
     def get_name(cls) -> str:
+        return "Mistral OCR"
     @classmethod
     def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:

src/ui/components/document_converter.py CHANGED Viewed

@@ -220,7 +220,7 @@ def create_document_converter_tab():
             files_input = gr.Files(
                 label="Upload Document(s) - Single file or up to 5 files (20MB max combined)",
                 file_count="multiple",
-                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp", ".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls", ".txt", ".md", ".html", ".htm"]
             )
             # Processing type selector (visible only for multiple files)

             files_input = gr.Files(
                 label="Upload Document(s) - Single file or up to 5 files (20MB max combined)",
                 file_count="multiple",
+                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp", ".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls", ".txt", ".md", ".html", ".htm", ".csv"]
             )
             # Processing type selector (visible only for multiple files)

test_gemini_wrapper.py ADDED Viewed

	@@ -0,0 +1,94 @@

+#!/usr/bin/env python3
+"""
+Simple test script for Gemini wrapper functionality
+"""
+import sys
+from pathlib import Path
+# Add project root to path
+sys.path.append(str(Path(__file__).parent))
+def test_gemini_wrapper():
+    """Test Gemini wrapper without API key"""
+    print("Testing Gemini wrapper structure...")
+    try:
+        from src.core.gemini_client_wrapper import (
+            GeminiClientWrapper,
+            GeminiChatCompletions,
+            GeminiResponse,
+            HAS_GEMINI,
+            create_gemini_client_for_markitdown
+        )
+        print("✅ All classes imported successfully")
+        print(f"✅ HAS_GEMINI: {HAS_GEMINI}")
+        # Test response structure
+        test_response = GeminiResponse("Test image description")
+        print(f"✅ Response choices: {len(test_response.choices)}")
+        print(f"✅ Message content: {test_response.choices[0].message.content}")
+        # Test client creation (should fail gracefully without API key)
+        client = create_gemini_client_for_markitdown()
+        print(f"✅ Client creation (no API key): {client is None}")
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+    return True
+def test_markitdown_availability():
+    """Test MarkItDown availability"""
+    print("\nTesting MarkItDown availability...")
+    try:
+        from markitdown import MarkItDown
+        print("✅ MarkItDown imported successfully")
+        # Test basic initialization
+        md = MarkItDown()
+        print("✅ MarkItDown initialized without LLM client")
+    except Exception as e:
+        print(f"❌ MarkItDown error: {e}")
+        return False
+    return True
+def test_integration_structure():
+    """Test the overall integration structure"""
+    print("\nTesting integration structure...")
+    try:
+        # Test that our wrapper can theoretically work with MarkItDown
+        from src.core.gemini_client_wrapper import GeminiClientWrapper, HAS_GEMINI
+        from markitdown import MarkItDown
+        print("✅ Both components available for integration")
+        # Test interface compatibility (structure only)
+        if HAS_GEMINI:
+            print("✅ Gemini dependency available")
+        else:
+            print("⚠️  Gemini dependency not available")
+        print("✅ Integration structure test passed")
+    except Exception as e:
+        print(f"❌ Integration error: {e}")
+        return False
+    return True
+if __name__ == "__main__":
+    print("=== Testing Gemini-MarkItDown Integration ===\n")
+    success = True
+    success &= test_gemini_wrapper()
+    success &= test_markitdown_availability()
+    success &= test_integration_structure()
+    print(f"\n=== Overall Result: {'✅ PASS' if success else '❌ FAIL'} ===")