File size: 6,914 Bytes
033e4ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
"""
Gemini client wrapper that mimics OpenAI client interface for MarkItDown compatibility.
This lets MarkItDown use a Gemini model (e.g. Gemini 2.5 Flash) to describe images; the model name is read from the project config.
"""

import logging
import base64
from typing import List, Dict, Any, Optional
from pathlib import Path

try:
    from google import genai
    HAS_GEMINI = True
except ImportError:
    HAS_GEMINI = False

from src.core.config import config
from src.core.logging_config import get_logger

logger = get_logger(__name__)


class GeminiChatCompletions:
    """Chat completions interface that mimics OpenAI's chat.completions API.

    Wraps a Gemini client and exposes ``create(model=..., messages=...)``.
    The instance is also reachable through its own ``completions`` attribute,
    so both ``obj.create(...)`` and the OpenAI-style
    ``obj.completions.create(...)`` resolve to the same method.
    """

    # Used when the data URI header does not name a usable image subtype.
    _DEFAULT_MIME_TYPE = "image/jpeg"
    # Used when the request carries an image but no text prompt.
    _DEFAULT_PROMPT = "Describe this image in detail."

    def __init__(self, client):
        """Store the Gemini client whose ``models.generate_content`` is called."""
        self.client = client

    @property
    def completions(self) -> 'GeminiChatCompletions':
        """Return ``self`` so that ``<client>.chat.completions.create`` works.

        OpenAI clients are called as ``client.chat.completions.create(...)``.
        When an instance of this class is installed as ``client.chat``, this
        property supplies the missing ``completions`` hop.
        """
        return self

    @classmethod
    def _parse_image_data_uri(cls, data_uri: str) -> tuple:
        """Split an image data URI into ``(mime_type, raw_bytes)``.

        Args:
            data_uri: A string of the form ``data:<mime>[;params],<base64>``.

        Returns:
            A 2-tuple of the lower-cased MIME type and the decoded payload.
            The MIME type falls back to ``image/jpeg`` when the header does
            not name an image subtype.

        Raises:
            ValueError: If the comma separator is missing or the payload is
                not valid base64 (``binascii.Error`` is a ``ValueError``).
        """
        header, separator, payload = data_uri.partition(",")
        if not separator:
            raise ValueError("Data URI has no ',' between header and payload")

        media_type = header[len("data:"):] if header.startswith("data:") else ""
        # Drop parameters such as ";base64" or ";charset=...".
        mime_type = media_type.split(";", 1)[0].strip().lower()
        if mime_type == "image/jpg":
            # Common non-standard alias; normalise to the registered name.
            mime_type = "image/jpeg"
        elif not mime_type.startswith("image/") or mime_type == "image/":
            mime_type = cls._DEFAULT_MIME_TYPE

        return mime_type, base64.b64decode(payload)

    def create(self, model: str, messages: List[Dict[str, Any]], **kwargs) -> 'GeminiResponse':
        """Create a chat completion that mimics OpenAI's API.

        Args:
            model: Accepted for OpenAI compatibility but ignored; the Gemini
                model is always ``config.model.gemini_model``.
            messages: OpenAI-style messages. Only the first message with
                ``role == "user"`` is used. Its ``content`` is either a plain
                value (treated as text) or a list of ``{"type": "text"}`` /
                ``{"type": "image_url"}`` items. When several items of the
                same type are present, the last one wins.
            **kwargs: Accepted for OpenAI compatibility and ignored.

        Returns:
            A ``GeminiResponse`` whose ``choices[0].message.content`` holds the
            description. Errors raised while calling Gemini are logged and
            turned into a fallback description instead of propagating.

        Raises:
            ValueError: If ``messages`` is empty, contains no user message,
                carries a malformed image data URI, or has no image data.
        """
        if not messages:
            raise ValueError("Messages cannot be empty")

        # MarkItDown sends a single user message with text + image.
        user_message = next(
            (msg for msg in messages if msg.get("role") == "user"), None
        )
        if not user_message:
            raise ValueError("No user message found")

        content = user_message.get("content", [])
        if not isinstance(content, list):
            content = [{"type": "text", "text": str(content)}]

        text_prompt = ""
        image_data = None
        mime_type = self._DEFAULT_MIME_TYPE

        for item in content:
            item_type = item.get("type")
            if item_type == "text":
                text_prompt = item.get("text", "")
            elif item_type == "image_url":
                image_url = item.get("image_url", {}).get("url", "")
                # Only inline data URIs are supported; other URLs are skipped.
                if image_url.startswith("data:image/"):
                    try:
                        mime_type, image_data = self._parse_image_data_uri(image_url)
                    except (ValueError, TypeError) as e:
                        logger.error(f"Failed to decode image data: {e}")
                        raise ValueError("Invalid image data URI") from e

        if not text_prompt:
            text_prompt = self._DEFAULT_PROMPT

        if not image_data:
            raise ValueError("No image data found in request")

        try:
            response = self.client.models.generate_content(
                model=config.model.gemini_model,
                contents=[
                    {
                        "parts": [
                            {"text": text_prompt},
                            {
                                "inline_data": {
                                    # Taken from the data URI header rather
                                    # than assumed, so PNG/WebP/etc. are
                                    # labelled correctly.
                                    "mime_type": mime_type,
                                    "data": base64.b64encode(image_data).decode()
                                }
                            }
                        ]
                    }
                ],
                config={
                    "temperature": config.model.temperature,
                    "max_output_tokens": 1024,  # Reasonable limit for image descriptions
                }
            )

            # Prefer the convenience ``text`` attribute; otherwise fall back to
            # the first part of the first candidate.
            response_text = ""
            if hasattr(response, "text") and response.text:
                response_text = response.text
            elif hasattr(response, "candidates") and response.candidates:
                candidate = response.candidates[0]
                if hasattr(candidate, "content") and candidate.content:
                    if hasattr(candidate.content, "parts") and candidate.content.parts:
                        response_text = candidate.content.parts[0].text

            if not response_text:
                logger.warning("Empty response from Gemini, using fallback")
                response_text = "Image processing completed but no description generated."

            return GeminiResponse(response_text)

        except Exception as e:
            logger.error(f"Gemini API error: {str(e)}")
            # Return a fallback response to avoid breaking MarkItDown
            return GeminiResponse(f"Image description unavailable due to processing error: {str(e)}")


class GeminiChoice:
    """Stand-in for one entry of an OpenAI response's ``choices`` list.

    Exposes a single attribute, ``message``, which wraps the generated text
    in a ``GeminiMessage``.
    """

    def __init__(self, content: str) -> None:
        wrapped_message = GeminiMessage(content)
        self.message = wrapped_message


class GeminiMessage:
    """Stand-in for the ``message`` object of an OpenAI chat completion choice.

    Only the ``content`` attribute is provided; it holds the text passed to
    the constructor, unchanged.
    """

    def __init__(self, content: str) -> None:
        self.content = content


class GeminiResponse:
    """Stand-in for an OpenAI ``ChatCompletion`` response object.

    ``choices`` is always a one-element list, so callers can read the text as
    ``response.choices[0].message.content``.
    """

    def __init__(self, content: str) -> None:
        only_choice = GeminiChoice(content)
        self.choices = [only_choice]


class GeminiClientWrapper:
    """
    OpenAI-shaped facade over a ``genai.Client`` for use with MarkItDown.

    Attributes set by the constructor:
        client: the underlying ``genai.Client``.
        chat: a ``GeminiChatCompletions`` bound to that client.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Build the Gemini client and its chat-completions adapter.

        Args:
            api_key: Google API key. When omitted or empty, the key from
                ``config.api.google_api_key`` is used instead.

        Raises:
            ImportError: If the ``google.genai`` import failed at module load.
            ValueError: If no API key is available from either source.
        """
        if not HAS_GEMINI:
            raise ImportError("google-genai package is required for Gemini support")

        # An explicitly passed key wins; otherwise fall back to the configured one.
        resolved_key = api_key if api_key else config.api.google_api_key
        if not resolved_key:
            raise ValueError("Google API key is required for Gemini client")

        gemini = genai.Client(api_key=resolved_key)
        self.client = gemini
        self.chat = GeminiChatCompletions(gemini)

        logger.info("Gemini client wrapper initialized for MarkItDown compatibility")

    @property
    def completions(self):
        """Return the same object as ``self.chat`` (convenience alias)."""
        return self.chat


def create_gemini_client_for_markitdown() -> Optional[GeminiClientWrapper]:
    """
    Build a ``GeminiClientWrapper`` for MarkItDown, or return ``None``.

    This factory never raises: a missing ``google.genai`` import, a missing
    ``config.api.google_api_key``, or any exception from the wrapper's
    constructor is logged and reported as ``None``.

    Returns:
        A ready ``GeminiClientWrapper``, or ``None`` if one could not be created.
    """
    # Work out why we cannot proceed, checking the import before the key.
    skip_reason = None
    if not HAS_GEMINI:
        skip_reason = "Gemini not available for MarkItDown image processing"
    elif not config.api.google_api_key:
        skip_reason = "No Google API key found for MarkItDown image processing"

    if skip_reason is not None:
        logger.warning(skip_reason)
        return None

    try:
        wrapper = GeminiClientWrapper()
    except Exception as e:
        logger.error(f"Failed to create Gemini client for MarkItDown: {e}")
        return None
    return wrapper


# Manual smoke test: run this module directly to check the wrapper can be built.
if __name__ == "__main__":
    # The factory returns None (after logging the reason) when Gemini is not
    # importable, no API key is configured, or construction fails.
    try:
        client = create_gemini_client_for_markitdown()
        if client:
            # The success marker was mis-encoded ("βœ…"); restored to the
            # intended check-mark character.
            print("✅ Gemini client wrapper created successfully")
            print("✅ Ready for MarkItDown integration")
        else:
            print("❌ Failed to create Gemini client wrapper")
    except Exception as e:
        # Last-resort guard so the script prints a message instead of a traceback.
        print(f"❌ Error: {e}")