Spaces:
Runtime error
Runtime error
""" | |
Gemini client wrapper that mimics OpenAI client interface for MarkItDown compatibility. | |
This allows us to use Gemini Flash 2.5 for image processing in MarkItDown. | |
""" | |
import logging | |
import base64 | |
from typing import List, Dict, Any, Optional | |
from pathlib import Path | |
try: | |
from google import genai | |
HAS_GEMINI = True | |
except ImportError: | |
HAS_GEMINI = False | |
from src.core.config import config | |
from src.core.logging_config import get_logger | |
logger = get_logger(__name__) | |
class GeminiChatCompletions:
    """Chat completions interface that mimics OpenAI's chat.completions API.

    The same object acts as the ``chat`` namespace and as its ``completions``
    endpoint, so both ``chat.create(...)`` and the OpenAI-style
    ``chat.completions.create(...)`` reach :meth:`create`.
    """

    # Media type used when the data URI header does not name a usable one.
    _DEFAULT_MIME_TYPE = "image/jpeg"
    # Prompt used when the request carries no text part.
    _DEFAULT_PROMPT = "Describe this image in detail."
    # Text returned when Gemini answers without any text.
    _EMPTY_RESPONSE_TEXT = "Image processing completed but no description generated."

    def __init__(self, client):
        """Store the Gemini client whose ``models.generate_content`` is called."""
        self.client = client

    @property
    def completions(self) -> 'GeminiChatCompletions':
        """Return ``self`` so ``chat.completions.create(...)`` resolves."""
        return self

    @staticmethod
    def _parse_data_uri(image_url: str) -> tuple:
        """Split a ``data:image/...`` URI into ``(mime_type, raw_bytes)``.

        The media type is read from the URI header (the part before the first
        comma); ``image/jpeg`` is used when the header does not contain a
        complete ``image/<subtype>`` value. The payload is base64-decoded.

        Raises:
            ValueError: if the URI has no comma separator or the payload is
                not valid base64 (``binascii.Error`` is a ``ValueError``).
        """
        header, separator, payload = image_url.partition(",")
        if not separator:
            raise ValueError("data URI has no ',' separator")

        # Header looks like "data:image/png;base64": drop the scheme and any
        # ";param" suffixes to isolate the media type.
        mime_type = header[len("data:"):].split(";", 1)[0].strip().lower()
        main_type, _, subtype = mime_type.partition("/")
        if main_type != "image" or not subtype:
            mime_type = GeminiChatCompletions._DEFAULT_MIME_TYPE

        return mime_type, base64.b64decode(payload)

    @staticmethod
    def _extract_text(response) -> str:
        """Return the text of a Gemini response, or ``""`` when it has none."""
        direct_text = getattr(response, "text", None)
        if direct_text:
            return direct_text

        candidates = getattr(response, "candidates", None)
        if not candidates:
            return ""
        candidate_content = getattr(candidates[0], "content", None)
        parts = getattr(candidate_content, "parts", None) or []
        # Concatenate every text-bearing part instead of reading only the first.
        return "".join(getattr(part, "text", None) or "" for part in parts)

    def create(self, model: str, messages: List[Dict[str, Any]], **kwargs) -> 'GeminiResponse':
        """Create a chat completion that mimics OpenAI's API.

        Args:
            model: Accepted for signature compatibility; the request always
                uses ``config.model.gemini_model``.
            messages: OpenAI-style messages. The first message whose role is
                ``"user"`` is used; its content may be a string or a list of
                ``{"type": "text"}`` / ``{"type": "image_url"}`` parts.
            **kwargs: Accepted and ignored.

        Returns:
            A ``GeminiResponse`` holding the description. If the Gemini call
            itself fails, the response carries an error message instead of
            raising, so the caller's pipeline is not interrupted.

        Raises:
            ValueError: if ``messages`` is empty, has no user message, carries
                an undecodable image data URI, or contains no image data.
        """
        if not messages:
            raise ValueError("Messages cannot be empty")

        # Extract the user message (MarkItDown sends a single user message with text + image)
        user_message = next(
            (msg for msg in messages if msg.get("role") == "user"), None
        )
        if not user_message:
            raise ValueError("No user message found")

        content = user_message.get("content") or []
        if not isinstance(content, list):
            content = [{"type": "text", "text": str(content)}]

        # Collect the text prompt and the image payload from the content parts.
        text_parts = []
        image_data = None
        mime_type = self._DEFAULT_MIME_TYPE
        for item in content:
            if not isinstance(item, dict):
                continue  # ignore parts that are not OpenAI-style dicts
            item_type = item.get("type")
            if item_type == "text":
                text = item.get("text")
                if text:
                    text_parts.append(str(text))
            elif item_type == "image_url":
                image_ref = item.get("image_url") or {}
                image_url = image_ref.get("url") if isinstance(image_ref, dict) else image_ref
                # Only inline data URIs are supported; other URLs are skipped.
                if isinstance(image_url, str) and image_url.startswith("data:image/"):
                    try:
                        mime_type, image_data = self._parse_data_uri(image_url)
                    except ValueError as e:
                        logger.error("Failed to decode image data: %s", e)
                        raise ValueError("Invalid image data URI") from e

        text_prompt = "\n".join(text_parts) or self._DEFAULT_PROMPT
        if not image_data:
            raise ValueError("No image data found in request")

        try:
            # Use Gemini to process the image
            # NOTE(review): the payload is sent base64-encoded as text, as
            # before; confirm against the google-genai Blob documentation
            # whether raw bytes are expected here instead.
            response = self.client.models.generate_content(
                model=config.model.gemini_model,
                contents=[
                    {
                        "parts": [
                            {"text": text_prompt},
                            {
                                "inline_data": {
                                    "mime_type": mime_type,
                                    "data": base64.b64encode(image_data).decode(),
                                }
                            },
                        ]
                    }
                ],
                config={
                    "temperature": config.model.temperature,
                    "max_output_tokens": 1024,  # Reasonable limit for image descriptions
                },
            )

            response_text = self._extract_text(response)
            if not response_text:
                logger.warning("Empty response from Gemini, using fallback")
                response_text = self._EMPTY_RESPONSE_TEXT
            return GeminiResponse(response_text)
        except Exception as e:
            # Deliberate best-effort boundary: report the failure as the
            # description text rather than breaking MarkItDown.
            logger.error("Gemini API error: %s", e)
            return GeminiResponse(f"Image description unavailable due to processing error: {str(e)}")
class GeminiChoice:
    """Mimics OpenAI's Choice object.

    Exposes a single attribute, ``message``, which is a ``GeminiMessage``
    built from the given text.
    """

    def __init__(self, content: str) -> None:
        wrapped_message = GeminiMessage(content)
        self.message = wrapped_message
class GeminiMessage:
    """Mimics OpenAI's Message object.

    Exposes a single attribute, ``content``, holding the text passed to the
    constructor unchanged.
    """

    def __init__(self, content: str) -> None:
        self.content: str = content
class GeminiResponse:
    """Mimics OpenAI's ChatCompletion response.

    Exposes ``choices``, a list containing exactly one ``GeminiChoice`` built
    from the given text.
    """

    def __init__(self, content: str) -> None:
        sole_choice = GeminiChoice(content)
        self.choices = [sole_choice]
class GeminiClientWrapper:
    """
    OpenAI-shaped facade over a Gemini client, intended for MarkItDown.

    Construction creates a ``genai.Client`` (stored as ``client``) and a
    ``GeminiChatCompletions`` bound to it (stored as ``chat``).
    """

    def __init__(self, api_key: Optional[str] = None):
        """Build the underlying Gemini client.

        Args:
            api_key: Google API key. When falsy, ``config.api.google_api_key``
                is used instead.

        Raises:
            ImportError: if the ``google.genai`` import failed at module load.
            ValueError: if no API key is available from either source.
        """
        if not HAS_GEMINI:
            raise ImportError("google-genai package is required for Gemini support")

        resolved_key = api_key if api_key else config.api.google_api_key
        if not resolved_key:
            raise ValueError("Google API key is required for Gemini client")

        gemini_client = genai.Client(api_key=resolved_key)
        self.client = gemini_client
        self.chat = GeminiChatCompletions(gemini_client)
        logger.info("Gemini client wrapper initialized for MarkItDown compatibility")

    def completions(self):
        """Return the object stored in ``self.chat``."""
        chat_interface = self.chat
        return chat_interface
def create_gemini_client_for_markitdown() -> Optional[GeminiClientWrapper]:
    """
    Build a Gemini client wrapper for MarkItDown, or report why it cannot be built.

    Each failure mode (package missing, no API key configured, constructor
    error) is logged and turned into a ``None`` result rather than an exception.

    Returns:
        GeminiClientWrapper if Gemini is available and configured, None otherwise.
    """
    wrapper = None
    if not HAS_GEMINI:
        logger.warning("Gemini not available for MarkItDown image processing")
    elif not config.api.google_api_key:
        logger.warning("No Google API key found for MarkItDown image processing")
    else:
        try:
            wrapper = GeminiClientWrapper()
        except Exception as exc:
            logger.error(f"Failed to create Gemini client for MarkItDown: {exc}")
    return wrapper
# For testing purposes
if __name__ == "__main__":
    # Manual smoke test: try to build the wrapper and print the outcome.
    # Success and failure use distinct markers so the result is readable at a
    # glance (the previous text showed the same garbled glyph for both).
    try:
        client = create_gemini_client_for_markitdown()
        if client:
            print("✅ Gemini client wrapper created successfully")
            print("✅ Ready for MarkItDown integration")
        else:
            print("❌ Failed to create Gemini client wrapper")
    except Exception as e:
        print(f"❌ Error: {e}")