# Markit_v2/src/core/gemini_client_wrapper.py
# Commit 033e4ba (AnseMin): Integrate Gemini API for enhanced image processing in MarkItDown
"""
Gemini client wrapper that mimics OpenAI client interface for MarkItDown compatibility.
This allows us to use Gemini Flash 2.5 for image processing in MarkItDown.
"""
import logging
import base64
from typing import List, Dict, Any, Optional
from pathlib import Path
try:
from google import genai
HAS_GEMINI = True
except ImportError:
HAS_GEMINI = False
from src.core.config import config
from src.core.logging_config import get_logger
logger = get_logger(__name__)
class GeminiChatCompletions:
    """Chat completions interface that mimics OpenAI's chat.completions API.

    The object accepts an OpenAI-style ``create(model=..., messages=...)``
    call carrying one user message with a text prompt and a base64 data-URI
    image, forwards it to Gemini, and returns a ``GeminiResponse`` whose
    ``choices[0].message.content`` holds the generated description.
    """

    # Used when the data URI header does not carry a usable image media type.
    _DEFAULT_MIME_TYPE = "image/jpeg"

    def __init__(self, client):
        """Store the Gemini SDK client used to issue ``generate_content`` calls."""
        self.client = client

    @property
    def completions(self) -> "GeminiChatCompletions":
        """Return this object so ``client.chat.completions.create(...)`` resolves.

        OpenAI-style callers reach the API through ``chat.completions``.
        ``GeminiClientWrapper`` exposes this object directly as ``chat``, so
        without this alias that attribute chain raises ``AttributeError``.
        ``chat.create(...)`` keeps working unchanged.
        """
        return self

    @staticmethod
    def _decode_data_uri(data_uri: str) -> tuple:
        """Split a base64 ``data:`` URI into ``(mime_type, raw_bytes)``.

        Args:
            data_uri: A string shaped like ``data:image/png;base64,<payload>``.

        Returns:
            A ``(mime_type, raw_bytes)`` tuple. ``mime_type`` falls back to
            ``image/jpeg`` when the header names no image media type.

        Raises:
            ValueError: If the URI has no comma separator or the payload is
                not valid base64 (``binascii.Error`` is a ``ValueError``).
        """
        header, encoded = data_uri.split(",", 1)
        # Header looks like "data:image/png;base64": keep only the media type.
        mime_type = header.partition(":")[2].partition(";")[0].strip().lower()
        if not mime_type.startswith("image/") or mime_type == "image/":
            mime_type = GeminiChatCompletions._DEFAULT_MIME_TYPE
        return mime_type, base64.b64decode(encoded)

    def create(self, model: str, messages: List[Dict[str, Any]], **kwargs) -> 'GeminiResponse':
        """Create a chat completion that mimics OpenAI's API.

        Args:
            model: Accepted for OpenAI interface compatibility only; the
                request is always sent with ``config.model.gemini_model``.
            messages: OpenAI-style message dicts. The first message whose
                role is ``"user"`` is used; its content is either a plain
                value (treated as text) or a list of ``text`` / ``image_url``
                items.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            A ``GeminiResponse`` wrapping the generated text. If the Gemini
            call fails or yields no text, a placeholder description is
            returned instead of raising, so the caller's conversion continues.

        Raises:
            ValueError: If ``messages`` is empty, contains no user message,
                carries an undecodable image data URI, or has no image.
        """
        if not messages:
            raise ValueError("Messages cannot be empty")

        # MarkItDown sends a single user message with text + image.
        user_message = next(
            (msg for msg in messages if msg.get("role") == "user"), None
        )
        if not user_message:
            raise ValueError("No user message found")

        content = user_message.get("content", [])
        if not isinstance(content, list):
            content = [{"type": "text", "text": str(content)}]

        # Extract the text prompt and the image (with its declared media type).
        text_prompt = ""
        image_data = None
        mime_type = self._DEFAULT_MIME_TYPE
        for item in content:
            if not isinstance(item, dict):
                # Skip malformed items rather than failing with AttributeError.
                continue
            item_type = item.get("type")
            if item_type == "text":
                text_prompt = item.get("text", "")
            elif item_type == "image_url":
                image_ref = item.get("image_url") or {}
                image_url = image_ref.get("url", "") if isinstance(image_ref, dict) else image_ref
                if isinstance(image_url, str) and image_url.startswith("data:image/"):
                    try:
                        mime_type, image_data = self._decode_data_uri(image_url)
                    except ValueError as e:
                        logger.error(f"Failed to decode image data: {e}")
                        raise ValueError("Invalid image data URI") from e

        if not text_prompt:
            text_prompt = "Describe this image in detail."
        if not image_data:
            raise ValueError("No image data found in request")

        try:
            # Use Gemini to process the image.
            response = self.client.models.generate_content(
                model=config.model.gemini_model,
                contents=[
                    {
                        "parts": [
                            {"text": text_prompt},
                            {
                                "inline_data": {
                                    # Media type taken from the data URI header.
                                    "mime_type": mime_type,
                                    "data": base64.b64encode(image_data).decode(),
                                }
                            },
                        ]
                    }
                ],
                config={
                    "temperature": config.model.temperature,
                    "max_output_tokens": 1024,  # Reasonable limit for image descriptions
                },
            )

            # Prefer the aggregated text; otherwise read the first candidate part.
            response_text = ""
            if hasattr(response, "text") and response.text:
                response_text = response.text
            elif hasattr(response, "candidates") and response.candidates:
                candidate = response.candidates[0]
                candidate_content = getattr(candidate, "content", None)
                parts = getattr(candidate_content, "parts", None)
                if parts:
                    response_text = parts[0].text

            if not response_text:
                logger.warning("Empty response from Gemini, using fallback")
                response_text = "Image processing completed but no description generated."
            return GeminiResponse(response_text)
        except Exception as e:
            logger.error(f"Gemini API error: {str(e)}")
            # Return a fallback response to avoid breaking MarkItDown
            return GeminiResponse(f"Image description unavailable due to processing error: {str(e)}")
class GeminiChoice:
    """Single completion choice, shaped like OpenAI's ``Choice`` object.

    Exposes one attribute, ``message``, holding a ``GeminiMessage`` built
    from the supplied text.
    """

    def __init__(self, content: str):
        # Wrap the raw text so callers can read ``choice.message.content``.
        wrapped_message = GeminiMessage(content)
        self.message = wrapped_message
class GeminiMessage:
    """Message payload, shaped like OpenAI's ``Message`` object.

    Exposes one attribute, ``content``, holding the text it was built with.
    """

    def __init__(self, content: str):
        # Stored unchanged; no validation or normalisation is applied.
        self.content: str = content
class GeminiResponse:
    """Top-level result, shaped like OpenAI's ``ChatCompletion`` response.

    Exposes ``choices``, a one-element list containing a ``GeminiChoice``
    built from the supplied text, so ``response.choices[0].message.content``
    yields that text.
    """

    def __init__(self, content: str):
        # Exactly one choice is ever produced.
        sole_choice = GeminiChoice(content)
        self.choices = [sole_choice]
class GeminiClientWrapper:
    """
    OpenAI-lookalike facade over a Gemini SDK client, intended for MarkItDown.

    MarkItDown is handed this object in place of an OpenAI client, so its
    image-description requests are served by Gemini instead.

    Attributes set by ``__init__``:
        client: The underlying ``genai.Client``.
        chat: A ``GeminiChatCompletions`` bound to that client.
    """

    def __init__(self, api_key: Optional[str] = None):
        """Build the Gemini client and the chat facade.

        Args:
            api_key: Google API key. When falsy, ``config.api.google_api_key``
                is used instead.

        Raises:
            ImportError: If the ``google-genai`` package could not be imported.
            ValueError: If no API key is available from either source.
        """
        if not HAS_GEMINI:
            raise ImportError("google-genai package is required for Gemini support")

        # An explicit argument wins; the configured key is only a fallback.
        resolved_key = api_key if api_key else config.api.google_api_key
        if not resolved_key:
            raise ValueError("Google API key is required for Gemini client")

        sdk_client = genai.Client(api_key=resolved_key)
        self.client = sdk_client
        self.chat = GeminiChatCompletions(sdk_client)
        logger.info("Gemini client wrapper initialized for MarkItDown compatibility")

    @property
    def completions(self):
        """Return the same object as ``chat``, for callers using ``client.completions``."""
        chat_interface = self.chat
        return chat_interface
def create_gemini_client_for_markitdown() -> Optional[GeminiClientWrapper]:
    """
    Build a ``GeminiClientWrapper`` for MarkItDown, or report why it cannot.

    Every failure path logs a message and yields ``None`` instead of raising,
    so callers can treat Gemini image processing as optional.

    Returns:
        A ready ``GeminiClientWrapper`` when the SDK imported successfully, a
        Google API key is configured and construction succeeds; otherwise
        ``None``.
    """
    # Precondition 1: the google-genai import at module load must have worked.
    if not HAS_GEMINI:
        logger.warning("Gemini not available for MarkItDown image processing")
        return None

    # Precondition 2: a key must be present in the configuration.
    configured_key = config.api.google_api_key
    if not configured_key:
        logger.warning("No Google API key found for MarkItDown image processing")
        return None

    wrapper = None
    try:
        wrapper = GeminiClientWrapper()
    except Exception as e:
        # Construction problems are logged and mapped to the None result.
        logger.error(f"Failed to create Gemini client for MarkItDown: {e}")
    return wrapper
# For testing purposes
if __name__ == "__main__":
    # Manual smoke test: try to build the wrapper and print the outcome.
    # The status markers were mojibake (UTF-8 emoji decoded with the wrong
    # codec); they are restored to the check-mark / cross-mark characters.
    try:
        client = create_gemini_client_for_markitdown()
        if client:
            print("✅ Gemini client wrapper created successfully")
            print("✅ Ready for MarkItDown integration")
        else:
            print("❌ Failed to create Gemini client wrapper")
    except Exception as e:
        print(f"❌ Error: {e}")