Spaces:
Sleeping
Sleeping
Integrate Gemini API for enhanced image processing in MarkItDown
Browse files
- Updated `app.py` to reflect the use of Gemini for image processing in MarkItDown.
- Modified `requirements.txt` to include `ffmpeg-python` for audio processing and removed OpenAI dependency.
- Enhanced `setup.sh` to install Gemini dependencies and updated installation instructions.
- Introduced `gemini_client_wrapper.py` to create a wrapper for Gemini API, mimicking OpenAI's interface for compatibility with MarkItDown.
- Added tests in `test_gemini_wrapper.py` to validate Gemini integration and MarkItDown functionality.
- Refactored `markitdown_parser.py` to utilize Gemini for image files while maintaining standard processing for other formats.
- Updated parser names and descriptions for clarity across various parsers.
- app.py +1 -1
- requirements.txt +2 -1
- setup.sh +2 -2
- src/core/gemini_client_wrapper.py +198 -0
- src/parsers/docling_parser.py +1 -1
- src/parsers/got_ocr_parser.py +1 -1
- src/parsers/markitdown_parser.py +95 -24
- src/parsers/mistral_ocr_parser.py +1 -1
- src/ui/components/document_converter.py +1 -1
- test_gemini_wrapper.py +94 -0
app.py
CHANGED
@@ -35,7 +35,7 @@ except ImportError as e:
|
|
35 |
|
36 |
try:
|
37 |
from markitdown import MarkItDown
|
38 |
-
print("MarkItDown is available")
|
39 |
except ImportError:
|
40 |
print("Installing MarkItDown...")
|
41 |
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "markitdown[all]"], check=False)
|
|
|
35 |
|
36 |
try:
|
37 |
from markitdown import MarkItDown
|
38 |
+
print("MarkItDown is available (using Gemini for image processing)")
|
39 |
except ImportError:
|
40 |
print("Installing MarkItDown...")
|
41 |
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "markitdown[all]"], check=False)
|
requirements.txt
CHANGED
@@ -30,7 +30,8 @@ huggingface_hub[cli]>=0.19.0
|
|
30 |
|
31 |
# MarkItDown and its dependencies
|
32 |
markitdown[all]
|
33 |
-
|
|
|
34 |
|
35 |
# Docling dependencies
|
36 |
docling
|
|
|
30 |
|
31 |
# MarkItDown and its dependencies
|
32 |
markitdown[all]
|
33 |
+
ffmpeg-python # For audio processing in MarkItDown
|
34 |
+
# Note: Using Gemini Flash 2.5 for LLM image descriptions instead of OpenAI
|
35 |
|
36 |
# Docling dependencies
|
37 |
docling
|
setup.sh
CHANGED
@@ -30,8 +30,7 @@ echo "NumPy installed successfully"
|
|
30 |
echo "Installing Python dependencies..."
|
31 |
pip install -q -U pillow opencv-python
|
32 |
pip install -q -U google-genai
|
33 |
-
|
34 |
-
# pip install -q -U latex2markdown - removed, now using Gemini API for LaTeX conversion
|
35 |
echo "Python dependencies installed successfully"
|
36 |
|
37 |
# Install GOT-OCR transformers dependencies
|
@@ -50,6 +49,7 @@ echo "Spaces module installed successfully"
|
|
50 |
# Install markitdown with all optional dependencies
|
51 |
echo "Installing MarkItDown with all dependencies..."
|
52 |
pip install -q -U 'markitdown[all]'
|
|
|
53 |
echo "MarkItDown installed successfully"
|
54 |
|
55 |
# Install Docling for advanced PDF understanding
|
|
|
30 |
echo "Installing Python dependencies..."
|
31 |
pip install -q -U pillow opencv-python
|
32 |
pip install -q -U google-genai
|
33 |
+
# Note: Using Gemini Flash 2.5 for LLM image descriptions in MarkItDown instead of OpenAI
|
|
|
34 |
echo "Python dependencies installed successfully"
|
35 |
|
36 |
# Install GOT-OCR transformers dependencies
|
|
|
49 |
# Install markitdown with all optional dependencies
|
50 |
echo "Installing MarkItDown with all dependencies..."
|
51 |
pip install -q -U 'markitdown[all]'
|
52 |
+
pip install -q -U ffmpeg-python # For audio processing
|
53 |
echo "MarkItDown installed successfully"
|
54 |
|
55 |
# Install Docling for advanced PDF understanding
|
src/core/gemini_client_wrapper.py
ADDED
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Gemini client wrapper that mimics OpenAI client interface for MarkItDown compatibility.
|
3 |
+
This allows us to use Gemini Flash 2.5 for image processing in MarkItDown.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import logging
|
7 |
+
import base64
|
8 |
+
from typing import List, Dict, Any, Optional
|
9 |
+
from pathlib import Path
|
10 |
+
|
11 |
+
try:
|
12 |
+
from google import genai
|
13 |
+
HAS_GEMINI = True
|
14 |
+
except ImportError:
|
15 |
+
HAS_GEMINI = False
|
16 |
+
|
17 |
+
from src.core.config import config
|
18 |
+
from src.core.logging_config import get_logger
|
19 |
+
|
20 |
+
logger = get_logger(__name__)
|
21 |
+
|
22 |
+
|
23 |
+
class GeminiChatCompletions:
    """OpenAI-style chat-completions facade that answers with Gemini.

    OpenAI-style callers reach the completion endpoint through the attribute
    chain ``client.chat.completions.create(...)``. ``GeminiClientWrapper``
    exposes an instance of this class as ``client.chat``, so the
    ``completions`` property returns the instance itself to make that chain
    resolve to :meth:`create`.
    """

    # Prompt used when the request carries no text part.
    _DEFAULT_PROMPT = "Describe this image in detail."
    # MIME type assumed when the data URI header does not name a usable one.
    _DEFAULT_MIME_TYPE = "image/jpeg"
    # Reasonable limit for image descriptions.
    _MAX_OUTPUT_TOKENS = 1024

    def __init__(self, client):
        """Store the Gemini client whose ``models.generate_content`` is called."""
        self.client = client

    @property
    def completions(self) -> 'GeminiChatCompletions':
        """Return ``self`` so ``client.chat.completions.create(...)`` works."""
        return self

    @staticmethod
    def _parse_data_uri(image_url: str) -> "tuple":
        """Split a ``data:image/...;base64,...`` URI into ``(mime_type, bytes)``.

        Args:
            image_url: The full data URI.

        Returns:
            A ``(mime_type, image_bytes)`` tuple. ``mime_type`` falls back to
            ``image/jpeg`` when the header does not name an image subtype.

        Raises:
            ValueError: If the URI has no ``,`` separator or the payload is not
                decodable base64 (``binascii.Error`` subclasses ``ValueError``).
        """
        header, separator, payload = image_url.partition(",")
        if not separator:
            raise ValueError("data URI has no ',' separator")

        # Header looks like "data:image/png;base64"; keep only the media type.
        media_type = header[len("data:"):].split(";", 1)[0].strip().lower()
        if not media_type.startswith("image/") or media_type == "image/":
            media_type = GeminiChatCompletions._DEFAULT_MIME_TYPE

        return media_type, base64.b64decode(payload)

    @staticmethod
    def _extract_text(response) -> str:
        """Return the text of a Gemini response, or ``""`` when there is none.

        Prefers ``response.text``; otherwise concatenates the ``text`` of every
        part of the first candidate, skipping parts without text.
        """
        text = getattr(response, "text", None)
        if text:
            return text

        candidates = getattr(response, "candidates", None)
        if not candidates:
            return ""

        content = getattr(candidates[0], "content", None)
        parts = getattr(content, "parts", None) or []
        return "".join(getattr(part, "text", None) or "" for part in parts)

    def create(self, model: str, messages: List[Dict[str, Any]], **kwargs) -> 'GeminiResponse':
        """Create a chat completion that mimics OpenAI's API.

        Args:
            model: Accepted for OpenAI interface compatibility. The Gemini model
                actually used is ``config.model.gemini_model``.
            messages: OpenAI-style messages. The first ``"user"`` message must
                contain an ``image_url`` item holding a ``data:image/`` URI and
                may contain a ``text`` item with the prompt.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A ``GeminiResponse`` whose ``choices[0].message.content`` holds the
            description. If the Gemini call fails or returns nothing, a fallback
            message is returned instead of raising.

        Raises:
            ValueError: If ``messages`` is empty, has no user message, carries
                an undecodable data URI, or contains no image data.
        """
        if not messages:
            raise ValueError("Messages cannot be empty")

        # Extract the user message (MarkItDown sends a single user message with text + image)
        user_message = next(
            (msg for msg in messages if msg.get("role") == "user"), None
        )
        if not user_message:
            raise ValueError("No user message found")

        content = user_message.get("content", [])
        if not isinstance(content, list):
            content = [{"type": "text", "text": str(content)}]

        # Extract text prompt and image
        text_prompt = ""
        image_data = None
        mime_type = self._DEFAULT_MIME_TYPE

        for item in content:
            if not isinstance(item, dict):
                # Ignore malformed entries instead of failing on ``.get``.
                continue
            item_type = item.get("type")
            if item_type == "text":
                text_prompt = item.get("text", "")
            elif item_type == "image_url":
                image_url = item.get("image_url", {}).get("url", "")
                if image_url.startswith("data:image/"):
                    try:
                        mime_type, image_data = self._parse_data_uri(image_url)
                    except Exception as e:
                        logger.error(f"Failed to decode image data: {e}")
                        raise ValueError("Invalid image data URI") from e

        if not text_prompt:
            text_prompt = self._DEFAULT_PROMPT

        if not image_data:
            raise ValueError("No image data found in request")

        try:
            # Use Gemini to process the image
            response = self.client.models.generate_content(
                model=config.model.gemini_model,
                contents=[
                    {
                        "parts": [
                            {"text": text_prompt},
                            {
                                "inline_data": {
                                    # Taken from the data URI header rather than assumed.
                                    "mime_type": mime_type,
                                    # Raw bytes: the SDK's blob field is bytes-typed,
                                    # so no re-encoding to a base64 string is needed.
                                    "data": image_data,
                                }
                            },
                        ]
                    }
                ],
                config={
                    "temperature": config.model.temperature,
                    "max_output_tokens": self._MAX_OUTPUT_TOKENS,
                },
            )

            response_text = self._extract_text(response)
            if not response_text:
                logger.warning("Empty response from Gemini, using fallback")
                response_text = "Image processing completed but no description generated."

            return GeminiResponse(response_text)

        except Exception as e:
            logger.error(f"Gemini API error: {str(e)}")
            # Return a fallback response to avoid breaking MarkItDown
            return GeminiResponse(f"Image description unavailable due to processing error: {str(e)}")
|
115 |
+
|
116 |
+
|
117 |
+
class GeminiChoice:
    """Stand-in for one entry of an OpenAI ``choices`` list.

    Wraps the generated text in a ``GeminiMessage`` so callers can read it
    as ``choice.message.content``.
    """

    def __init__(self, content: str):
        wrapped = GeminiMessage(content)
        self.message = wrapped
|
122 |
+
|
123 |
+
|
124 |
+
class GeminiMessage:
    """Stand-in for an OpenAI chat message; exposes the text as ``content``."""

    def __init__(self, content: str):
        # The generated text, stored unchanged.
        self.content: str = content
|
129 |
+
|
130 |
+
|
131 |
+
class GeminiResponse:
    """Stand-in for an OpenAI ``ChatCompletion`` result.

    Holds exactly one ``GeminiChoice`` so that the generated text is
    reachable as ``response.choices[0].message.content``.
    """

    def __init__(self, content: str):
        only_choice = GeminiChoice(content)
        self.choices = [only_choice]
|
136 |
+
|
137 |
+
|
138 |
+
class GeminiClientWrapper:
    """
    OpenAI-shaped front for a Gemini client, intended for MarkItDown.

    MarkItDown is handed this object where it expects an OpenAI client, and
    image requests are served by Gemini instead.
    """

    def __init__(self, api_key: Optional[str] = None):
        """Build the underlying Gemini client.

        Args:
            api_key: Google API key. When omitted or empty,
                ``config.api.google_api_key`` is used.

        Raises:
            ImportError: If the google-genai package could not be imported.
            ValueError: If no API key is available from either source.
        """
        if not HAS_GEMINI:
            raise ImportError("google-genai package is required for Gemini support")

        # Only consult the configuration when no key was passed explicitly.
        resolved_key = api_key if api_key else config.api.google_api_key
        if not resolved_key:
            raise ValueError("Google API key is required for Gemini client")

        gemini = genai.Client(api_key=resolved_key)
        self.client = gemini
        self.chat = GeminiChatCompletions(gemini)

        logger.info("Gemini client wrapper initialized for MarkItDown compatibility")

    @property
    def completions(self):
        """Return the same object as ``chat``, for callers using ``client.completions``."""
        return self.chat
|
163 |
+
|
164 |
+
|
165 |
+
def create_gemini_client_for_markitdown() -> Optional[GeminiClientWrapper]:
    """
    Build a ``GeminiClientWrapper`` for MarkItDown, or report why it cannot be built.

    Returns:
        A ready wrapper, or None when the Gemini package is missing, no Google
        API key is configured, or constructing the wrapper raises. Each of
        these cases is logged rather than raised.
    """
    if not HAS_GEMINI:
        logger.warning("Gemini not available for MarkItDown image processing")
        return None

    if not config.api.google_api_key:
        logger.warning("No Google API key found for MarkItDown image processing")
        return None

    try:
        wrapper = GeminiClientWrapper()
    except Exception as e:
        logger.error(f"Failed to create Gemini client for MarkItDown: {e}")
        return None
    return wrapper
|
185 |
+
|
186 |
+
|
187 |
+
# For testing purposes
if __name__ == "__main__":
    # Smoke test: try to build the wrapper and report the outcome on stdout.
    try:
        client = create_gemini_client_for_markitdown()
        if client:
            print("✅ Gemini client wrapper created successfully")
            print("✅ Ready for MarkItDown integration")
        else:
            print("❌ Failed to create Gemini client wrapper")
    except Exception as e:
        print(f"❌ Error: {e}")
|
src/parsers/docling_parser.py
CHANGED
@@ -132,7 +132,7 @@ class DoclingParser(DocumentParser):
|
|
132 |
|
133 |
@classmethod
|
134 |
def get_name(cls) -> str:
|
135 |
-
return "Docling
|
136 |
|
137 |
@classmethod
|
138 |
def get_supported_file_types(cls) -> Set[str]:
|
|
|
132 |
|
133 |
@classmethod
|
134 |
def get_name(cls) -> str:
|
135 |
+
return "Docling"
|
136 |
|
137 |
@classmethod
|
138 |
def get_supported_file_types(cls) -> Set[str]:
|
src/parsers/got_ocr_parser.py
CHANGED
@@ -41,7 +41,7 @@ class GotOcrParser(DocumentParser):
|
|
41 |
|
42 |
@classmethod
|
43 |
def get_name(cls) -> str:
|
44 |
-
return "GOT-OCR
|
45 |
|
46 |
@classmethod
|
47 |
def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
|
|
|
41 |
|
42 |
@classmethod
|
43 |
def get_name(cls) -> str:
|
44 |
+
return "GOT-OCR"
|
45 |
|
46 |
@classmethod
|
47 |
def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
|
src/parsers/markitdown_parser.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
import logging
|
2 |
import os
|
|
|
|
|
3 |
from pathlib import Path
|
4 |
from typing import Dict, List, Optional, Any, Union, Set
|
5 |
import io
|
@@ -12,12 +14,18 @@ from src.core.exceptions import DocumentProcessingError, ParserError
|
|
12 |
# Check for MarkItDown availability
|
13 |
try:
|
14 |
from markitdown import MarkItDown
|
15 |
-
from openai import OpenAI
|
16 |
HAS_MARKITDOWN = True
|
17 |
except ImportError:
|
18 |
HAS_MARKITDOWN = False
|
19 |
logging.warning("MarkItDown package not installed. Please install with 'pip install markitdown[all]'")
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
# Configure logging
|
22 |
logger = logging.getLogger(__name__)
|
23 |
logger.setLevel(logging.DEBUG)
|
@@ -33,19 +41,10 @@ class MarkItDownParser(DocumentParser):
|
|
33 |
# Initialize MarkItDown instance
|
34 |
if HAS_MARKITDOWN:
|
35 |
try:
|
36 |
-
#
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
self.markdown_instance = MarkItDown(
|
41 |
-
enable_plugins=False,
|
42 |
-
llm_client=client,
|
43 |
-
llm_model="gpt-4o"
|
44 |
-
)
|
45 |
-
logger.info("MarkItDown initialized with OpenAI support for image descriptions")
|
46 |
-
else:
|
47 |
-
self.markdown_instance = MarkItDown(enable_plugins=False)
|
48 |
-
logger.info("MarkItDown initialized without OpenAI support")
|
49 |
except Exception as e:
|
50 |
logger.error(f"Error initializing MarkItDown: {str(e)}")
|
51 |
self.markdown_instance = None
|
@@ -72,23 +71,95 @@ class MarkItDownParser(DocumentParser):
|
|
72 |
# Check for cancellation before starting
|
73 |
if self._check_cancellation():
|
74 |
raise DocumentProcessingError("Conversion cancelled")
|
75 |
-
|
|
|
|
|
|
|
76 |
try:
|
77 |
-
#
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
|
84 |
-
|
|
|
|
|
85 |
except Exception as e:
|
86 |
logger.error(f"Error converting file with MarkItDown: {str(e)}")
|
87 |
raise DocumentProcessingError(f"MarkItDown conversion failed: {str(e)}")
|
88 |
|
89 |
@classmethod
|
90 |
def get_name(cls) -> str:
|
91 |
-
return "MarkItDown
|
92 |
|
93 |
@classmethod
|
94 |
def get_supported_file_types(cls) -> Set[str]:
|
@@ -112,7 +183,7 @@ class MarkItDownParser(DocumentParser):
|
|
112 |
|
113 |
@classmethod
|
114 |
def get_description(cls) -> str:
|
115 |
-
return "MarkItDown parser for converting various file formats to Markdown"
|
116 |
|
117 |
|
118 |
# Register the parser with the registry if available
|
|
|
1 |
import logging
|
2 |
import os
|
3 |
+
import threading
|
4 |
+
import time
|
5 |
from pathlib import Path
|
6 |
from typing import Dict, List, Optional, Any, Union, Set
|
7 |
import io
|
|
|
14 |
# Check for MarkItDown availability
|
15 |
try:
|
16 |
from markitdown import MarkItDown
|
|
|
17 |
HAS_MARKITDOWN = True
|
18 |
except ImportError:
|
19 |
HAS_MARKITDOWN = False
|
20 |
logging.warning("MarkItDown package not installed. Please install with 'pip install markitdown[all]'")
|
21 |
|
22 |
+
# Import our Gemini wrapper for LLM support
|
23 |
+
try:
|
24 |
+
from src.core.gemini_client_wrapper import create_gemini_client_for_markitdown
|
25 |
+
HAS_GEMINI_WRAPPER = True
|
26 |
+
except ImportError:
|
27 |
+
HAS_GEMINI_WRAPPER = False
|
28 |
+
|
29 |
# Configure logging
|
30 |
logger = logging.getLogger(__name__)
|
31 |
logger.setLevel(logging.DEBUG)
|
|
|
41 |
# Initialize MarkItDown instance
|
42 |
if HAS_MARKITDOWN:
|
43 |
try:
|
44 |
+
# Initialize MarkItDown without LLM client for better performance
|
45 |
+
# LLM client will only be used for image files when needed
|
46 |
+
self.markdown_instance = MarkItDown()
|
47 |
+
logger.info("MarkItDown initialized successfully")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
except Exception as e:
|
49 |
logger.error(f"Error initializing MarkItDown: {str(e)}")
|
50 |
self.markdown_instance = None
|
|
|
71 |
# Check for cancellation before starting
|
72 |
if self._check_cancellation():
|
73 |
raise DocumentProcessingError("Conversion cancelled")
|
74 |
+
|
75 |
+
file_path_str = str(file_path)
|
76 |
+
file_ext = Path(file_path).suffix.lower()
|
77 |
+
|
78 |
try:
|
79 |
+
# Run conversion in a separate thread to support cancellation
|
80 |
+
result_container = {"result": None, "error": None, "completed": False}
|
81 |
+
|
82 |
+
def conversion_worker():
|
83 |
+
try:
|
84 |
+
# For image files, potentially use LLM if available
|
85 |
+
if file_ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']:
|
86 |
+
if HAS_GEMINI_WRAPPER:
|
87 |
+
try:
|
88 |
+
# Create Gemini-enabled instance for image processing
|
89 |
+
gemini_client = create_gemini_client_for_markitdown()
|
90 |
+
if gemini_client:
|
91 |
+
llm_instance = MarkItDown(llm_client=gemini_client, llm_model="gemini-2.5-flash")
|
92 |
+
result = llm_instance.convert(file_path_str)
|
93 |
+
else:
|
94 |
+
# No Gemini client available, use standard conversion
|
95 |
+
logger.info("Gemini client not available, using standard conversion for image")
|
96 |
+
result = self.markdown_instance.convert(file_path_str)
|
97 |
+
except Exception as llm_error:
|
98 |
+
logger.warning(f"Gemini image processing failed, falling back to basic conversion: {llm_error}")
|
99 |
+
result = self.markdown_instance.convert(file_path_str)
|
100 |
+
else:
|
101 |
+
# No Gemini wrapper available, use standard conversion
|
102 |
+
logger.info("Gemini wrapper not available, using standard conversion for image")
|
103 |
+
result = self.markdown_instance.convert(file_path_str)
|
104 |
+
else:
|
105 |
+
# For non-image files, use standard conversion
|
106 |
+
result = self.markdown_instance.convert(file_path_str)
|
107 |
+
|
108 |
+
result_container["result"] = result
|
109 |
+
result_container["completed"] = True
|
110 |
+
except Exception as e:
|
111 |
+
result_container["error"] = e
|
112 |
+
result_container["completed"] = True
|
113 |
+
|
114 |
+
# Start conversion in background thread
|
115 |
+
conversion_thread = threading.Thread(target=conversion_worker, daemon=True)
|
116 |
+
conversion_thread.start()
|
117 |
+
|
118 |
+
# Wait for completion or cancellation
|
119 |
+
while conversion_thread.is_alive():
|
120 |
+
if self._check_cancellation():
|
121 |
+
logger.info("MarkItDown conversion cancelled by user")
|
122 |
+
# Give thread a moment to finish cleanly
|
123 |
+
conversion_thread.join(timeout=0.1)
|
124 |
+
raise DocumentProcessingError("Conversion cancelled")
|
125 |
+
time.sleep(0.1) # Check every 100ms
|
126 |
+
|
127 |
+
# Ensure thread has completed
|
128 |
+
conversion_thread.join()
|
129 |
+
|
130 |
+
# Check for errors
|
131 |
+
if result_container["error"]:
|
132 |
+
raise result_container["error"]
|
133 |
+
|
134 |
+
result = result_container["result"]
|
135 |
+
if result is None:
|
136 |
+
raise DocumentProcessingError("MarkItDown conversion returned no result")
|
137 |
+
|
138 |
+
# Use the correct attribute - MarkItDown returns .text_content
|
139 |
+
if hasattr(result, 'text_content') and result.text_content:
|
140 |
+
return result.text_content
|
141 |
+
elif hasattr(result, 'markdown') and result.markdown:
|
142 |
+
return result.markdown
|
143 |
+
elif hasattr(result, 'content') and result.content:
|
144 |
+
return result.content
|
145 |
+
else:
|
146 |
+
# Fallback - convert result to string
|
147 |
+
content = str(result)
|
148 |
+
if content and content.strip():
|
149 |
+
return content
|
150 |
+
else:
|
151 |
+
raise DocumentProcessingError("MarkItDown conversion returned empty content")
|
152 |
|
153 |
+
except DocumentProcessingError:
|
154 |
+
# Re-raise cancellation errors
|
155 |
+
raise
|
156 |
except Exception as e:
|
157 |
logger.error(f"Error converting file with MarkItDown: {str(e)}")
|
158 |
raise DocumentProcessingError(f"MarkItDown conversion failed: {str(e)}")
|
159 |
|
160 |
@classmethod
|
161 |
def get_name(cls) -> str:
|
162 |
+
return "MarkItDown"
|
163 |
|
164 |
@classmethod
|
165 |
def get_supported_file_types(cls) -> Set[str]:
|
|
|
183 |
|
184 |
@classmethod
|
185 |
def get_description(cls) -> str:
|
186 |
+
return "MarkItDown parser for converting various file formats to Markdown. Uses Gemini Flash 2.5 for advanced image analysis."
|
187 |
|
188 |
|
189 |
# Register the parser with the registry if available
|
src/parsers/mistral_ocr_parser.py
CHANGED
@@ -32,7 +32,7 @@ class MistralOcrParser(DocumentParser):
|
|
32 |
|
33 |
@classmethod
|
34 |
def get_name(cls) -> str:
|
35 |
-
return "Mistral OCR
|
36 |
|
37 |
@classmethod
|
38 |
def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
|
|
|
32 |
|
33 |
@classmethod
|
34 |
def get_name(cls) -> str:
|
35 |
+
return "Mistral OCR"
|
36 |
|
37 |
@classmethod
|
38 |
def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
|
src/ui/components/document_converter.py
CHANGED
@@ -220,7 +220,7 @@ def create_document_converter_tab():
|
|
220 |
files_input = gr.Files(
|
221 |
label="Upload Document(s) - Single file or up to 5 files (20MB max combined)",
|
222 |
file_count="multiple",
|
223 |
-
file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp", ".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls", ".txt", ".md", ".html", ".htm"]
|
224 |
)
|
225 |
|
226 |
# Processing type selector (visible only for multiple files)
|
|
|
220 |
files_input = gr.Files(
|
221 |
label="Upload Document(s) - Single file or up to 5 files (20MB max combined)",
|
222 |
file_count="multiple",
|
223 |
+
file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp", ".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls", ".txt", ".md", ".html", ".htm", ".csv"]
|
224 |
)
|
225 |
|
226 |
# Processing type selector (visible only for multiple files)
|
test_gemini_wrapper.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Simple test script for Gemini wrapper functionality
|
4 |
+
"""
|
5 |
+
import sys
|
6 |
+
from pathlib import Path
|
7 |
+
|
8 |
+
# Add project root to path
|
9 |
+
sys.path.append(str(Path(__file__).parent))
|
10 |
+
|
11 |
+
def test_gemini_wrapper():
    """Check that the Gemini wrapper imports and has the expected structure.

    Needs no API key: it imports the wrapper's public names, builds a
    ``GeminiResponse`` and reads it back, then calls the client factory and
    prints whether it returned None.

    Returns:
        True if every step ran without raising, False otherwise (the error and
        its traceback are printed).
    """
    print("Testing Gemini wrapper structure...")

    try:
        # GeminiClientWrapper and GeminiChatCompletions are imported only to
        # prove that they are importable.
        from src.core.gemini_client_wrapper import (
            GeminiClientWrapper,
            GeminiChatCompletions,
            GeminiResponse,
            HAS_GEMINI,
            create_gemini_client_for_markitdown
        )
        print("✅ All classes imported successfully")
        print(f"✅ HAS_GEMINI: {HAS_GEMINI}")

        # Test response structure
        test_response = GeminiResponse("Test image description")
        print(f"✅ Response choices: {len(test_response.choices)}")
        print(f"✅ Message content: {test_response.choices[0].message.content}")

        # Test client creation (should fail gracefully without API key)
        client = create_gemini_client_for_markitdown()
        print(f"✅ Client creation (no API key): {client is None}")

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False

    return True
|
42 |
+
|
43 |
+
def test_markitdown_availability():
    """Check that MarkItDown can be imported and constructed without an LLM client.

    Returns:
        True if the import and ``MarkItDown()`` both succeed, False otherwise
        (the error is printed).
    """
    print("\nTesting MarkItDown availability...")

    try:
        from markitdown import MarkItDown
        print("✅ MarkItDown imported successfully")

        # Test basic initialization; the instance itself is not used further.
        md = MarkItDown()
        print("✅ MarkItDown initialized without LLM client")

    except Exception as e:
        print(f"❌ MarkItDown error: {e}")
        return False

    return True
|
60 |
+
|
61 |
+
def test_integration_structure():
    """Check that the Gemini wrapper and MarkItDown can both be imported together.

    This is a structural check only: nothing is converted and no API call is
    made. A missing Gemini dependency is reported as a warning, not a failure.

    Returns:
        True if both imports succeed, False otherwise (the error is printed).
    """
    print("\nTesting integration structure...")

    try:
        # Test that our wrapper can theoretically work with MarkItDown.
        # The imports are the test; the imported names are otherwise unused.
        from src.core.gemini_client_wrapper import GeminiClientWrapper, HAS_GEMINI
        from markitdown import MarkItDown

        print("✅ Both components available for integration")

        # Test interface compatibility (structure only)
        if HAS_GEMINI:
            print("✅ Gemini dependency available")
        else:
            print("⚠️ Gemini dependency not available")

        print("✅ Integration structure test passed")

    except Exception as e:
        print(f"❌ Integration error: {e}")
        return False

    return True
|
85 |
+
|
86 |
+
if __name__ == "__main__":
    print("=== Testing Gemini-MarkItDown Integration ===\n")

    # Run every check even after a failure so that all problems are reported.
    success = True
    success &= test_gemini_wrapper()
    success &= test_markitdown_availability()
    success &= test_integration_structure()

    print(f"\n=== Overall Result: {'✅ PASS' if success else '❌ FAIL'} ===")

    # Reflect the result in the exit status so shells and CI can detect failures.
    sys.exit(0 if success else 1)
|