Spaces:
Runtime error
Runtime error
Enhance Docling and Mistral OCR parsers with improved response handling and logging
Browse files

- Updated the `_format_batch_output` method in `DoclingParser` for better output formatting.
- Enhanced response text extraction logic in `DoclingParser` to handle various response structures from the Gemini API, including detailed logging for debugging.
- Removed unnecessary whitespace in `MistralOcrParser` for cleaner code.
- Improved error handling and logging for API errors in `DoclingParser` to facilitate troubleshooting.
src/parsers/docling_parser.py
CHANGED
@@ -244,11 +244,11 @@ class DoclingParser(DocumentParser):
|
|
244 |
def _format_batch_output(self, response_text: str, file_paths: List[Path], processing_type: str, original_filenames: Optional[List[str]] = None) -> str:
|
245 |
names = original_filenames if original_filenames else [p.name for p in file_paths]
|
246 |
header = (
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
# Ensure response_text is a string to avoid TypeError when it is None
|
253 |
safe_resp = "" if response_text is None else str(response_text)
|
254 |
return header + safe_resp
|
@@ -322,7 +322,7 @@ class DoclingParser(DocumentParser):
|
|
322 |
client = genai.Client(api_key=config.api.google_api_key)
|
323 |
response = client.models.generate_content(
|
324 |
model=config.model.gemini_model,
|
325 |
-
contents=[prompt
|
326 |
config={
|
327 |
"temperature": config.model.temperature,
|
328 |
"top_p": 0.95,
|
@@ -330,10 +330,34 @@ class DoclingParser(DocumentParser):
|
|
330 |
"max_output_tokens": config.model.max_tokens,
|
331 |
},
|
332 |
)
|
333 |
-
|
334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
335 |
raise DocumentProcessingError("Gemini post-processing returned no text")
|
|
|
336 |
except Exception as e:
|
|
|
337 |
raise DocumentProcessingError(f"Gemini post-processing failed: {str(e)}")
|
338 |
|
339 |
return self._format_batch_output(final_text, paths, processing_type, original_filenames)
|
|
|
244 |
def _format_batch_output(self, response_text: str, file_paths: List[Path], processing_type: str, original_filenames: Optional[List[str]] = None) -> str:
|
245 |
names = original_filenames if original_filenames else [p.name for p in file_paths]
|
246 |
header = (
|
247 |
+
f"<!-- Multi-Document Processing Results -->\n"
|
248 |
+
f"<!-- Processing Type: {processing_type} -->\n"
|
249 |
+
f"<!-- Files Processed: {len(file_paths)} -->\n"
|
250 |
+
f"<!-- File Names: {', '.join(names)} -->\n\n"
|
251 |
+
)
|
252 |
# Ensure response_text is a string to avoid TypeError when it is None
|
253 |
safe_resp = "" if response_text is None else str(response_text)
|
254 |
return header + safe_resp
|
|
|
322 |
client = genai.Client(api_key=config.api.google_api_key)
|
323 |
response = client.models.generate_content(
|
324 |
model=config.model.gemini_model,
|
325 |
+
contents=[prompt + "\n\n" + combined_md],
|
326 |
config={
|
327 |
"temperature": config.model.temperature,
|
328 |
"top_p": 0.95,
|
|
|
330 |
"max_output_tokens": config.model.max_tokens,
|
331 |
},
|
332 |
)
|
333 |
+
|
334 |
+
# Debug logging for response structure
|
335 |
+
logger.debug(f"Gemini response type: {type(response)}")
|
336 |
+
logger.debug(f"Gemini response attributes: {dir(response)}")
|
337 |
+
|
338 |
+
# Try different ways to extract text from response
|
339 |
+
final_text = None
|
340 |
+
if hasattr(response, "text") and response.text:
|
341 |
+
final_text = response.text
|
342 |
+
elif hasattr(response, "candidates") and response.candidates:
|
343 |
+
# Try to get text from first candidate
|
344 |
+
candidate = response.candidates[0]
|
345 |
+
if hasattr(candidate, "content") and candidate.content:
|
346 |
+
if hasattr(candidate.content, "parts") and candidate.content.parts:
|
347 |
+
final_text = candidate.content.parts[0].text
|
348 |
+
elif hasattr(candidate.content, "text"):
|
349 |
+
final_text = candidate.content.text
|
350 |
+
elif hasattr(candidate, "text"):
|
351 |
+
final_text = candidate.text
|
352 |
+
elif hasattr(response, "content") and response.content:
|
353 |
+
final_text = str(response.content)
|
354 |
+
|
355 |
+
if not final_text:
|
356 |
+
logger.error(f"No text found in Gemini response. Response: {response}")
|
357 |
raise DocumentProcessingError("Gemini post-processing returned no text")
|
358 |
+
|
359 |
except Exception as e:
|
360 |
+
logger.error(f"Gemini API error: {str(e)}")
|
361 |
raise DocumentProcessingError(f"Gemini post-processing failed: {str(e)}")
|
362 |
|
363 |
return self._format_batch_output(final_text, paths, processing_type, original_filenames)
|
src/parsers/mistral_ocr_parser.py
CHANGED
@@ -357,8 +357,6 @@ class MistralOcrParser(DocumentParser):
|
|
357 |
|
358 |
return markdown
|
359 |
|
360 |
-
|
361 |
-
|
362 |
def _validate_batch_files(self, file_paths: List[Path]) -> None:
|
363 |
"""Validate batch of files for multi-document processing."""
|
364 |
if len(file_paths) == 0:
|
|
|
357 |
|
358 |
return markdown
|
359 |
|
|
|
|
|
360 |
def _validate_batch_files(self, file_paths: List[Path]) -> None:
|
361 |
"""Validate batch of files for multi-document processing."""
|
362 |
if len(file_paths) == 0:
|