AnseMin committed on
Commit
c61b4e2
·
1 Parent(s): d437733

Enhance Docling and Mistral OCR parsers with improved response handling and logging

Browse files

- Updated the `_format_batch_output` method in `DoclingParser` for better output formatting.
- Enhanced response text extraction logic in `DoclingParser` to handle various response structures from the Gemini API, including detailed logging for debugging.
- Removed unnecessary whitespace in `MistralOcrParser` for cleaner code.
- Improved error handling and logging for API errors in `DoclingParser` to facilitate troubleshooting.

src/parsers/docling_parser.py CHANGED
@@ -244,11 +244,11 @@ class DoclingParser(DocumentParser):
244
  def _format_batch_output(self, response_text: str, file_paths: List[Path], processing_type: str, original_filenames: Optional[List[str]] = None) -> str:
245
  names = original_filenames if original_filenames else [p.name for p in file_paths]
246
  header = (
247
- f"<!-- Multi-Document Processing Results -->\n"
248
- f"<!-- Processing Type: {processing_type} -->\n"
249
- f"<!-- Files Processed: {len(file_paths)} -->\n"
250
- f"<!-- File Names: {', '.join(names)} -->\n\n"
251
- )
252
  # Ensure response_text is a string to avoid TypeError when it is None
253
  safe_resp = "" if response_text is None else str(response_text)
254
  return header + safe_resp
@@ -322,7 +322,7 @@ class DoclingParser(DocumentParser):
322
  client = genai.Client(api_key=config.api.google_api_key)
323
  response = client.models.generate_content(
324
  model=config.model.gemini_model,
325
- contents=[prompt, combined_md],
326
  config={
327
  "temperature": config.model.temperature,
328
  "top_p": 0.95,
@@ -330,10 +330,34 @@ class DoclingParser(DocumentParser):
330
  "max_output_tokens": config.model.max_tokens,
331
  },
332
  )
333
- final_text = response.text if hasattr(response, "text") else None
334
- if final_text is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  raise DocumentProcessingError("Gemini post-processing returned no text")
 
336
  except Exception as e:
 
337
  raise DocumentProcessingError(f"Gemini post-processing failed: {str(e)}")
338
 
339
  return self._format_batch_output(final_text, paths, processing_type, original_filenames)
 
244
  def _format_batch_output(self, response_text: str, file_paths: List[Path], processing_type: str, original_filenames: Optional[List[str]] = None) -> str:
245
  names = original_filenames if original_filenames else [p.name for p in file_paths]
246
  header = (
247
+ f"<!-- Multi-Document Processing Results -->\n"
248
+ f"<!-- Processing Type: {processing_type} -->\n"
249
+ f"<!-- Files Processed: {len(file_paths)} -->\n"
250
+ f"<!-- File Names: {', '.join(names)} -->\n\n"
251
+ )
252
  # Ensure response_text is a string to avoid TypeError when it is None
253
  safe_resp = "" if response_text is None else str(response_text)
254
  return header + safe_resp
 
322
  client = genai.Client(api_key=config.api.google_api_key)
323
  response = client.models.generate_content(
324
  model=config.model.gemini_model,
325
+ contents=[prompt + "\n\n" + combined_md],
326
  config={
327
  "temperature": config.model.temperature,
328
  "top_p": 0.95,
 
330
  "max_output_tokens": config.model.max_tokens,
331
  },
332
  )
333
+
334
+ # Debug logging for response structure
335
+ logger.debug(f"Gemini response type: {type(response)}")
336
+ logger.debug(f"Gemini response attributes: {dir(response)}")
337
+
338
+ # Try different ways to extract text from response
339
+ final_text = None
340
+ if hasattr(response, "text") and response.text:
341
+ final_text = response.text
342
+ elif hasattr(response, "candidates") and response.candidates:
343
+ # Try to get text from first candidate
344
+ candidate = response.candidates[0]
345
+ if hasattr(candidate, "content") and candidate.content:
346
+ if hasattr(candidate.content, "parts") and candidate.content.parts:
347
+ final_text = candidate.content.parts[0].text
348
+ elif hasattr(candidate.content, "text"):
349
+ final_text = candidate.content.text
350
+ elif hasattr(candidate, "text"):
351
+ final_text = candidate.text
352
+ elif hasattr(response, "content") and response.content:
353
+ final_text = str(response.content)
354
+
355
+ if not final_text:
356
+ logger.error(f"No text found in Gemini response. Response: {response}")
357
  raise DocumentProcessingError("Gemini post-processing returned no text")
358
+
359
  except Exception as e:
360
+ logger.error(f"Gemini API error: {str(e)}")
361
  raise DocumentProcessingError(f"Gemini post-processing failed: {str(e)}")
362
 
363
  return self._format_batch_output(final_text, paths, processing_type, original_filenames)
src/parsers/mistral_ocr_parser.py CHANGED
@@ -357,8 +357,6 @@ class MistralOcrParser(DocumentParser):
357
 
358
  return markdown
359
 
360
-
361
-
362
  def _validate_batch_files(self, file_paths: List[Path]) -> None:
363
  """Validate batch of files for multi-document processing."""
364
  if len(file_paths) == 0:
 
357
 
358
  return markdown
359
 
 
 
360
  def _validate_batch_files(self, file_paths: List[Path]) -> None:
361
  """Validate batch of files for multi-document processing."""
362
  if len(file_paths) == 0: