AnseMin committed on
Commit
c61b4e2
·
1 Parent(s): d437733

Enhance Docling and Mistral OCR parsers with improved response handling and logging

Browse files

- Updated the `_format_batch_output` method in `DoclingParser` for better output formatting.
- Enhanced response text extraction logic in `DoclingParser` to handle various response structures from the Gemini API, including detailed logging for debugging.
- Removed unnecessary whitespace in `MistralOcrParser` for cleaner code.
- Improved error handling and logging for API errors in `DoclingParser` to facilitate troubleshooting.

src/parsers/docling_parser.py CHANGED
@@ -244,11 +244,11 @@ class DoclingParser(DocumentParser):
244
  def _format_batch_output(self, response_text: str, file_paths: List[Path], processing_type: str, original_filenames: Optional[List[str]] = None) -> str:
245
  names = original_filenames if original_filenames else [p.name for p in file_paths]
246
  header = (
247
- f"<!-- Multi-Document Processing Results -->\n"
248
- f"<!-- Processing Type: {processing_type} -->\n"
249
- f"<!-- Files Processed: {len(file_paths)} -->\n"
250
- f"<!-- File Names: {', '.join(names)} -->\n\n"
251
- )
252
  # Ensure response_text is a string to avoid TypeError when it is None
253
  safe_resp = "" if response_text is None else str(response_text)
254
  return header + safe_resp
@@ -322,7 +322,7 @@ class DoclingParser(DocumentParser):
322
  client = genai.Client(api_key=config.api.google_api_key)
323
  response = client.models.generate_content(
324
  model=config.model.gemini_model,
325
- contents=[prompt, combined_md],
326
  config={
327
  "temperature": config.model.temperature,
328
  "top_p": 0.95,
@@ -330,10 +330,34 @@ class DoclingParser(DocumentParser):
330
  "max_output_tokens": config.model.max_tokens,
331
  },
332
  )
333
- final_text = response.text if hasattr(response, "text") else None
334
- if final_text is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  raise DocumentProcessingError("Gemini post-processing returned no text")
 
336
  except Exception as e:
 
337
  raise DocumentProcessingError(f"Gemini post-processing failed: {str(e)}")
338
 
339
  return self._format_batch_output(final_text, paths, processing_type, original_filenames)
 
244
  def _format_batch_output(self, response_text: str, file_paths: List[Path], processing_type: str, original_filenames: Optional[List[str]] = None) -> str:
245
  names = original_filenames if original_filenames else [p.name for p in file_paths]
246
  header = (
247
+ f"<!-- Multi-Document Processing Results -->\n"
248
+ f"<!-- Processing Type: {processing_type} -->\n"
249
+ f"<!-- Files Processed: {len(file_paths)} -->\n"
250
+ f"<!-- File Names: {', '.join(names)} -->\n\n"
251
+ )
252
  # Ensure response_text is a string to avoid TypeError when it is None
253
  safe_resp = "" if response_text is None else str(response_text)
254
  return header + safe_resp
 
322
  client = genai.Client(api_key=config.api.google_api_key)
323
  response = client.models.generate_content(
324
  model=config.model.gemini_model,
325
+ contents=[prompt + "\n\n" + combined_md],
326
  config={
327
  "temperature": config.model.temperature,
328
  "top_p": 0.95,
 
330
  "max_output_tokens": config.model.max_tokens,
331
  },
332
  )
333
+
334
+ # Debug logging for response structure
335
+ logger.debug(f"Gemini response type: {type(response)}")
336
+ logger.debug(f"Gemini response attributes: {dir(response)}")
337
+
338
+ # Try different ways to extract text from response
339
+ final_text = None
340
+ if hasattr(response, "text") and response.text:
341
+ final_text = response.text
342
+ elif hasattr(response, "candidates") and response.candidates:
343
+ # Try to get text from first candidate
344
+ candidate = response.candidates[0]
345
+ if hasattr(candidate, "content") and candidate.content:
346
+ if hasattr(candidate.content, "parts") and candidate.content.parts:
347
+ final_text = candidate.content.parts[0].text
348
+ elif hasattr(candidate.content, "text"):
349
+ final_text = candidate.content.text
350
+ elif hasattr(candidate, "text"):
351
+ final_text = candidate.text
352
+ elif hasattr(response, "content") and response.content:
353
+ final_text = str(response.content)
354
+
355
+ if not final_text:
356
+ logger.error(f"No text found in Gemini response. Response: {response}")
357
  raise DocumentProcessingError("Gemini post-processing returned no text")
358
+
359
  except Exception as e:
360
+ logger.error(f"Gemini API error: {str(e)}")
361
  raise DocumentProcessingError(f"Gemini post-processing failed: {str(e)}")
362
 
363
  return self._format_batch_output(final_text, paths, processing_type, original_filenames)
src/parsers/mistral_ocr_parser.py CHANGED
@@ -357,8 +357,6 @@ class MistralOcrParser(DocumentParser):
357
 
358
  return markdown
359
 
360
-
361
-
362
  def _validate_batch_files(self, file_paths: List[Path]) -> None:
363
  """Validate batch of files for multi-document processing."""
364
  if len(file_paths) == 0:
 
357
 
358
  return markdown
359
 
 
 
360
  def _validate_batch_files(self, file_paths: List[Path]) -> None:
361
  """Validate batch of files for multi-document processing."""
362
  if len(file_paths) == 0: