AnseMin committed on
Commit
d66e90c
·
1 Parent(s): 9123559

Implement ZeroGPU support in DoclingParser for enhanced document processing

Browse files

- Added support for GPU processing using the ZeroGPU framework, allowing for accelerated document conversion.
- Introduced methods for CPU-only processing and fallback mechanisms to ensure robust performance.
- Updated the initialization process to defer converter creation until needed, preventing CUDA issues.
- Enhanced error handling and logging for better debugging and user feedback during document conversion.

Files changed (1) hide show
  1. src/parsers/docling_parser.py +147 -32
src/parsers/docling_parser.py CHANGED
@@ -1,3 +1,10 @@
 
 
 
 
 
 
 
1
  import logging
2
  import os
3
  from pathlib import Path
@@ -16,6 +23,7 @@ try:
16
  from docling.datamodel.base_models import InputFormat
17
  from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions, TesseractOcrOptions
18
  from docling.document_converter import PdfFormatOption
 
19
  HAS_DOCLING = True
20
  except ImportError:
21
  HAS_DOCLING = False
@@ -42,16 +50,11 @@ class DoclingParser(DocumentParser):
42
  def __init__(self):
43
  super().__init__() # Initialize the base class (including _cancellation_flag)
44
  self.converter = None
 
45
 
46
- # Initialize Docling converter
47
- if HAS_DOCLING:
48
- try:
49
- # Create default converter instance
50
- self.converter = DocumentConverter()
51
- logger.info("Docling initialized successfully")
52
- except Exception as e:
53
- logger.error(f"Error initializing Docling: {str(e)}")
54
- self.converter = None
55
 
56
  def _create_converter_with_options(self, ocr_method: str, **kwargs) -> DocumentConverter:
57
  """Create a DocumentConverter with specific OCR options."""
@@ -100,7 +103,7 @@ class DoclingParser(DocumentParser):
100
  self.validate_file(file_path)
101
 
102
  # Check if Docling is available
103
- if not HAS_DOCLING or self.converter is None:
104
  raise ParserError("Docling is not available. Please install with 'pip install docling'")
105
 
106
  # Check for cancellation before starting
@@ -108,27 +111,145 @@ class DoclingParser(DocumentParser):
108
  raise DocumentProcessingError("Conversion cancelled")
109
 
110
  try:
111
- # Use method-specific converter if OCR method is specified
112
- if ocr_method and ocr_method != "docling_default":
113
- converter = self._create_converter_with_options(ocr_method, **kwargs)
114
- else:
115
- converter = self.converter
 
 
 
 
116
 
117
- # Convert the document
118
- result = converter.convert(str(file_path))
 
119
 
120
- # Check for cancellation after processing
121
- if self._check_cancellation():
122
- raise DocumentProcessingError("Conversion cancelled")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
  # Export to markdown
125
  markdown_content = result.document.export_to_markdown()
126
 
127
- return markdown_content
 
 
 
128
 
129
- except Exception as e:
130
- logger.error(f"Error converting file with Docling: {str(e)}")
131
- raise DocumentProcessingError(f"Docling conversion failed: {str(e)}")
 
 
 
132
 
133
  @classmethod
134
  def get_name(cls) -> str:
@@ -258,14 +379,8 @@ class DoclingParser(DocumentParser):
258
  if self._check_cancellation():
259
  raise DocumentProcessingError("Conversion cancelled")
260
 
261
- # Select converter (respecting OCR method if set)
262
- if ocr_method and ocr_method != "docling_default":
263
- converter = self._create_converter_with_options(ocr_method, **kwargs)
264
- else:
265
- converter = self.converter
266
-
267
- if converter is None:
268
- raise DocumentProcessingError("Docling converter not initialized")
269
 
270
  # Convert all docs
271
  from docling.datamodel.base_models import ConversionStatus
 
1
+ # Import spaces module for ZeroGPU support - Must be first import
2
+ try:
3
+ import spaces
4
+ HAS_SPACES = True
5
+ except ImportError:
6
+ HAS_SPACES = False
7
+
8
  import logging
9
  import os
10
  from pathlib import Path
 
23
  from docling.datamodel.base_models import InputFormat
24
  from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions, TesseractOcrOptions
25
  from docling.document_converter import PdfFormatOption
26
+ from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
27
  HAS_DOCLING = True
28
  except ImportError:
29
  HAS_DOCLING = False
 
50
  def __init__(self):
51
  super().__init__() # Initialize the base class (including _cancellation_flag)
52
  self.converter = None
53
+ self.gpu_converter = None
54
 
55
+ # Don't initialize converters here to avoid CUDA issues
56
+ # They will be created on-demand in the parse methods
57
+ logger.info("Docling parser initialized (converters will be created on-demand)")
 
 
 
 
 
 
58
 
59
  def _create_converter_with_options(self, ocr_method: str, **kwargs) -> DocumentConverter:
60
  """Create a DocumentConverter with specific OCR options."""
 
103
  self.validate_file(file_path)
104
 
105
  # Check if Docling is available
106
+ if not HAS_DOCLING:
107
  raise ParserError("Docling is not available. Please install with 'pip install docling'")
108
 
109
  # Check for cancellation before starting
 
111
  raise DocumentProcessingError("Conversion cancelled")
112
 
113
  try:
114
+ # Try ZeroGPU first, fallback to CPU
115
+ if HAS_SPACES:
116
+ try:
117
+ logger.info("Attempting Docling processing with ZeroGPU")
118
+ result = self._process_with_gpu(str(file_path), ocr_method, **kwargs)
119
+ return result
120
+ except Exception as e:
121
+ logger.warning(f"ZeroGPU processing failed: {str(e)}")
122
+ logger.info("Falling back to CPU processing")
123
 
124
+ # Fallback to CPU processing
125
+ result = self._process_with_cpu(str(file_path), ocr_method, **kwargs)
126
+ return result
127
 
128
+ except Exception as e:
129
+ logger.error(f"Error converting file with Docling: {str(e)}")
130
+ raise DocumentProcessingError(f"Docling conversion failed: {str(e)}")
131
+
132
def _process_with_cpu(self, file_path: str, ocr_method: Optional[str] = None, **kwargs) -> str:
    """Convert a document to markdown using a CPU-only Docling converter.

    The converter is built lazily and cached on ``self.converter``. The
    cache is keyed on the requested configuration so that a later call
    with a different ``ocr_method`` or different feature flags rebuilds
    the converter instead of silently reusing the first configuration.

    Args:
        file_path: Path of the document to convert, as a string.
        ocr_method: OCR backend identifier forwarded to
            ``_create_cpu_converter``.
        **kwargs: Feature flags forwarded to ``_create_cpu_converter``.

    Returns:
        The converted document exported as markdown.

    Raises:
        DocumentProcessingError: If cancellation is detected after the
            conversion finished.
    """
    logger.info("Processing with CPU-only Docling converter")

    # repr() keeps the key hashable even when a kwarg value is not.
    # Kwarg names are unique, so sorting never compares the values.
    config_key = (
        ocr_method,
        tuple(sorted((name, repr(value)) for name, value in kwargs.items())),
    )
    cached_key = getattr(self, "_cpu_converter_key", None)

    # Rebuild when nothing is cached yet, or when the cached converter was
    # built by this method for a different configuration. A converter that
    # was assigned externally (no recorded key) is reused as before.
    stale = cached_key is not None and cached_key != config_key
    if self.converter is None or stale:
        self.converter = self._create_cpu_converter(ocr_method, **kwargs)
        self._cpu_converter_key = config_key

    # Convert the document
    result = self.converter.convert(file_path)

    # Check for cancellation after processing
    if self._check_cancellation():
        raise DocumentProcessingError("Conversion cancelled")

    # Export to markdown
    return result.document.export_to_markdown()
149
+
150
def _create_cpu_converter(self, ocr_method: Optional[str] = None, **kwargs) -> DocumentConverter:
    """Build a DocumentConverter whose PDF pipeline is pinned to the CPU.

    Args:
        ocr_method: ``"docling_tesseract"`` selects Tesseract OCR; any
            other value (including ``None``) selects EasyOCR.
        **kwargs: Optional feature flags: ``enable_tables`` (default
            True), ``enable_code_enrichment``,
            ``enable_formula_enrichment``,
            ``enable_picture_classification`` and
            ``generate_picture_images`` (all default False).

    Returns:
        A DocumentConverter configured for PDF input with these options.
    """
    # CPU-only accelerator with a fixed thread count.
    cpu_accelerator = AcceleratorOptions(
        num_threads=4,
        device=AcceleratorDevice.CPU,
    )

    pdf_options = PdfPipelineOptions()
    pdf_options.accelerator_options = cpu_accelerator
    pdf_options.do_ocr = True
    pdf_options.do_table_structure = True
    pdf_options.table_structure_options.do_cell_matching = True

    # Only Tesseract is special-cased; everything else falls back to EasyOCR.
    wants_tesseract = ocr_method == "docling_tesseract"
    pdf_options.ocr_options = TesseractOcrOptions() if wants_tesseract else EasyOcrOptions()

    # (pipeline attribute, kwarg name, default) for each optional feature.
    feature_flags = (
        ("do_table_structure", "enable_tables", True),
        ("do_code_enrichment", "enable_code_enrichment", False),
        ("do_formula_enrichment", "enable_formula_enrichment", False),
        ("do_picture_classification", "enable_picture_classification", False),
        ("generate_picture_images", "generate_picture_images", False),
    )
    for attribute, kwarg_name, default in feature_flags:
        setattr(pdf_options, attribute, kwargs.get(kwarg_name, default))

    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
    return DocumentConverter(format_options={InputFormat.PDF: pdf_format})
188
+
189
+ # Define the GPU-decorated function for ZeroGPU
190
if HAS_SPACES:
    @spaces.GPU(duration=120)  # Allocate GPU for up to 2 minutes
    def _process_with_gpu(self, file_path: str, ocr_method: Optional[str] = None, **kwargs) -> str:
        """Convert a document to markdown with a CUDA-configured Docling converter.

        IMPORTANT: All model loading and CUDA operations must happen inside
        this method, so the converter is created here on every call and is
        never cached on the instance.

        Args:
            file_path: Path of the document to convert, as a string.
            ocr_method: ``"docling_tesseract"`` selects Tesseract OCR; any
                other value (including ``None``) selects EasyOCR.
            **kwargs: Optional feature flags: ``enable_tables`` (default
                True), ``enable_code_enrichment``,
                ``enable_formula_enrichment``,
                ``enable_picture_classification`` and
                ``generate_picture_images`` (all default False).

        Returns:
            The converted document exported as markdown.
        """
        logger.info("Processing with ZeroGPU allocation")

        # Configure GPU accelerator
        accelerator_options = AcceleratorOptions(
            num_threads=4,
            device=AcceleratorDevice.CUDA,
        )

        # Create pipeline options with GPU accelerator
        pipeline_options = PdfPipelineOptions()
        pipeline_options.accelerator_options = accelerator_options
        pipeline_options.do_ocr = True
        pipeline_options.table_structure_options.do_cell_matching = True

        # Only Tesseract is special-cased; everything else uses EasyOCR.
        if ocr_method == "docling_tesseract":
            pipeline_options.ocr_options = TesseractOcrOptions()
        else:
            pipeline_options.ocr_options = EasyOcrOptions()

        # Configure advanced features
        pipeline_options.do_table_structure = kwargs.get('enable_tables', True)
        pipeline_options.do_code_enrichment = kwargs.get('enable_code_enrichment', False)
        pipeline_options.do_formula_enrichment = kwargs.get('enable_formula_enrichment', False)
        pipeline_options.do_picture_classification = kwargs.get('enable_picture_classification', False)
        pipeline_options.generate_picture_images = kwargs.get('generate_picture_images', False)

        # Create converter with GPU configuration inside the decorated function
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                )
            }
        )

        try:
            # Convert the document and export to markdown
            result = converter.convert(file_path)
            return result.document.export_to_markdown()
        finally:
            # Release the converter on failure as well as on success, so a
            # failed conversion does not skip the explicit cleanup.
            del converter
            import gc
            gc.collect()
else:
    # Define a dummy method if spaces is not available
    def _process_with_gpu(self, file_path: str, ocr_method: Optional[str] = None, **kwargs) -> str:
        """Fallback used when ``spaces`` is not installed: delegate to the CPU path."""
        # This should never be called if HAS_SPACES is False
        return self._process_with_cpu(file_path, ocr_method, **kwargs)
253
 
254
  @classmethod
255
  def get_name(cls) -> str:
 
379
  if self._check_cancellation():
380
  raise DocumentProcessingError("Conversion cancelled")
381
 
382
+ # Create CPU converter for batch processing (GPU not supported for batch yet)
383
+ converter = self._create_cpu_converter(ocr_method, **kwargs)
 
 
 
 
 
 
384
 
385
  # Convert all docs
386
  from docling.datamodel.base_models import ConversionStatus