AnseMin committed on
Commit
18e6067
·
1 Parent(s): 5e0609f

Refactor OCR configuration in DoclingParser to use EasyOCR exclusively

Browse files

- Simplified the OCR method configuration by removing Tesseract support and defaulting to EasyOCR.
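- Reduced the supported OCR methods list to a single entry (id `docling_default`, now named "EasyOCR"), removing both the conditional Tesseract entry and the separate `docling_easyocr` entry, enhancing clarity and maintainability.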
- Simplified logging: the Tesseract selection and fallback messages are removed, leaving a single "Using EasyOCR (CPU-only)" info message to indicate CPU-only processing.

Files changed (1) hide show
  1. src/parsers/docling_parser.py +7 -55
src/parsers/docling_parser.py CHANGED
@@ -183,21 +183,9 @@ class DoclingParser(DocumentParser):
183
  pipeline_options.do_table_structure = True
184
  pipeline_options.table_structure_options.do_cell_matching = True
185
 
186
- # Configure OCR method - prefer EasyOCR with CPU enforcement
187
- if ocr_method == "docling_tesseract":
188
- try:
189
- import subprocess
190
- subprocess.run(["tesseract", "--version"], capture_output=True, check=True)
191
- pipeline_options.ocr_options = TesseractOcrOptions()
192
- logger.info("Using Tesseract OCR (CPU-only)")
193
- except (FileNotFoundError, subprocess.CalledProcessError):
194
- logger.warning("Tesseract not available, falling back to EasyOCR")
195
- pipeline_options.ocr_options = EasyOcrOptions()
196
- logger.info("Using EasyOCR (CPU-only)")
197
- else:
198
- # Default to EasyOCR (including docling_easyocr and docling_default)
199
- pipeline_options.ocr_options = EasyOcrOptions()
200
- logger.info("Using EasyOCR (CPU-only)")
201
 
202
  # Configure advanced features
203
  pipeline_options.do_table_structure = kwargs.get('enable_tables', True)
@@ -238,13 +226,8 @@ class DoclingParser(DocumentParser):
238
  pipeline_options.do_table_structure = True
239
  pipeline_options.table_structure_options.do_cell_matching = True
240
 
241
- # Configure OCR method
242
- if ocr_method == "docling_tesseract":
243
- pipeline_options.ocr_options = TesseractOcrOptions()
244
- elif ocr_method == "docling_easyocr":
245
- pipeline_options.ocr_options = EasyOcrOptions()
246
- else: # Default to EasyOCR
247
- pipeline_options.ocr_options = EasyOcrOptions()
248
 
249
  # Configure advanced features
250
  pipeline_options.do_table_structure = kwargs.get('enable_tables', True)
@@ -308,21 +291,10 @@ class DoclingParser(DocumentParser):
308
  @classmethod
309
  def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
310
  """Return list of supported OCR methods."""
311
- methods = [
312
  {
313
  "id": "docling_default",
314
- "name": "Docling Default (EasyOCR)",
315
- "default_params": {
316
- "enable_tables": True,
317
- "enable_code_enrichment": False,
318
- "enable_formula_enrichment": False,
319
- "enable_picture_classification": False,
320
- "generate_picture_images": False
321
- }
322
- },
323
- {
324
- "id": "docling_easyocr",
325
- "name": "Docling EasyOCR",
326
  "default_params": {
327
  "enable_tables": True,
328
  "enable_code_enrichment": False,
@@ -332,26 +304,6 @@ class DoclingParser(DocumentParser):
332
  }
333
  }
334
  ]
335
-
336
- # Add Tesseract method if available (requires system installation)
337
- try:
338
- import subprocess
339
- subprocess.run(["tesseract", "--version"], capture_output=True, check=True)
340
- methods.append({
341
- "id": "docling_tesseract",
342
- "name": "Docling Tesseract OCR",
343
- "default_params": {
344
- "enable_tables": True,
345
- "enable_code_enrichment": False,
346
- "enable_formula_enrichment": False,
347
- "enable_picture_classification": False,
348
- "generate_picture_images": False
349
- }
350
- })
351
- except (FileNotFoundError, subprocess.CalledProcessError):
352
- logger.debug("Tesseract not available on system")
353
-
354
- return methods
355
 
356
  @classmethod
357
  def get_description(cls) -> str:
 
183
  pipeline_options.do_table_structure = True
184
  pipeline_options.table_structure_options.do_cell_matching = True
185
 
186
+ # Configure OCR method - use EasyOCR with CPU enforcement
187
+ pipeline_options.ocr_options = EasyOcrOptions()
188
+ logger.info("Using EasyOCR (CPU-only)")
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  # Configure advanced features
191
  pipeline_options.do_table_structure = kwargs.get('enable_tables', True)
 
226
  pipeline_options.do_table_structure = True
227
  pipeline_options.table_structure_options.do_cell_matching = True
228
 
229
+ # Configure OCR method - use EasyOCR
230
+ pipeline_options.ocr_options = EasyOcrOptions()
 
 
 
 
 
231
 
232
  # Configure advanced features
233
  pipeline_options.do_table_structure = kwargs.get('enable_tables', True)
 
291
  @classmethod
292
  def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
293
  """Return list of supported OCR methods."""
294
+ return [
295
  {
296
  "id": "docling_default",
297
+ "name": "EasyOCR",
 
 
 
 
 
 
 
 
 
 
 
298
  "default_params": {
299
  "enable_tables": True,
300
  "enable_code_enrichment": False,
 
304
  }
305
  }
306
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
 
308
  @classmethod
309
  def get_description(cls) -> str: