Spaces:
Running
Running
Refactor OCR configuration in DoclingParser to use EasyOCR exclusively
Browse files
- Simplified the OCR method configuration by removing Tesseract support and defaulting to EasyOCR.
- Updated the supported OCR methods list to reflect the removal of Tesseract, enhancing clarity and maintainability.
- Improved logging to indicate the use of EasyOCR for CPU-only processing.
src/parsers/docling_parser.py
CHANGED
@@ -183,21 +183,9 @@ class DoclingParser(DocumentParser):
|
|
183 |
pipeline_options.do_table_structure = True
|
184 |
pipeline_options.table_structure_options.do_cell_matching = True
|
185 |
|
186 |
-
# Configure OCR method -
|
187 |
-
|
188 |
-
|
189 |
-
import subprocess
|
190 |
-
subprocess.run(["tesseract", "--version"], capture_output=True, check=True)
|
191 |
-
pipeline_options.ocr_options = TesseractOcrOptions()
|
192 |
-
logger.info("Using Tesseract OCR (CPU-only)")
|
193 |
-
except (FileNotFoundError, subprocess.CalledProcessError):
|
194 |
-
logger.warning("Tesseract not available, falling back to EasyOCR")
|
195 |
-
pipeline_options.ocr_options = EasyOcrOptions()
|
196 |
-
logger.info("Using EasyOCR (CPU-only)")
|
197 |
-
else:
|
198 |
-
# Default to EasyOCR (including docling_easyocr and docling_default)
|
199 |
-
pipeline_options.ocr_options = EasyOcrOptions()
|
200 |
-
logger.info("Using EasyOCR (CPU-only)")
|
201 |
|
202 |
# Configure advanced features
|
203 |
pipeline_options.do_table_structure = kwargs.get('enable_tables', True)
|
@@ -238,13 +226,8 @@ class DoclingParser(DocumentParser):
|
|
238 |
pipeline_options.do_table_structure = True
|
239 |
pipeline_options.table_structure_options.do_cell_matching = True
|
240 |
|
241 |
-
# Configure OCR method
|
242 |
-
|
243 |
-
pipeline_options.ocr_options = TesseractOcrOptions()
|
244 |
-
elif ocr_method == "docling_easyocr":
|
245 |
-
pipeline_options.ocr_options = EasyOcrOptions()
|
246 |
-
else: # Default to EasyOCR
|
247 |
-
pipeline_options.ocr_options = EasyOcrOptions()
|
248 |
|
249 |
# Configure advanced features
|
250 |
pipeline_options.do_table_structure = kwargs.get('enable_tables', True)
|
@@ -308,21 +291,10 @@ class DoclingParser(DocumentParser):
|
|
308 |
@classmethod
|
309 |
def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
|
310 |
"""Return list of supported OCR methods."""
|
311 |
-
|
312 |
{
|
313 |
"id": "docling_default",
|
314 |
-
"name": "
|
315 |
-
"default_params": {
|
316 |
-
"enable_tables": True,
|
317 |
-
"enable_code_enrichment": False,
|
318 |
-
"enable_formula_enrichment": False,
|
319 |
-
"enable_picture_classification": False,
|
320 |
-
"generate_picture_images": False
|
321 |
-
}
|
322 |
-
},
|
323 |
-
{
|
324 |
-
"id": "docling_easyocr",
|
325 |
-
"name": "Docling EasyOCR",
|
326 |
"default_params": {
|
327 |
"enable_tables": True,
|
328 |
"enable_code_enrichment": False,
|
@@ -332,26 +304,6 @@ class DoclingParser(DocumentParser):
|
|
332 |
}
|
333 |
}
|
334 |
]
|
335 |
-
|
336 |
-
# Add Tesseract method if available (requires system installation)
|
337 |
-
try:
|
338 |
-
import subprocess
|
339 |
-
subprocess.run(["tesseract", "--version"], capture_output=True, check=True)
|
340 |
-
methods.append({
|
341 |
-
"id": "docling_tesseract",
|
342 |
-
"name": "Docling Tesseract OCR",
|
343 |
-
"default_params": {
|
344 |
-
"enable_tables": True,
|
345 |
-
"enable_code_enrichment": False,
|
346 |
-
"enable_formula_enrichment": False,
|
347 |
-
"enable_picture_classification": False,
|
348 |
-
"generate_picture_images": False
|
349 |
-
}
|
350 |
-
})
|
351 |
-
except (FileNotFoundError, subprocess.CalledProcessError):
|
352 |
-
logger.debug("Tesseract not available on system")
|
353 |
-
|
354 |
-
return methods
|
355 |
|
356 |
@classmethod
|
357 |
def get_description(cls) -> str:
|
|
|
183 |
pipeline_options.do_table_structure = True
|
184 |
pipeline_options.table_structure_options.do_cell_matching = True
|
185 |
|
186 |
+
# Configure OCR method - use EasyOCR with CPU enforcement
|
187 |
+
pipeline_options.ocr_options = EasyOcrOptions()
|
188 |
+
logger.info("Using EasyOCR (CPU-only)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
# Configure advanced features
|
191 |
pipeline_options.do_table_structure = kwargs.get('enable_tables', True)
|
|
|
226 |
pipeline_options.do_table_structure = True
|
227 |
pipeline_options.table_structure_options.do_cell_matching = True
|
228 |
|
229 |
+
# Configure OCR method - use EasyOCR
|
230 |
+
pipeline_options.ocr_options = EasyOcrOptions()
|
|
|
|
|
|
|
|
|
|
|
231 |
|
232 |
# Configure advanced features
|
233 |
pipeline_options.do_table_structure = kwargs.get('enable_tables', True)
|
|
|
291 |
@classmethod
|
292 |
def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
|
293 |
"""Return list of supported OCR methods."""
|
294 |
+
return [
|
295 |
{
|
296 |
"id": "docling_default",
|
297 |
+
"name": "EasyOCR",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
298 |
"default_params": {
|
299 |
"enable_tables": True,
|
300 |
"enable_code_enrichment": False,
|
|
|
304 |
}
|
305 |
}
|
306 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
307 |
|
308 |
@classmethod
|
309 |
def get_description(cls) -> str:
|