File size: 3,674 Bytes
dda982a
 
a773878
 
 
 
dda982a
 
 
 
 
a773878
 
 
 
 
 
 
 
 
 
 
 
dda982a
 
 
 
 
 
 
 
 
 
 
 
a773878
 
 
 
dda982a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a773878
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Optional, Any, Union, Set
import threading

from src.core.exceptions import ParserError, UnsupportedFileTypeError


class DocumentParser(ABC):
    """Base interface for all document parsers in the system.

    Concrete parsers must implement :meth:`parse`, :meth:`get_name` and
    :meth:`get_supported_ocr_methods`. The remaining methods provide
    overridable defaults (description, supported file types, availability)
    plus helpers for file validation, metadata and cancellation.

    Cancellation is cooperative: this base class only stores an optional
    ``threading.Event`` and reports whether it is set; acting on it is up
    to the code that calls :meth:`_check_cancellation`.
    """

    # Class-level default so `_check_cancellation` keeps working even when a
    # subclass overrides `__init__` without calling `super().__init__()`.
    _cancellation_flag: Optional[threading.Event] = None

    def __init__(self) -> None:
        """Initialize the parser with no cancellation flag attached."""
        self._cancellation_flag = None

    def set_cancellation_flag(self, flag: Optional[threading.Event]) -> None:
        """
        Set the cancellation flag for this parser.

        Args:
            flag: Event to query for cancellation, or ``None`` to detach
                any previously set flag.
        """
        self._cancellation_flag = flag

    def _check_cancellation(self) -> bool:
        """
        Check if cancellation has been requested.

        Returns:
            bool: True only when a flag is attached and it is set.
        """
        flag = self._cancellation_flag
        return flag is not None and flag.is_set()

    @abstractmethod
    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """
        Parse a document and return its content.

        Args:
            file_path: Path to the document
            ocr_method: OCR method to use (if applicable)
            **kwargs: Additional parser-specific options

        Returns:
            str: The parsed content

        Raises:
            ParserError: For general parsing errors
            UnsupportedFileTypeError: For unsupported file types
        """

    @classmethod
    @abstractmethod
    def get_name(cls) -> str:
        """Return the display name of this parser"""

    @classmethod
    @abstractmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """
        Return a list of supported OCR methods.

        Returns:
            List of dictionaries with keys:
                - id: Unique identifier for the OCR method
                - name: Display name for the OCR method
                - default_params: Default parameters for this OCR method
        """

    @classmethod
    def get_description(cls) -> str:
        """Return a description of this parser (derived from its name by default)."""
        return f"{cls.get_name()} document parser"

    @classmethod
    def get_supported_file_types(cls) -> Set[str]:
        """
        Return a set of supported file extensions (including the dot).

        Extensions should be lower-case: ``validate_file`` lower-cases the
        file's suffix before looking it up in this set.
        """
        return {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp"}

    @classmethod
    def is_available(cls) -> bool:
        """Check if this parser is available with current configuration."""
        return True

    def validate_file(self, file_path: Union[str, Path]) -> None:
        """
        Validate that the file can be processed by this parser.

        Args:
            file_path: Path to the file to validate

        Raises:
            UnsupportedFileTypeError: If file type is not supported
            ParserError: If the path does not exist or is not a regular file
        """
        path = Path(file_path)
        if not path.exists():
            raise ParserError(f"File not found: {file_path}")

        # `exists()` is also true for directories; a directory named e.g.
        # "scans.pdf" must not slip through the extension check below.
        if not path.is_file():
            raise ParserError(f"Path is not a file: {file_path}")

        # Case-insensitive extension match (".PDF" is treated like ".pdf").
        if path.suffix.lower() not in self.get_supported_file_types():
            raise UnsupportedFileTypeError(
                f"File type '{path.suffix}' not supported by {self.get_name()}"
            )

    def get_metadata(self) -> Dict[str, Any]:
        """
        Return metadata about this parser instance.

        Returns:
            Dictionary with keys ``name``, ``description``,
            ``supported_file_types`` (sorted list of extensions),
            ``supported_ocr_methods`` and ``available``.
        """
        return {
            "name": self.get_name(),
            "description": self.get_description(),
            # Sorted so the output is deterministic; plain set iteration
            # order for strings can differ between interpreter runs.
            "supported_file_types": sorted(self.get_supported_file_types()),
            "supported_ocr_methods": self.get_supported_ocr_methods(),
            "available": self.is_available(),
        }