# FAA document style checker (non-code viewer header lines removed).
# Standard library imports | |
import io | |
import os | |
import re | |
import json | |
import time | |
import textwrap # Added textwrap import | |
import logging | |
import traceback | |
from datetime import datetime | |
from enum import Enum, auto | |
from typing import Dict, List, Any, Tuple, Optional, Pattern | |
from dataclasses import dataclass | |
from functools import wraps | |
from abc import ABC, abstractmethod | |
# Third-party imports | |
import gradio as gr | |
from docx import Document | |
from colorama import init, Fore, Style | |
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Defaults for the Gradio app binding and for logger configuration.
DEFAULT_PORT = 7860
DEFAULT_HOST = "0.0.0.0"
DEFAULT_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
DEFAULT_LOG_LEVEL = "INFO"

# Document Type Constants: the FAA publication types the checker understands.
DOCUMENT_TYPES = [
    "Advisory Circular",
    "Airworthiness Criteria",
    "Deviation Memo",
    "Exemption",
    "Federal Register Notice",
    "Order",
    "Policy Statement",
    "Rule",
    "Special Condition",
    "Technical Standard Order",
    "Other"
]
# NOTE(review): both labels end in "template AC" — confirm these are the
# intended template display names.
TEMPLATE_TYPES = ["Short AC template AC", "Long AC template AC"]

# Heading Word Constants: uppercase words that mark a line as a heading so
# the acronym check does not flag them as undefined acronyms.
HEADING_WORDS = {
    'INFORMATION', 'GENERAL', 'SUMMARY', 'INTRODUCTION', 'BACKGROUND',
    'DISCUSSION', 'CONCLUSION', 'APPENDIX', 'CHAPTER', 'SECTION',
    'PURPOSE', 'APPLICABILITY', 'CANCELLATION', 'DEFINITION', 'REQUIREMENTS',
    'AUTHORITY', 'POLICY', 'SCOPE', 'RELATED', 'MATERIAL', 'DISTRIBUTION',
    'EXPLANATION', 'PROCEDURES', 'NOTE', 'WARNING', 'CAUTION', 'EXCEPTION',
    'GROUPS', 'PARTS', 'TABLE', 'FIGURE', 'REFERENCES', 'DEFINITIONS'
}
# Predefined Acronyms: abbreviations that never need an in-document definition.
PREDEFINED_ACRONYMS = {
    'CFR', 'U.S.', 'USA', 'US', 'U.S.C', 'e.g.', 'i.e.', 'FAQ', 'No.', 'ZIP', 'PDF', 'SSN',
    'DC', 'MA', 'WA', 'TX', 'MO'
}
# Configuration Constants: keys a loaded configuration must provide
# (mirrored as class attributes on DocumentCheckerConfig).
REQUIRED_CONFIG_KEYS = {'logging', 'checks', 'document_types'}
REQUIRED_LOGGING_KEYS = {'level', 'format'}
REQUIRED_CHECKS_KEYS = {'acronyms', 'terminology_check', 'headings'}
# Document Type Period Requirements: True when headings must end with a
# period, False when they must not.
PERIOD_REQUIRED = {
    "Advisory Circular": True,
    "Airworthiness Criteria": False,
    "Deviation Memo": False,
    "Exemption": False,
    "Federal Register Notice": False,
    "Order": True,
    "Policy Statement": False,
    "Rule": False,
    "Special Condition": False,
    "Technical Standard Order": True,
    "Other": False
}
# Document formatting rules: how referenced document titles must be styled
# (italics vs. quotes vs. plain) for each document type.
DOCUMENT_FORMATTING_RULES = {
    "italics_only": {
        "types": ["Advisory Circular"],
        "italics": True,
        "quotes": False,
        "description": "For Advisory Circulars, referenced document titles should be italicized but not quoted",
        "example": "See AC 20-135, *Powerplant Installation and Propulsion System Component Fire Protection Test Methods, Standards, and Criteria* for information on X."
    },
    "quotes_only": {
        "types": [
            "Airworthiness Criteria", "Deviation Memo", "Exemption",
            "Federal Register Notice", "Order", "Rule", "Special Condition",
            "Technical Standard Order"
        ],
        "italics": False,
        "quotes": True,
        "description": "For this document type, referenced document titles should be in quotes without italics",
        "example": 'See AC 20-135, "Powerplant Installation and Propulsion System Component Fire Protection Test Methods, Standards, and Criteria" for information on X.'
    },
    "no_formatting": {
        "types": ["Policy Statement", "Other"],
        "italics": False,
        "quotes": False,
        "description": "For this document type, referenced document titles should not use italics or quotes",
        "example": "See AC 20-135, Powerplant Installation and Propulsion System Component Fire Protection Test Methods, Standards, and Criteria for information on X."
    }
}
# 1. Base Exception Classes | |
class DocumentCheckError(Exception):
    """Root of the document-checker exception hierarchy.

    Catching this single type traps every error this module raises
    deliberately.
    """
class ConfigurationError(DocumentCheckError):
    """Signals an invalid or incomplete checker configuration."""
class DocumentTypeError(DocumentCheckError):
    """Signals an unknown or unsupported document type."""
# 2. Configuration Classes | |
@dataclass
class PatternConfig:
    """Configuration for a single regex-based document check.

    Attributes:
        pattern: Regular-expression source string.
        description: Human-readable explanation of what a match means.
        is_error: Whether a match counts as an error.
        replacement: Suggested replacement text, if any.

    The @dataclass decorator was missing: the class only had bare
    annotations, so the keyword instantiations in ``_setup_patterns``
    (``PatternConfig(pattern=..., ...)``) raised TypeError.
    """

    pattern: str
    description: str
    is_error: bool
    replacement: Optional[str] = None

    def compile(self) -> Pattern:
        """Compile ``pattern``; raise ConfigurationError if it is invalid."""
        try:
            return re.compile(self.pattern)
        except re.error as e:
            # Surface a bad pattern as a domain-level configuration error.
            raise ConfigurationError(f"Invalid pattern '{self.pattern}': {e}") from e
class DocumentType(Enum):
    """Enumeration of supported document types."""

    ADVISORY_CIRCULAR = auto()
    AIRWORTHINESS_CRITERIA = auto()
    DEVIATION_MEMO = auto()
    EXEMPTION = auto()
    FEDERAL_REGISTER_NOTICE = auto()
    ORDER = auto()
    POLICY_STATEMENT = auto()
    RULE = auto()
    SPECIAL_CONDITION = auto()
    TECHNICAL_STANDARD_ORDER = auto()
    OTHER = auto()

    # @classmethod was missing: the method took ``cls`` but could only be
    # reached through an Enum member, binding the member itself to ``cls``.
    @classmethod
    def from_string(cls, doc_type: str) -> 'DocumentType':
        """Convert a display string (e.g. "Advisory Circular") to a member.

        Raises:
            DocumentTypeError: when the string names no known member.
        """
        try:
            return cls[doc_type.upper().replace(" ", "_")]
        except KeyError:
            raise DocumentTypeError(f"Unsupported document type: {doc_type}")
# 4. Utility Classes | |
class TextNormalization:
    """Text normalization utilities (stateless; all methods are static).

    Both methods lacked @staticmethod: calling them on an instance would
    have bound the instance to the text parameter.
    """

    @staticmethod
    def normalize_heading(text: str) -> str:
        """Normalize heading text for consistent comparison."""
        # Collapse internal runs of whitespace to single spaces.
        text = ' '.join(text.split())
        # Normalize periods (convert multiple trailing periods to one).
        text = re.sub(r'\.+$', '.', text.strip())
        # Remove any whitespace before the final period.
        text = re.sub(r'\s+\.$', '.', text)
        return text

    @staticmethod
    def normalize_document_type(doc_type: str) -> str:
        """Normalize a document type string to Title Case with single spaces."""
        return ' '.join(word.capitalize() for word in doc_type.lower().split())
# 5. Result Class | |
@dataclass
class DocumentCheckResult:
    """Structured result for document checks.

    The @dataclass decorator was missing: every checker constructs this with
    ``DocumentCheckResult(success=..., issues=..., details=...)``, which
    raised TypeError on the bare annotation-only class.

    Attributes:
        success: True when the check found no issues.
        issues: One dict per problem found (shape varies per check).
        details: Optional check-specific metadata for reporting/debugging.
    """

    success: bool
    issues: List[Dict[str, Any]]
    details: Optional[Dict[str, Any]] = None
# 6. Base Document Checker | |
class DocumentChecker:
    """Base class for document checking with configuration and logging.

    Subclasses implement the concrete checks; this base wires up the shared
    configuration manager / logger and provides document-loading helpers.
    """

    def __init__(self, config_path: Optional[str] = None):
        """
        Initialize DocumentChecker with optional configuration.

        Args:
            config_path (str, optional): Path to configuration file.
        """
        self.config_manager = DocumentCheckerConfig(config_path)
        self.logger = self.config_manager.logger

    # @classmethod was missing although the signature already took ``cls``;
    # calling via the class would have bound the path argument to ``cls``.
    @classmethod
    def extract_paragraphs(cls, doc_path: str) -> List[str]:
        """
        Extract plain text paragraphs from a .docx document.

        Args:
            doc_path (str): Path to the document.

        Returns:
            List[str]: Non-empty paragraph texts; empty list on any error.
        """
        try:
            doc = Document(doc_path)
            return [para.text for para in doc.paragraphs if para.text.strip()]
        except Exception as e:
            # Best-effort loader: log the failure and return an empty document.
            logging.error(f"Error extracting paragraphs: {e}")
            return []

    # @staticmethod is required: the checkers call self.validate_input(doc),
    # which previously raised TypeError (self was bound to ``doc`` and the
    # real document became an unexpected extra argument).
    @staticmethod
    def validate_input(doc: List[str]) -> bool:
        """
        Validate input document.

        Args:
            doc (List[str]): List of paragraphs.

        Returns:
            bool: True when *doc* is a non-empty list.
        """
        return doc is not None and isinstance(doc, list) and len(doc) > 0
# 7. Configuration Manager | |
class DocumentCheckerConfig:
    """Configuration management for document checks."""

    # Structural requirements enforced by _validate_config (these mirror the
    # module-level constants of the same names).
    REQUIRED_CONFIG_KEYS = {'logging', 'checks', 'document_types'}
    REQUIRED_LOGGING_KEYS = {'level', 'format'}
    REQUIRED_CHECKS_KEYS = {'acronyms', 'terminology_check', 'headings'}

    def __init__(self, config_path: Optional[str] = None):
        """Initialize configuration with optional config file.

        Builds the built-in defaults, overlays the user config file (if
        given), validates the result, then sets up the logger and the
        regex pattern registry.
        """
        self.default_config = {
            "logging": {
                "level": DEFAULT_LOG_LEVEL,  # Use constant defined at top
                "format": DEFAULT_LOG_FORMAT  # Use constant defined at top
            },
            # Which check families are enabled.
            "checks": {
                "acronyms": True,
                "terminology_check": True,
                "headings": True
            },
            # Per-document-type heading requirements. An empty
            # required_headings list means nothing is enforced for the type.
            "document_types": {
                "Advisory Circular": {
                    "required_headings": [
                        "Purpose.",
                        "Applicability.",
                        "Cancellation.",
                        "Related Material.",
                        "Definition of Key Terms."
                    ],
                    "skip_title_check": False
                },
                "Federal Register Notice": {
                    "required_headings": [
                        "Purpose of This Notice",
                        "Audience",
                        "Where can I Find This Notice"
                    ],
                    "skip_title_check": False
                },
                "Order": {
                    "required_headings": [
                        "Purpose of This Order.",
                        "Audience.",
                        "Where to Find This Order."
                    ],
                    "skip_title_check": False
                },
                "Policy Statement": {
                    "required_headings": [
                        "SUMMARY",
                        "CURRENT REGULATORY AND ADVISORY MATERIAL",
                        "RELEVANT PAST PRACTICE",
                        "POLICY",
                        "EFFECT OF POLICY",
                        "CONCLUSION"
                    ],
                    "skip_title_check": False
                },
                "Technical Standard Order": {
                    "required_headings": [
                        "PURPOSE.",
                        "APPLICABILITY.",
                        "REQUIREMENTS.",
                        "MARKING.",
                        "APPLICATION DATA REQUIREMENTS.",
                        "MANUFACTURER DATA REQUIREMENTS.",
                        "FURNISHED DATA REQUIREMENTS.",
                        "HOW TO GET REFERENCED DOCUMENTS."
                    ],
                    "skip_title_check": False
                },
                # Types below have no heading requirements.
                "Airworthiness Criteria": {
                    "required_headings": [],
                    "skip_title_check": True
                },
                "Deviation Memo": {
                    "required_headings": [],
                    "skip_title_check": True
                },
                "Exemption": {
                    "required_headings": [],
                    "skip_title_check": True
                },
                "Rule": {
                    "required_headings": [],
                    "skip_title_check": True
                },
                "Special Condition": {
                    "required_headings": [],
                    "skip_title_check": True
                },
                "Other": {
                    "required_headings": [],
                    "skip_title_check": True
                }
            }
        }
        # Load (defaults + optional user file), then validate structure.
        self.config = self._load_config(config_path)
        self._validate_config(self.config)
        self.logger = self._setup_logger()
        self.pattern_registry = self._setup_patterns()
def _load_config(self, config_path: Optional[str] = None) -> Dict[str, Any]: | |
""" | |
Load configuration from JSON file or use default settings. | |
Args: | |
config_path (str, optional): Path to configuration file. | |
Returns: | |
Dict[str, Any]: Loaded configuration dictionary. | |
""" | |
if config_path and os.path.exists(config_path): | |
try: | |
with open(config_path, 'r') as f: | |
user_config = json.load(f) | |
# Deep merge default and user config | |
return self._deep_merge(self.default_config.copy(), user_config) | |
except (json.JSONDecodeError, IOError) as e: | |
logging.warning(f"Error loading config: {e}. Using default config.") | |
return self.default_config.copy() | |
return self.default_config.copy() | |
def _validate_config(self, config: Dict[str, Any]) -> None: | |
"""Validate configuration structure.""" | |
# Check required top-level keys | |
missing_keys = self.REQUIRED_CONFIG_KEYS - set(config.keys()) | |
if missing_keys: | |
raise ConfigurationError(f"Missing required configuration keys: {missing_keys}") | |
# Validate logging configuration | |
missing_logging = self.REQUIRED_LOGGING_KEYS - set(config['logging'].keys()) | |
if missing_logging: | |
raise ConfigurationError(f"Missing required logging keys: {missing_logging}") | |
# Validate checks configuration | |
missing_checks = self.REQUIRED_CHECKS_KEYS - set(config['checks'].keys()) | |
if missing_checks: | |
raise ConfigurationError(f"Missing required checks keys: {missing_checks}") | |
# Validate document types | |
if not isinstance(config['document_types'], dict): | |
raise ConfigurationError("Document types must be a dictionary") | |
# Validate each document type's configuration | |
for doc_type, type_config in config['document_types'].items(): | |
if not isinstance(type_config, dict): | |
raise ConfigurationError(f"Invalid configuration for document type {doc_type}") | |
# Check for required keys in each document type | |
required_keys = {'required_headings', 'skip_title_check'} | |
missing_type_keys = required_keys - set(type_config.keys()) | |
if missing_type_keys: | |
raise ConfigurationError( | |
f"Missing required keys {missing_type_keys} for document type {doc_type}" | |
) | |
# Validate required_headings is a list | |
if not isinstance(type_config['required_headings'], list): | |
raise ConfigurationError( | |
f"required_headings must be a list for document type {doc_type}" | |
) | |
# Validate skip_title_check is boolean | |
if not isinstance(type_config['skip_title_check'], bool): | |
raise ConfigurationError( | |
f"skip_title_check must be a boolean for document type {doc_type}" | |
) | |
def _deep_merge(self, base: Dict[str, Any], update: Dict[str, Any]) -> Dict[str, Any]: | |
""" | |
Recursively merge two dictionaries. | |
Args: | |
base (Dict): Base dictionary to merge into. | |
update (Dict): Dictionary to merge from. | |
Returns: | |
Dict: Merged dictionary. | |
""" | |
for key, value in update.items(): | |
if isinstance(value, dict) and key in base and isinstance(base[key], dict): | |
self._deep_merge(base[key], value) | |
else: | |
base[key] = value | |
return base | |
def _setup_logger(self) -> logging.Logger: | |
""" | |
Set up and configure logging based on configuration. | |
Returns: | |
logging.Logger: Configured logger instance. | |
""" | |
logger = logging.getLogger(__name__) | |
log_level = getattr(logging, self.config['logging']['level'].upper()) | |
formatter = logging.Formatter(self.config['logging']['format']) | |
# Console Handler | |
console_handler = logging.StreamHandler() | |
console_handler.setFormatter(formatter) | |
console_handler.setLevel(log_level) | |
logger.addHandler(console_handler) | |
logger.setLevel(log_level) | |
return logger | |
    def _setup_patterns(self) -> Dict[str, List[PatternConfig]]:
        """
        Set up comprehensive pattern registry for all document checks.

        Returns:
            Dict[str, List[PatternConfig]]: Dictionary of pattern configurations by category
        """
        # NOTE(review): the literal 'Β§' below looks like a mojibake of the
        # section symbol '§'. Pattern strings are kept byte-identical here —
        # confirm the file's encoding before changing them.
        return {
            # Preferred-term substitutions; each entry carries a replacement.
            'terminology': [
                PatternConfig(
                    pattern=r'\bUSC\b',
                    description="USC should be U.S.C.",
                    is_error=True,
                    replacement="U.S.C."
                ),
                PatternConfig(
                    pattern=r'\bCFR Part\b',
                    description="CFR Part should be CFR part",
                    is_error=True,
                    replacement="CFR part"
                ),
                PatternConfig(
                    pattern=r'\bC\.F\.R\.\b',
                    description="C.F.R. should be CFR",
                    is_error=True,
                    replacement="CFR"
                ),
                PatternConfig(
                    pattern=r'\b14 CFR\s*Β§',
                    description="14 CFR Β§ should be 14 CFR",
                    is_error=True,
                    replacement="14 CFR"
                ),
                PatternConfig(
                    pattern=r'\bWe\b',
                    description="'We' should be 'The FAA'",
                    is_error=True,
                    replacement="The FAA"
                ),
                PatternConfig(
                    pattern=r'\bwe\b',
                    description="'we' should be 'the FAA'",
                    is_error=True,
                    replacement="the FAA"
                ),
                PatternConfig(
                    pattern=r'\bcancelled\b',
                    description="'cancelled' should be 'canceled'",
                    is_error=True,
                    replacement="canceled"
                ),
                PatternConfig(
                    pattern=r'\bshall\b',
                    description="'shall' should be 'must'",
                    is_error=True,
                    replacement="must"
                ),
                PatternConfig(
                    pattern=r'\b\&\b',
                    description="'&' should be 'and'",
                    is_error=True,
                    replacement="and"
                ),
                PatternConfig(
                    pattern=r'\bflight crew\b',
                    description="'flight crew' should be 'flightcrew'",
                    is_error=True,
                    replacement="flightcrew"
                )
            ],
            # Section-symbol usage problems (consumed by
            # check_section_symbol_usage, which maps patterns to categories).
            'section_symbol': [
                PatternConfig(
                    pattern=r'^Β§',
                    description="Sentence should not start with section symbol",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'\b14 CFR Β§\s*\d+\.\d+\b',
                    description="14 CFR should not use section symbol",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'Β§\s*\d+\.\d+\s+(?:and|or)\s+\d+\.\d+',
                    description="Missing section symbol in multiple sections",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'Β§\s*\d+\.\d+\s+through\s+\d+\.\d+',
                    description="Missing section symbol in range of sections",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'Β§\s*\d+\.\d+\s+or\s+Β§?\s*\d+\.\d+',
                    description="Inconsistent section symbol usage with 'or'",
                    is_error=True
                )
            ],
            # Missing / doubled whitespace around references.
            'spacing': [
                PatternConfig(
                    pattern=r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)',
                    description="Missing space between document type and number",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'(?<!\s)(Β§|Β§Β§)(\d+\.\d+)',
                    description="Missing space after section symbol (Β§)",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'(?<!\s)Part(\d+)',
                    description="Missing space between 'Part' and number",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))',
                    description="Missing space before paragraph indication",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'\s{2,}',
                    description="Double spaces between words",
                    is_error=True
                )
            ],
            # Numeric date formats that should be spelled out.
            'dates': [
                PatternConfig(
                    pattern=r'(?<![\w/-])\d{1,2}/\d{1,2}/\d{2,4}(?![\w/-])',
                    description="Use 'Month Day, Year' format instead of MM/DD/YYYY",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'(?<![\w/-])\d{1,2}-\d{1,2}-\d{2,4}(?![\w/-])',
                    description="Use 'Month Day, Year' format instead of MM-DD-YYYY",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'(?<![\w/-])\d{4}-\d{1,2}-\d{1,2}(?![\w/-])',
                    description="Use 'Month Day, Year' format instead of YYYY-MM-DD",
                    is_error=True
                )
            ],
            # Draft placeholders that must not survive into a final document.
            'placeholders': [
                PatternConfig(
                    pattern=r'\bTBD\b',
                    description="Remove TBD placeholder",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'\bTo be determined\b',
                    description="Remove 'To be determined' placeholder",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'\bTo be added\b',
                    description="Remove 'To be added' placeholder",
                    is_error=True
                )
            ],
            # Vague positional references (consumed by check_terminology,
            # which applies these case-insensitively).
            'reference_terms': [
                PatternConfig(
                    pattern=r'\babove\b',
                    description="Avoid using 'above' for references",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'\bbelow\b',
                    description="Avoid using 'below' for references",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'(?:^|(?<=[.!?]\s))There\s+(?:is|are)\b',
                    description="Avoid starting sentences with 'There is/are'",
                    is_error=True
                )
            ],
            'periods': [
                PatternConfig(
                    pattern=r'\.\.',
                    description="Remove double periods",
                    is_error=True
                )
            ],
            # Capitalization of table/figure references by sentence position.
            'table_figure_references': [
                PatternConfig(
                    pattern=r'(?<!^)(?<![.!?])\s+[T]able\s+\d+(?:-\d+)?',
                    description="Table reference within sentence should be lowercase",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'(?<!^)(?<![.!?])\s+[F]igure\s+\d+(?:-\d+)?',
                    description="Figure reference within sentence should be lowercase",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'^[t]able\s+\d+(?:-\d+)?',
                    description="Table reference at start of sentence should be capitalized",
                    is_error=True
                ),
                PatternConfig(
                    pattern=r'^[f]igure\s+\d+(?:-\d+)?',
                    description="Figure reference at start of sentence should be capitalized",
                    is_error=True
                )
            ]
        }
def profile_performance(func):
    """Decorator that logs how long each call to *func* takes.

    Expects the wrapped callable to be method-like: its first positional
    argument must expose a ``logger`` attribute.
    """
    @wraps(func)  # was missing: wrapped methods lost __name__/__doc__
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic; time.time() could jump with clock changes.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        # Get logger from the class instance (first argument).
        logger = args[0].logger
        logger.info(
            f"Performance: {func.__name__} took {elapsed:.4f} seconds"
        )
        return result
    return wrapper
# 8. FAA Document Checker | |
class FAADocumentChecker(DocumentChecker):
    """Document checker implementation for FAA documents.

    Adds FAA-specific checks (required headings, heading periods, acronym
    definitions, terminology, section-symbol usage) on top of the base
    DocumentChecker plumbing.
    """

    # Constructor
    def __init__(self, config_path: Optional[str] = None):
        # All configuration/logger setup is handled by the base class.
        super().__init__(config_path)
# Core Check Methods | |
def heading_title_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult: | |
""" | |
Check headings for a specific document type. | |
Args: | |
doc (List[str]): List of document paragraphs | |
doc_type (str): Type of document being checked | |
Returns: | |
DocumentCheckResult: Result of heading check including found and missing headings | |
""" | |
if not self.validate_input(doc): | |
self.logger.error("Invalid document input for heading check") | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
# Validate document type | |
doc_type_config = self.config_manager.config['document_types'].get(doc_type) | |
if not doc_type_config: | |
self.logger.error(f"Unsupported document type: {doc_type}") | |
return DocumentCheckResult( | |
success=False, | |
issues=[{'error': f'Unsupported document type: {doc_type}'}] | |
) | |
# Get configuration for document-specific headings | |
required_headings = doc_type_config.get('required_headings', []) | |
if not required_headings: | |
return DocumentCheckResult( | |
success=True, | |
issues=[], | |
details={'message': f'No required headings defined for {doc_type}'} | |
) | |
headings_found = [] | |
required_headings_set = set(required_headings) | |
# Extract and normalize headings from document | |
for para in doc: | |
para_strip = para.strip() | |
# Handle both exact matches and variations with trailing periods | |
para_base = para_strip.rstrip('.') | |
if para_base in required_headings_set or para_strip in required_headings_set: | |
headings_found.append(para_strip) | |
# Check if all required headings are found | |
found_headings_set = set(headings_found) | |
missing_headings = required_headings_set - found_headings_set | |
unexpected_headings = found_headings_set - required_headings_set | |
success = len(missing_headings) == 0 | |
issues = [] | |
if not success: | |
issues.append({ | |
'type': 'missing_headings', | |
'missing': list(missing_headings) | |
}) | |
if unexpected_headings: | |
issues.append({ | |
'type': 'unexpected_headings', | |
'unexpected': list(unexpected_headings) | |
}) | |
details = { | |
'found_headings': list(found_headings_set), | |
'required_headings': required_headings, | |
'document_type': doc_type, | |
'missing_count': len(missing_headings), | |
'unexpected_count': len(unexpected_headings) | |
} | |
return DocumentCheckResult(success=success, issues=issues, details=details) | |
    def heading_title_period_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
        """
        Check if headings end with periods according to document type requirements.
        Enforces both required periods and no periods based on document type.

        Args:
            doc (List[str]): List of document paragraphs
            doc_type (str): Type of document being checked

        Returns:
            DocumentCheckResult: Result of the heading period check including:
                - success: Boolean indicating if all headings follow period rules
                - issues: List of dicts with heading format issues
                - details: Additional information about the check
        """
        if not self.validate_input(doc):
            self.logger.error("Invalid document input for period check")
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        # Validate document type exists in configuration
        doc_type_config = self.config_manager.config['document_types'].get(doc_type)
        if not doc_type_config:
            self.logger.error(f"Unsupported document type: {doc_type}")
            return DocumentCheckResult(
                success=False,
                issues=[{'error': f'Unsupported document type: {doc_type}'}]
            )
        # PERIOD_REQUIRED is the module-level policy table (True/False per type).
        should_have_period = PERIOD_REQUIRED.get(doc_type)
        if should_have_period is None:
            self.logger.error(f"Period requirement not defined for document type: {doc_type}")
            return DocumentCheckResult(
                success=False,
                issues=[{'error': f'Period requirement not defined for document type: {doc_type}'}]
            )
        # Get the headings configuration for this document type
        required_headings = doc_type_config.get('required_headings', [])
        if not required_headings:
            return DocumentCheckResult(
                success=True,
                issues=[],
                details={'message': f'No required headings defined for {doc_type}'}
            )
        issues = []
        checked_headings = []
        # Create a set of normalized required headings (without periods)
        # Strip periods from the required headings to allow for flexible matching
        required_headings_set = {h.rstrip('.') for h in required_headings}
        for para in doc:
            para_strip = para.strip()
            para_base = para_strip.rstrip('.')
            # Check only if paragraph is a heading (comparing without periods)
            if para_base in required_headings_set:
                ends_with_period = para_strip.endswith('.')
                # Check for both cases:
                # 1. Should have period but doesn't
                # 2. Shouldn't have period but does
                if should_have_period and not ends_with_period:
                    issues.append({
                        'heading': para_strip,
                        'issue': 'missing_period',
                        'message': f"Heading should end with a period: '{para_strip}'"
                    })
                elif not should_have_period and ends_with_period:
                    issues.append({
                        'heading': para_strip,
                        'issue': 'unexpected_period',
                        'message': f"Heading should not have a period: '{para_strip}'"
                    })
                # Record every matched heading, compliant or not.
                checked_headings.append({
                    'heading': para_strip,
                    'has_period': ends_with_period,
                    'needs_period': should_have_period
                })
        # Calculate statistics for the details
        total_checked = len(checked_headings)
        total_issues = len(issues)
        incorrect_period_count = sum(1 for h in checked_headings
                                     if h['has_period'] != h['needs_period'])
        # Detailed results for debugging and reporting
        details = {
            'document_type': doc_type,
            'periods_required': should_have_period,
            'checked_headings': checked_headings,
            'total_checked': total_checked,
            'total_issues': total_issues,
            'incorrect_period_count': incorrect_period_count
        }
        success = len(issues) == 0
        # Log summary for debugging
        self.logger.debug(f"Period check for {doc_type}: "
                          f"checked {total_checked} headings, "
                          f"found {total_issues} issues")
        return DocumentCheckResult(success=success, issues=issues, details=details)
def acronym_check(self, doc: List[str]) -> DocumentCheckResult: | |
""" | |
Check if acronyms are defined at their first use, ignoring uppercase headings | |
and common exceptions. | |
""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
defined_acronyms = set() | |
first_occurrences = {} # Track first occurrence of each acronym | |
undefined_acronyms = [] | |
# Common words that might appear in uppercase but aren't acronyms | |
heading_words = { | |
'INFORMATION', 'GENERAL', 'SUMMARY', 'INTRODUCTION', 'BACKGROUND', | |
'DISCUSSION', 'CONCLUSION', 'APPENDIX', 'CHAPTER', 'SECTION', | |
'PURPOSE', 'APPLICABILITY', 'CANCELLATION', 'DEFINITION', 'REQUIREMENTS', | |
'AUTHORITY', 'POLICY', 'SCOPE', 'RELATED', 'MATERIAL', 'DISTRIBUTION', | |
'EXPLANATION', 'PROCEDURES', 'NOTE', 'WARNING', 'CAUTION', 'EXCEPTION', | |
'GROUPS', 'PARTS', 'TABLE', 'FIGURE', 'REFERENCES', 'DEFINITIONS' | |
} | |
# Standard acronyms that don't need to be defined | |
predefined_acronyms = { | |
'CFR', 'U.S.', 'USA', 'US', 'U.S.C', 'e.g.', 'i.e.', 'FAQ', 'No.', 'ZIP', 'PDF', 'SSN', | |
'DC', 'MA', 'WA', 'TX', 'MO' | |
} | |
defined_acronyms.update(predefined_acronyms) | |
# Pattern for finding defined acronyms like "Federal Aviation Administration (FAA)" | |
defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)') | |
# Modified acronym pattern to exclude common heading patterns | |
acronym_pattern = re.compile(r'\b[A-Z]{2,}\b(?!\s*[:.]\s*)') | |
for paragraph in doc: | |
# Skip lines that appear to be headings (all uppercase with common heading words) | |
words = paragraph.strip().split() | |
if all(word.isupper() for word in words) and any(word in heading_words for word in words): | |
continue | |
# Check for definitions first | |
defined_matches = defined_pattern.findall(paragraph) | |
for full_term, acronym in defined_matches: | |
defined_acronyms.add(acronym) | |
# If this was previously marked as undefined, remove it | |
if acronym in first_occurrences: | |
del first_occurrences[acronym] | |
# Check for acronym usage | |
usage_matches = acronym_pattern.finditer(paragraph) | |
for match in usage_matches: | |
acronym = match.group() | |
# Skip if it's part of a heading or contains non-letter characters | |
if (acronym in heading_words or | |
any(not c.isalpha() for c in acronym) or | |
len(acronym) > 10): # Usually acronyms aren't this long | |
continue | |
if acronym not in defined_acronyms: | |
# Only process if we haven't seen this acronym before | |
if acronym not in first_occurrences: | |
# Find the sentence containing the first undefined acronym | |
sentences = re.split(r'(?<=[.!?])\s+', paragraph) | |
for sentence in sentences: | |
if acronym in sentence: | |
# Additional check to avoid marking uppercase headings | |
if not (sentence.isupper() and any(word in heading_words for word in sentence.split())): | |
first_occurrences[acronym] = { | |
'acronym': acronym, | |
'sentence': sentence.strip() | |
} | |
break | |
# Convert first occurrences to list of issues | |
undefined_acronyms = list(first_occurrences.values()) | |
success = len(undefined_acronyms) == 0 | |
issues = undefined_acronyms if not success else [] | |
return DocumentCheckResult(success=success, issues=issues) | |
def check_terminology(self, doc: List[str]) -> DocumentCheckResult: | |
""" | |
Check document terminology for: | |
1. Legal reference formatting and preferred terms | |
2. Prohibited phrases and constructions | |
""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
# Get patterns from the pattern registry | |
terminology_patterns = self.config_manager.pattern_registry.get('terminology', []) | |
prohibited_patterns = self.config_manager.pattern_registry.get('reference_terms', []) | |
# Dictionary of terms that should be replaced with preferred alternatives | |
incorrect_terms = [] | |
prohibited_phrases = [] | |
# Check each paragraph | |
for paragraph in doc: | |
sentences = re.split(r'(?<=[.!?])\s+', paragraph) | |
for sentence in sentences: | |
# Check for incorrect terms that need replacement | |
for pattern_config in terminology_patterns: | |
compiled_pattern = re.compile(pattern_config.pattern) | |
matches = compiled_pattern.finditer(sentence) | |
for match in matches: | |
incorrect_term = match.group() | |
incorrect_terms.append({ | |
'incorrect_term': incorrect_term, | |
'correct_term': pattern_config.replacement, | |
'description': pattern_config.description, | |
'sentence': sentence.strip(), | |
'pattern_type': 'terminology' | |
}) | |
# Check for prohibited phrases and constructions | |
for pattern_config in prohibited_patterns: | |
compiled_pattern = re.compile(pattern_config.pattern, re.IGNORECASE) | |
match = compiled_pattern.search(sentence) | |
if match: | |
prohibited_phrases.append({ | |
'phrase': match.group().strip(), | |
'description': pattern_config.description, | |
'sentence': sentence.strip(), | |
'pattern_type': 'prohibited_phrase' | |
}) | |
# Organize issues by category for better reporting | |
issues = [] | |
if incorrect_terms: | |
# Group incorrect terms by type for better organization | |
term_groups = {} | |
for term in incorrect_terms: | |
key = (term['incorrect_term'], term['correct_term']) | |
if key not in term_groups: | |
term_groups[key] = [] | |
term_groups[key].append(term['sentence']) | |
for (incorrect, correct), sentences in term_groups.items(): | |
issues.append({ | |
'type': 'incorrect_term', | |
'incorrect_term': incorrect, | |
'correct_term': correct, | |
'occurrence_count': len(sentences), | |
'sentences': sentences[:3], # Show first 3 examples | |
'total_occurrences': len(sentences) | |
}) | |
if prohibited_phrases: | |
# Group prohibited phrases for better reporting | |
phrase_groups = {} | |
for phrase in prohibited_phrases: | |
if phrase['description'] not in phrase_groups: | |
phrase_groups[phrase['description']] = [] | |
phrase_groups[phrase['description']].append(phrase['sentence']) | |
for description, sentences in phrase_groups.items(): | |
issues.append({ | |
'type': 'prohibited_phrase', | |
'description': description, | |
'occurrence_count': len(sentences), | |
'sentences': sentences[:3], # Show first 3 examples | |
'total_occurrences': len(sentences) | |
}) | |
# Add summary information | |
details = { | |
'total_terminology_issues': len(incorrect_terms), | |
'total_prohibited_phrases': len(prohibited_phrases), | |
'unique_term_types': len(set(term['incorrect_term'] for term in incorrect_terms)), | |
'unique_phrase_types': len(set(phrase['description'] for phrase in prohibited_phrases)) | |
} | |
success = len(issues) == 0 | |
return DocumentCheckResult( | |
success=success, | |
issues=issues, | |
details=details | |
) | |
def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
    """Check for various section symbol (Β§) usage issues.

    Matches the 'section_symbol' patterns from the pattern registry against
    each paragraph (or each sentence, for the sentence-start rule) and
    buckets the hits into four issue categories.

    Args:
        doc: List of document paragraphs.

    Returns:
        DocumentCheckResult: success=True only when no issues were found.
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
    # Get patterns from registry while maintaining categorization
    section_patterns = self.config_manager.pattern_registry.get('section_symbol', [])
    # Initialize categorized issue lists
    sentences_starting_with_section_symbol = []
    incorrect_14_CFR_section_symbol_usage = []
    single_section_symbol_multiple_sections = []
    missing_section_symbol_in_multiple_sections = []
    # Map registry pattern strings directly to their category buckets
    # (dict lookup instead of scanning every category per pattern).
    pattern_categories = {
        r'^Β§': ('sentences_starting_with_section_symbol', sentences_starting_with_section_symbol),
        r'\b14 CFR Β§\s*\d+\.\d+\b': ('incorrect_14_CFR_section_symbol_usage', incorrect_14_CFR_section_symbol_usage),
        r'Β§\s*\d+\.\d+\s+(?:and|or|through)\s+\d+\.\d+': ('single_section_symbol_multiple_sections', single_section_symbol_multiple_sections),
        r'Β§\s*\d+\.\d+\s+or\s+Β§?\s*\d+\.\d+': ('missing_section_symbol_in_multiple_sections', missing_section_symbol_in_multiple_sections)
    }
    # Process patterns and categorize matches
    for paragraph in doc:
        sentences = re.split(r'(?<=[.!?])\s+', paragraph)
        for pattern_config in section_patterns:
            category = pattern_categories.get(pattern_config.pattern)
            if category is None:
                # Registry pattern is not one of the categorized rules.
                continue
            category_name, category_list = category
            compiled_pattern = re.compile(pattern_config.pattern)
            if category_name == 'sentences_starting_with_section_symbol':
                # For sentence-start patterns, check each sentence
                for sentence in sentences:
                    if compiled_pattern.match(sentence.strip()):
                        category_list.append(sentence.strip())
            else:
                # For other patterns, check the whole paragraph
                category_list.extend(compiled_pattern.findall(paragraph))
    # Compile issues maintaining the original structure
    issues = []
    if sentences_starting_with_section_symbol:
        issues.append({
            'issue': 'sentences_starting_with_section_symbol',
            'sentences': sentences_starting_with_section_symbol,
            'description': "Sentences should not start with section symbol"
        })
    if incorrect_14_CFR_section_symbol_usage:
        issues.append({
            'issue': 'incorrect_14_CFR_section_symbol_usage',
            'matches': incorrect_14_CFR_section_symbol_usage,
            'description': "14 CFR should not use section symbol"
        })
    if single_section_symbol_multiple_sections:
        issues.append({
            'issue': 'single_section_symbol_multiple_sections',
            'matches': single_section_symbol_multiple_sections,
            # Fixed: previous text ("Missing section symbol in multiple
            # sections") described the category below, not this one.
            'description': "Single section symbol used for multiple sections"
        })
    if missing_section_symbol_in_multiple_sections:
        issues.append({
            'issue': 'missing_section_symbol_in_multiple_sections',
            'matches': missing_section_symbol_in_multiple_sections,
            # Fixed: description now matches the category name.
            'description': "Missing section symbol in multiple sections"
        })
    success = len(issues) == 0
    return DocumentCheckResult(success=success, issues=issues)
def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult:
    """Check for correctly formatted captions (Table or Figure).

    Advisory Circulars and Orders use chapter-relative numbering
    ("Table X-Y"); all other document types use a plain number ("Table X").
    Lines inside a Table of Contents are skipped.
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
    # Choose the expected caption shape for this document type.
    if doc_type in ("Advisory Circular", "Order"):
        caption_pattern = re.compile(rf'^{caption_type}\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]', re.IGNORECASE)
        correct_format = f"{caption_type} X-Y"
    else:
        caption_pattern = re.compile(rf'^{caption_type}\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE)
        correct_format = f"{caption_type} X"
    incorrect_captions = []
    in_toc = False
    lowered_prefix = caption_type.lower()
    for paragraph in doc:
        # A TOC heading turns skipping on; a blank line turns it off again.
        if "Table of Contents" in paragraph or "Contents" in paragraph:
            in_toc = True
            continue
        if in_toc and paragraph.strip() == "":
            in_toc = False  # Assume blank line marks the end of TOC
        if in_toc:
            continue
        # Only validate lines that actually begin with the caption keyword.
        stripped = paragraph.strip()
        if stripped.lower().startswith(lowered_prefix) and not caption_pattern.match(stripped):
            incorrect_captions.append({
                'incorrect_caption': stripped,
                'correct_format': correct_format
            })
    return DocumentCheckResult(success=not incorrect_captions, issues=incorrect_captions)
def table_figure_reference_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
    """
    Check for incorrect references to tables and figures in the document.
    References should be lowercase within sentences and capitalized at sentence start.

    Args:
        doc (List[str]): List of document paragraphs
        doc_type (str): Type of document being checked

    Returns:
        DocumentCheckResult: Result of table and figure reference check
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
    # AC/Order documents use chapter-relative numbering (X-Y); others use X.
    if doc_type in ["Advisory Circular", "Order"]:
        reference_patterns = [
            (re.compile(r'\b([Tt]able)\s+\d+-\d+\b'), "Table"),
            (re.compile(r'\b([Ff]igure)\s+\d+-\d+\b'), "Figure"),
        ]
    else:
        reference_patterns = [
            (re.compile(r'\b([Tt]able)\s+\d+\b'), "Table"),
            (re.compile(r'\b([Ff]igure)\s+\d+\b'), "Figure"),
        ]
    # Caption lines ("Table 1-1." etc.) are validated by caption_check, not here.
    caption_pattern = re.compile(r'^(Table|Figure)\s+\d+[-\d]*\.?', re.IGNORECASE)
    findings = []
    for paragraph in doc:
        if caption_pattern.match(paragraph.strip()):
            continue
        for raw_sentence in re.split(r'(?<=[.!?])\s+', paragraph):
            sentence = raw_sentence.strip()
            for pattern, ref_type in reference_patterns:
                for match in pattern.finditer(sentence):
                    ref = match.group()
                    keyword = match.group(1)  # the literal "Table"/"table" word
                    preceding = sentence[:match.start()].strip()
                    # Treat a reference after ':' or ';' like a sentence start.
                    at_sentence_start = not preceding or preceding.endswith((':', ';'))
                    if at_sentence_start and keyword[0].islower():
                        findings.append({
                            'reference': ref,
                            'issue': f"{ref_type} reference at sentence start should be capitalized",
                            'sentence': sentence,
                            'correct_form': ref.capitalize()
                        })
                    elif not at_sentence_start and keyword[0].isupper():
                        findings.append({
                            'reference': ref,
                            'issue': f"{ref_type} reference within sentence should be lowercase",
                            'sentence': sentence,
                            'correct_form': ref.lower()
                        })
    return DocumentCheckResult(success=len(findings) == 0, issues=findings)
def document_title_check(self, doc_path: str, doc_type: str) -> DocumentCheckResult:
    """Check for correct formatting of document titles.

    Opens the .docx at doc_path directly (run-level formatting such as
    italics is not visible from plain paragraph text) and verifies that
    titles written as "AC <number>, <title>" carry the italics/quotes
    formatting required for the given document type.

    Args:
        doc_path: Path to the .docx file.
        doc_type: Document type used to select the formatting rule.

    Returns:
        DocumentCheckResult: one issue per incorrectly formatted title;
        success=True if the type is unsupported (check is skipped).
    """
    try:
        doc = Document(doc_path)
    except Exception as e:
        self.logger.error(f"Error reading the document in title check: {e}")
        return DocumentCheckResult(success=False, issues=[{'error': str(e)}])
    incorrect_titles = []
    # Define formatting rules for different document types:
    # whether the cited title must be italicized and/or quoted.
    formatting_rules = {
        "Advisory Circular": {"italics": True, "quotes": False},
        "Airworthiness Criteria": {"italics": False, "quotes": True},
        "Deviation Memo": {"italics": False, "quotes": True},
        "Exemption": {"italics": False, "quotes": True},
        "Federal Register Notice": {"italics": False, "quotes": True},
        "Order": {"italics": False, "quotes": True},
        "Policy Statement": {"italics": False, "quotes": False},
        "Rule": {"italics": False, "quotes": True},
        "Special Condition": {"italics": False, "quotes": True},
        "Technical Standard Order": {"italics": False, "quotes": True},
        "Other": {"italics": False, "quotes": False}
    }
    if doc_type not in formatting_rules:
        self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
        return DocumentCheckResult(success=True, issues=[])
    required_format = formatting_rules[doc_type]
    # Group 1: the "AC <number>," prefix; group 2: the title text up to
    # the next period/comma or end of line.
    ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')
    for paragraph in doc.paragraphs:
        text = paragraph.text
        matches = ac_pattern.finditer(text)
        for match in matches:
            full_match = match.group(0)
            title_text = match.group(2).strip()
            # Get the position where the title starts
            title_start = match.start(2)
            title_end = match.end(2)
            # Check for any type of quotation marks, including smart quotes
            title_in_quotes = any(q in title_text for q in ['"', "'", 'β', 'β', 'β', 'β'])
            # Check the formatting of the title by walking the runs until we
            # find the run that contains the first character of the title;
            # that run's italic flag stands in for the whole title.
            title_is_italicized = False
            current_pos = 0
            for run in paragraph.runs:
                run_length = len(run.text)
                run_start = current_pos
                run_end = current_pos + run_length
                if run_start <= title_start < run_end:
                    # NOTE(review): run.italic may be None (inherited style);
                    # None is treated as "not italicized" below.
                    title_is_italicized = run.italic
                    break
                current_pos += run_length
            # Check if formatting matches the required format
            formatting_incorrect = False
            issue_message = []
            # Check italics requirement
            if required_format["italics"] and not title_is_italicized:
                formatting_incorrect = True
                issue_message.append("should be italicized")
            elif not required_format["italics"] and title_is_italicized:
                formatting_incorrect = True
                issue_message.append("should not be italicized")
            # Check quotes requirement
            if required_format["quotes"] and not title_in_quotes:
                formatting_incorrect = True
                issue_message.append("should be in quotes")
            elif not required_format["quotes"] and title_in_quotes:
                formatting_incorrect = True
                issue_message.append("should not be in quotes")
            if formatting_incorrect:
                incorrect_titles.append({
                    'text': title_text,
                    'issue': ', '.join(issue_message),
                    'sentence': text.strip()
                })
    success = len(incorrect_titles) == 0
    return DocumentCheckResult(success=success, issues=incorrect_titles)
def double_period_check(self, doc: List[str]) -> DocumentCheckResult:
    """Check for sentences that end with two periods.

    Splits each paragraph on sentence-ending punctuation and flags every
    fragment terminating in '..'.
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
    flagged = [
        {'sentence': candidate.strip()}
        for paragraph in doc
        for candidate in re.split(r'(?<=[.!?]) +', paragraph)
        if candidate.endswith('..')
    ]
    return DocumentCheckResult(success=not flagged, issues=flagged)
def spacing_check(self, doc: List[str]) -> DocumentCheckResult:
    """Check for correct spacing in the document.

    Matches the registry's 'spacing' patterns sentence-by-sentence and
    reports one issue entry per spacing category that had any matches.
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
    # Get patterns from registry
    spacing_patterns = self.config_manager.pattern_registry.get('spacing', [])
    # (registry pattern, issue_type, report description, collected matches)
    # — order here fixes the order of the reported issues.
    categories = [
        (r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)', 'document_type_spacing',
         'Missing space between document type and number', []),
        (r'(?<!\s)(Β§|Β§Β§)(\d+\.\d+)', 'section_symbol_spacing',
         'Missing space after section symbol', []),
        (r'(?<!\s)Part(\d+)', 'part_number_spacing',
         'Missing space between Part and number', []),
        (r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))', 'paragraph_spacing',
         'Missing space before paragraph indication', []),
        (r'\s{2,}', 'double_spacing',
         'Multiple spaces between words', []),
    ]
    buckets = {entry[0]: entry[3] for entry in categories}
    for paragraph in doc:
        for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
            for pattern_config in spacing_patterns:
                bucket = buckets.get(pattern_config.pattern)
                if bucket is None:
                    continue  # registry pattern is not a categorized spacing rule
                for match in re.compile(pattern_config.pattern).finditer(sentence):
                    bucket.append({
                        'text': match.group(),
                        'sentence': sentence.strip(),
                        'description': pattern_config.description
                    })
    issues = [
        {
            'issue_type': issue_type,
            'description': description,
            'occurrences': occurrences
        }
        for _, issue_type, description, occurrences in categories
        if occurrences
    ]
    return DocumentCheckResult(success=len(issues) == 0, issues=issues)
def check_abbreviation_usage(self, doc: List[str]) -> DocumentCheckResult:
    """Check for abbreviation consistency after first definition.

    Finds acronym definitions of the form "Full Term (ACRO)" and flags:
      * the same acronym being defined again with a different full term, and
      * the spelled-out full term reappearing after the acronym was defined
        (the acronym should be used instead).

    Args:
        doc: List of document paragraphs.

    Returns:
        DocumentCheckResult: issues plus summary details of defined acronyms.
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
    # Fall back to a default definition-detection pattern if the registry
    # has no 'abbreviations' category configured.
    abbreviation_patterns = self.config_manager.pattern_registry.get('abbreviations', [
        PatternConfig(
            pattern=r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)',
            description="Acronym definition pattern",
            is_error=False  # Not an error, just a pattern to find definitions
        )
    ])
    abbreviations = {}          # acronym -> {'full_term', 'defined', 'first_occurrence'}
    inconsistent_uses = []      # full term used after its acronym was defined
    duplicate_definitions = []  # same acronym defined with different terms
    for paragraph in doc:
        sentences = re.split(r'(?<=[.!?])\s+', paragraph)
        for sentence in sentences:
            # Record acronym definitions found in this sentence.
            for pattern_config in abbreviation_patterns:
                compiled_pattern = re.compile(pattern_config.pattern)
                for match in compiled_pattern.finditer(sentence):
                    full_term, acronym = match.groups()
                    full_term = full_term.strip()
                    if acronym in abbreviations:
                        # Re-definition with a different expansion is an issue.
                        if abbreviations[acronym]["full_term"] != full_term:
                            duplicate_definitions.append({
                                'acronym': acronym,
                                'first_definition': abbreviations[acronym]["full_term"],
                                'second_definition': full_term,
                                'sentence': sentence.strip()
                            })
                    else:
                        abbreviations[acronym] = {
                            "full_term": full_term,
                            "defined": True,
                            "first_occurrence": sentence.strip()
                        }
            # Flag any spelled-out full term that appears after its acronym
            # was defined. (Bug fix: the old code tested
            # `not data["defined"]` and then cleared the flag, which
            # silently skipped the FIRST offending use and only flagged
            # from the second one onward — contradicting its own
            # "Only flag if already defined" comment.)
            for acronym, data in abbreviations.items():
                full_term = data["full_term"]
                if full_term in sentence:
                    # Skip the sentence that defined the acronym itself.
                    if sentence.strip() == data["first_occurrence"]:
                        continue
                    inconsistent_uses.append({
                        'issue_type': 'full_term_after_acronym',
                        'full_term': full_term,
                        'acronym': acronym,
                        'sentence': sentence.strip(),
                        'definition_context': data["first_occurrence"]
                    })
    # Compile all issues
    issues = []
    if duplicate_definitions:
        issues.append({
            'issue_type': 'duplicate_acronym_definition',
            'description': 'Acronym defined multiple times with different terms',
            'occurrences': duplicate_definitions
        })
    if inconsistent_uses:
        issues.append({
            'issue_type': 'inconsistent_acronym_usage',
            'description': 'Full term used after acronym was defined',
            'occurrences': inconsistent_uses
        })
    # Add summary information
    details = {
        'total_acronyms_defined': len(abbreviations),
        'total_duplicate_definitions': len(duplicate_definitions),
        'total_inconsistent_uses': len(inconsistent_uses),
        'defined_acronyms': [
            {
                'acronym': acronym,
                'full_term': data['full_term'],
                'first_occurrence': data['first_occurrence']
            }
            for acronym, data in abbreviations.items()
        ]
    }
    success = len(issues) == 0
    return DocumentCheckResult(
        success=success,
        issues=issues,
        details=details
    )
def check_date_formats(self, doc: List[str]) -> DocumentCheckResult:
    """Check for inconsistent date formats while ignoring aviation reference numbers."""
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
    # Get patterns from registry
    date_patterns = self.config_manager.pattern_registry.get('dates', [])
    # Aviation reference numbers that merely look like dates.
    ignore_pattern = re.compile('|'.join([
        r'\bAD \d{4}-\d{2}-\d{2}\b',        # Airworthiness Directive references
        r'\bSWPM \d{2}-\d{2}-\d{2}\b',      # Standard Wiring Practices Manual references
        r'\bAMM \d{2}-\d{2}-\d{2}\b',       # Aircraft Maintenance Manual references
        r'\bSOPM \d{2}-\d{2}-\d{2}\b',      # Standard Operating Procedure references
        r'\b[A-Z]{2,4} \d{2}-\d{2}-\d{2}\b' # Generic manual reference pattern
    ]))
    slash_format_dates = []   # MM/DD/YYYY
    hyphen_format_dates = []  # MM-DD-YYYY or YYYY-MM-DD
    for paragraph in doc:
        for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
            # Mask ignored references with same-length filler so match
            # positions still line up with the original sentence.
            masked = ignore_pattern.sub(lambda m: 'X' * len(m.group()), sentence)
            for pattern_config in date_patterns:
                for match in re.compile(pattern_config.pattern).finditer(masked):
                    # Read the original (unmasked) text at the match position.
                    original_date = sentence[match.start():match.end()]
                    record = {
                        'date': original_date,
                        'description': pattern_config.description,
                        'sentence': sentence.strip()
                    }
                    if '/' in original_date:
                        slash_format_dates.append(record)
                    elif '-' in original_date:
                        hyphen_format_dates.append(record)
    # Compile issues
    issues = []
    if slash_format_dates:
        issues.append({
            'issue_type': 'slash_date_format',
            'description': "Dates should use 'Month Day, Year' format instead of MM/DD/YYYY",
            'occurrences': slash_format_dates
        })
    if hyphen_format_dates:
        issues.append({
            'issue_type': 'hyphen_date_format',
            'description': "Dates should use 'Month Day, Year' format instead of MM-DD-YYYY or YYYY-MM-DD",
            'occurrences': hyphen_format_dates
        })
    return DocumentCheckResult(success=len(issues) == 0, issues=issues)
def check_placeholders(self, doc: List[str]) -> DocumentCheckResult:
    """Check for placeholders that should be removed.

    Matches the registry's 'placeholders' patterns (case-insensitively)
    sentence-by-sentence and reports one issue entry per placeholder kind.
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
    # Get patterns from registry
    placeholder_patterns = self.config_manager.pattern_registry.get('placeholders', [])
    # (registry pattern, issue_type, report description, collected matches)
    categories = [
        (r'\bTBD\b', 'tbd_placeholder',
         'Remove TBD placeholder', []),
        (r'\bTo be determined\b', 'to_be_determined_placeholder',
         "Remove 'To be determined' placeholder", []),
        (r'\bTo be added\b', 'to_be_added_placeholder',
         "Remove 'To be added' placeholder", []),
    ]
    buckets = {entry[0]: entry[3] for entry in categories}
    for paragraph in doc:
        for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
            for pattern_config in placeholder_patterns:
                bucket = buckets.get(pattern_config.pattern)
                if bucket is None:
                    continue  # registry pattern is not a categorized placeholder
                for match in re.compile(pattern_config.pattern, re.IGNORECASE).finditer(sentence):
                    bucket.append({
                        'placeholder': match.group().strip(),
                        'sentence': sentence.strip(),
                        'description': pattern_config.description
                    })
    issues = [
        {
            'issue_type': issue_type,
            'description': description,
            'occurrences': found
        }
        for _, issue_type, description, found in categories
        if found
    ]
    # Add summary information
    details = {
        'total_placeholders': sum(len(entry[3]) for entry in categories),
        'placeholder_types': {
            'TBD': len(categories[0][3]),
            'To be determined': len(categories[1][3]),
            'To be added': len(categories[2][3])
        }
    }
    return DocumentCheckResult(success=len(issues) == 0, issues=issues, details=details)
def run_all_checks(self, doc_path: str, doc_type: str, template_type: Optional[str] = None) -> Dict[str, DocumentCheckResult]:
    """
    Run all checks on the document.

    Args:
        doc_path (str): Path to the document.
        doc_type (str): Type of the document.
        template_type (str, optional): Template type, if applicable.

    Returns:
        Dict[str, DocumentCheckResult]: Dictionary of check names to results.
    """
    paragraphs = self.extract_paragraphs(doc_path)
    # Per-document-type flags from the configuration.
    doc_type_config = self.config_manager.config['document_types'].get(doc_type, {})
    skip_title_check = doc_type_config.get('skip_title_check', False)

    def _title_check() -> DocumentCheckResult:
        # The title check may be disabled per document type.
        if skip_title_check:
            return DocumentCheckResult(success=True, issues=[])
        return self.document_title_check(doc_path, doc_type)

    # Fixed order of checks for consistent reporting.
    check_sequence = [
        ('heading_title_check', lambda: self.heading_title_check(paragraphs, doc_type)),
        ('heading_title_period_check', lambda: self.heading_title_period_check(paragraphs, doc_type)),
        ('acronym_check', lambda: self.acronym_check(paragraphs)),
        ('terminology_check', lambda: self.check_terminology(paragraphs)),
        ('section_symbol_usage_check', lambda: self.check_section_symbol_usage(paragraphs)),
        ('caption_check_table', lambda: self.caption_check(paragraphs, doc_type, 'Table')),
        ('caption_check_figure', lambda: self.caption_check(paragraphs, doc_type, 'Figure')),
        ('table_figure_reference_check', lambda: self.table_figure_reference_check(paragraphs, doc_type)),
        ('document_title_check', _title_check),
        ('double_period_check', lambda: self.double_period_check(paragraphs)),
        ('spacing_check', lambda: self.spacing_check(paragraphs)),
        ('abbreviation_usage_check', lambda: self.check_abbreviation_usage(paragraphs)),
        ('date_formats_check', lambda: self.check_date_formats(paragraphs)),
        ('placeholders_check', lambda: self.check_placeholders(paragraphs))
    ]
    results: Dict[str, DocumentCheckResult] = {}
    for check_name, run_check in check_sequence:
        try:
            results[check_name] = run_check()
        except Exception as e:
            # A failing check must not abort the rest of the run.
            self.logger.error(f"Error running {check_name}: {str(e)}")
            results[check_name] = DocumentCheckResult(
                success=False,
                issues=[{'error': f'Check failed with error: {str(e)}'}]
            )
    return results
class DocumentCheckResultsFormatter: | |
"""Formats document check results in a user-friendly way with detailed examples and fixes.""" | |
def __init__(self):
    """Initialize the formatter with colorama for cross-platform color support.

    Builds `self.issue_categories`, a static catalog keyed by check name
    (the same names produced by the checker's run_all_checks) mapping to a
    display title, a description of the problem, the recommended fix, and
    a before/after example.
    """
    init()  # Initialize colorama
    # Enhanced issue categories with examples and specific fixes
    self.issue_categories = {
        'heading_title_check': {
            'title': 'Required Headings Check',
            'description': 'Document must contain all required headings for its type.',
            'solution': 'Add all required headings in the correct order',
            'example_fix': {
                'before': 'Missing required heading "PURPOSE."',
                'after': 'Added heading "PURPOSE." at the beginning of the document'
            }
        },
        'heading_title_period_check': {
            'title': 'Heading Period Format',
            'description': 'Headings must follow document type period requirements',
            'solution': 'Format heading periods according to document type requirements',
            'example_fix': {
                'before': 'Purpose',
                'after': 'Purpose.' # For ACs and Orders
            }
        },
        'table_figure_reference_check': {
            'title': 'Table and Figure References',
            'description': 'References to tables and figures must be properly capitalized',
            'solution': 'Capitalize references at start of sentences, use lowercase within sentences',
            'example_fix': {
                'before': 'Table 1 shows... The Table 2 indicates...',
                'after': 'Table 1 shows... The table 2 indicates...'
            }
        },
        'acronym_check': {
            'title': 'Acronym Definition Issues',
            'description': 'Acronyms must be defined at their first use in the document.',
            'solution': 'Define each acronym at its first use, e.g., "Federal Aviation Administration (FAA)"',
            'example_fix': {
                'before': 'The FAA published new guidelines.',
                'after': 'The Federal Aviation Administration (FAA) published new guidelines.'
            }
        },
        'terminology_check': {
            'title': 'Incorrect Terminology',
            'description': 'Non-standard or prohibited terms and phrases detected. Avoid relative position references.',
            'solution': 'Use explicit references to paragraphs, sections, tables, and figures',
            'example_fix': {
                'before': 'See above section for details | Refer to below table | shall comply with',
                'after': 'See paragraph 3.2 for details | Refer to table 2-1 | must comply with'
            }
        },
        'section_symbol_usage_check': {
            'title': 'Section Symbol (Β§) Format Issues',
            'description': 'Incorrect formatting of section symbols in references.',
            'solution': 'Format section symbols correctly and never start sentences with them',
            'example_fix': {
                'before': 'Β§ 25.25 states | 14 CFR Β§ 21.21',
                'after': 'Section 25.25 states | 14 CFR 21.21'
            }
        },
        'caption_check_table': {
            'title': 'Table Caption Format Issues',
            'description': 'Table captions do not follow the required format.',
            'solution': 'Use consistent table numbering format',
            'example_fix': {
                'before': 'Table 5. | Table A | Tables',
                'after': 'Table 5-1. | Table 1-1 | Table 2-1'
            }
        },
        'caption_check_figure': {
            'title': 'Figure Caption Format Issues',
            'description': 'Figure captions do not follow the required format.',
            'solution': 'Use consistent figure numbering format',
            'example_fix': {
                'before': 'Figure 5. | Figure A | Figures',
                'after': 'Figure 5-1. | Figure 1-1 | Figure 2-1'
            }
        },
        'document_title_check': {
            'title': 'Document Title Format Issues',
            'description': 'Document titles are not properly formatted.',
            'solution': 'Format titles according to document type requirements',
            'example_fix': {
                'before': '"AC 20-114" | "Advisory Circular"',
                'after': 'AC 20-114 | Advisory Circular'
            }
        },
        'double_period_check': {
            'title': 'Multiple Period Issues',
            'description': 'Sentences ending with multiple periods.',
            'solution': 'Remove multiple periods that end sentences',
            'example_fix': {
                'before': 'The following ACs are related to the guidance in this document..',
                'after': 'The following ACs are related to the guidance in this document.'
            }
        },
        'spacing_check': {
            'title': 'Spacing Issues',
            'description': 'Incorrect spacing in text.',
            'solution': 'Fix spacing issues: remove any missing spaces, double spaces, or inadvertent tabs.',
            'example_fix': {
                'before': 'AC25.25 | The following ACs  (double spaces)',
                'after': 'AC 25.25 | The following ACs (single space)'
            }
        },
        'date_formats_check': {
            'title': 'Date Format Issues',
            'description': 'Dates not in the required format.',
            'solution': 'Use the format "Month Day, Year"',
            'example_fix': {
                'before': '01/15/2024 | 2024-01-15 | 15 January 2024',
                'after': 'January 15, 2024'
            }
        },
        'placeholders_check': {
            'title': 'Placeholder Content',
            'description': 'Placeholder text remains in the document.',
            'solution': 'Replace all placeholder content with actual content',
            'example_fix': {
                'before': 'TBD | To be determined | [Insert text]',
                'after': 'Actual content specific to the context'
            }
        }
    }
# Add these two helper methods here, after __init__ and before other methods | |
def _format_colored_text(self, text: str, color: str) -> str:
    """Wrap text in the given color code and terminate with a style reset.

    Args:
        text: The text to be colored
        color: The color to apply (from colorama.Fore)

    Returns:
        str: The colored text with reset styling
    """
    return color + text + Style.RESET_ALL
def _format_example(self, example_fix: Dict[str, str]) -> List[str]:
    """Render a before/after fix example as two indented display lines.

    Args:
        example_fix: Dictionary containing 'before' and 'after' examples

    Returns:
        List[str]: Formatted example lines
    """
    before = example_fix['before']
    after = example_fix['after']
    return [
        f" β Incorrect: {before}",
        f" β Correct: {after}",
    ]
def _format_heading_issues(self, result: DocumentCheckResult, doc_type: str) -> List[str]:
    """Build display lines for missing/unexpected heading issues."""
    lines: List[str] = []
    for issue in result.issues:
        issue_type = issue.get('type')
        if issue_type == 'missing_headings':
            lines.append(f"\n Missing Required Headings for {doc_type}:")
            # Sort for a stable, scannable listing.
            lines.extend(f" β’ {heading}" for heading in sorted(issue['missing']))
        elif issue_type == 'unexpected_headings':
            lines.append(f"\n Unexpected Headings Found:")
            lines.extend(f" β’ {heading}" for heading in sorted(issue['unexpected']))
    return lines
def _format_period_issues(self, result: DocumentCheckResult) -> List[str]: | |
"""Format period check issues consistently.""" | |
output = [] | |
if result.issues: | |
output.append(f"\n Heading Period Format Issues:") | |
for issue in result.issues: | |
if 'message' in issue: | |
output.append(f" β’ {issue['message']}") | |
return output | |
def _format_reference_issues(self, result: DocumentCheckResult) -> List[str]: | |
"""Format reference issues consistently.""" | |
output = [] | |
for issue in result.issues: | |
if isinstance(issue, dict): | |
reference_text = f" β’ {issue['reference']} should be {issue['correct_form']}" | |
output.append(reference_text) | |
if 'sentence' in issue: | |
context = textwrap.fill( | |
issue['sentence'], | |
width=76, | |
initial_indent=' ', | |
subsequent_indent=' ' | |
) | |
output.append(f"{Fore.YELLOW}Context: {context}{Style.RESET_ALL}") | |
return output | |
def _format_caption_issues(self, result: DocumentCheckResult) -> List[str]: | |
"""Format caption issues consistently.""" | |
output = [] | |
for issue in result.issues: | |
if isinstance(issue, dict): | |
output.append(f" β’ {issue.get('incorrect_caption', '')} (correct format: {issue.get('correct_format', '')})") | |
return output | |
def _format_standard_issue(self, issue: Dict[str, Any]) -> str: | |
"""Format a standard issue consistently.""" | |
if isinstance(issue, dict): | |
# Handle issues with occurrences list | |
if 'occurrences' in issue: | |
# Format the first 3 occurrences | |
examples = issue['occurrences'][:3] | |
formatted_examples = [] | |
for example in examples: | |
if 'sentence' in example: | |
formatted_examples.append(example['sentence']) | |
elif isinstance(example, str): | |
formatted_examples.append(example) | |
description = issue.get('description', '') | |
return textwrap.fill( | |
f" β’ {description} - Examples: {'; '.join(formatted_examples)}", | |
width=76, | |
subsequent_indent=' ' | |
) | |
# Handle issues with direct sentence reference | |
elif 'sentence' in issue: | |
return textwrap.fill( | |
issue['sentence'], | |
width=76, | |
initial_indent=' β’ ', | |
subsequent_indent=' ' | |
) | |
# Handle issues with specific error messages | |
elif 'error' in issue: | |
return f" β’ Error: {issue['error']}" | |
# Handle issues with description and matches | |
elif all(k in issue for k in ['issue_type', 'description', 'matches']): | |
matches_str = '; '.join(str(m) for m in issue['matches'][:3]) | |
return textwrap.fill( | |
f" β’ {issue['description']} - Found: {matches_str}", | |
width=76, | |
subsequent_indent=' ' | |
) | |
# Handle terminology issues | |
elif 'incorrect_term' in issue and 'correct_term' in issue: | |
return textwrap.fill( | |
f" β’ '{issue['incorrect_term']}' should be '{issue['correct_term']}' in: {issue.get('sentence', '')}", | |
width=76, | |
subsequent_indent=' ' | |
) | |
# Handle placeholder issues | |
elif 'placeholder' in issue: | |
return textwrap.fill( | |
f" β’ Found placeholder '{issue['placeholder']}' in: {issue.get('sentence', '')}", | |
width=76, | |
subsequent_indent=' ' | |
) | |
# Handle other dictionary formats | |
else: | |
message_parts = [] | |
for k, v in issue.items(): | |
if k not in ['type', 'error']: | |
if isinstance(v, list): | |
if all(isinstance(item, dict) for item in v): | |
# Handle list of dictionaries | |
v_str = '; '.join(str(item.get('sentence', str(item))) for item in v[:3]) | |
else: | |
# Handle list of strings | |
v_str = ', '.join(str(item) for item in v[:3]) | |
message_parts.append(f"{k}: {v_str}") | |
else: | |
message_parts.append(f"{k}: {v}") | |
return f" β’ {'; '.join(message_parts)}" | |
return f" β’ {str(issue)}" | |
    def format_results(self, results: Dict[str, Any], doc_type: str) -> str:
        """
        Format check results into a detailed, user-friendly report.
        Args:
            results: Dictionary of check results
            doc_type: Type of document being checked
        Returns:
            str: Formatted report with consistent styling
        Note:
            Side effect: installs document-type-specific entries into
            ``self.issue_categories`` for the title and caption checks
            before rendering.
        """
        # Resolve the title-formatting rule group (italics vs. quotes vs.
        # none) whose "types" list contains this document type.
        format_group = None
        for group, rules in DOCUMENT_FORMATTING_RULES.items():
            if doc_type in rules["types"]:
                format_group = rules
                break
        # Use default if document type not found
        if not format_group:
            format_group = DOCUMENT_FORMATTING_RULES["no_formatting"]
        # Update the document title check category with global rules
        self.issue_categories['document_title_check'] = {
            'title': 'Referenced Document Title Format Issues',
            'description': format_group['description'],
            'solution': "Format referenced document titles as follows: " + (
                "Italicize the title" if format_group['italics'] else
                "Put the title in quotes" if format_group['quotes'] else
                "No special formatting required"
            ),
            'example_fix': {
                'before': 'See AC 20-135, Powerplant Installation for information on X.',
                'after': format_group['example']
            }
        }
        # Determine caption format based on document type: ACs and Orders use
        # chapter-relative "X-Y" numbering, everything else is sequential.
        if doc_type in ["Advisory Circular", "Order"]:
            table_format = {
                'title': 'Table Caption Format Issues',
                'description': 'Table captions in Advisory Circulars and Orders must use X-Y numbering format.',
                'solution': 'Use the format "Table X-Y" where X is the chapter number and Y is the sequence number',
                'example_fix': {
                    'before': 'Table 5. | Table A | Tables',
                    'after': 'Table 5-1. | Table 1-1 | Table 2-1'
                }
            }
            figure_format = {
                'title': 'Figure Caption Format Issues',
                'description': 'Figure captions in Advisory Circulars and Orders must use X-Y numbering format.',
                'solution': 'Use the format "Figure X-Y" where X is the chapter number and Y is the sequence number',
                'example_fix': {
                    'before': 'Figure 5. | Figure A | Figures',
                    'after': 'Figure 5-1. | Figure 1-1 | Figure 2-1'
                }
            }
        else:
            table_format = {
                'title': 'Table Caption Format Issues',
                'description': f'Table captions in {doc_type}s must use sequential numbering.',
                'solution': 'Use the format "Table X" where X is a sequential number',
                'example_fix': {
                    'before': 'Table A. | Tables | Table 1-1',
                    'after': 'Table 1 | Table 2 | Table 3'
                }
            }
            figure_format = {
                'title': 'Figure Caption Format Issues',
                'description': f'Figure captions in {doc_type}s must use sequential numbering.',
                'solution': 'Use the format "Figure X" where X is a sequential number',
                'example_fix': {
                    'before': 'Figure A. | Figures | Figure 1-1',
                    'after': 'Figure 1 | Figure 2 | Figure 3'
                }
            }
        # Update the issue categories with the correct format
        self.issue_categories['caption_check_table'] = table_format
        self.issue_categories['caption_check_figure'] = figure_format
        output = []
        # Header
        output.append(f"\n{Fore.CYAN}{'='*80}")
        output.append(f"Document Check Results Summary")
        output.append(f"{'='*80}{Style.RESET_ALL}\n")
        # Count total issues (this counts failing check CATEGORIES, not
        # individual findings within each category).
        total_issues = sum(1 for r in results.values() if not r.success)
        if total_issues == 0:
            output.append(f"{self._format_colored_text('β All checks passed successfully!', Fore.GREEN)}\n")
            return '\n'.join(output)
        output.append(f"{Fore.YELLOW}Found {total_issues} categories of issues that need attention:{Style.RESET_ALL}\n")
        # Process all check results consistently. Only checks registered in
        # self.issue_categories are rendered; unknown checks are skipped.
        for check_name, result in results.items():
            if not result.success and check_name in self.issue_categories:
                category = self.issue_categories[check_name]
                # Add extra line break before each category
                output.append("\n")
                # Category Header
                output.append(self._format_colored_text(f"β {category['title']}", Fore.YELLOW))
                output.append(f"  {category['description']}")
                output.append(f"  {self._format_colored_text('How to fix: ' + category['solution'], Fore.GREEN)}")
                # Example Fix
                output.append(f"\n  {self._format_colored_text('Example Fix:', Fore.CYAN)}")
                output.extend(self._format_example(category['example_fix']))
                output.append("")  # Add blank line after example
                # Actual Issues Found
                output.append(f"  {self._format_colored_text('Issues found in your document:', Fore.CYAN)}")
                # Dispatch to the specialized formatter for checks with
                # structured payloads; anything else goes through the generic
                # issue formatter, capped at three examples.
                if check_name == 'heading_title_check':
                    output.extend(self._format_heading_issues(result, doc_type))
                elif check_name == 'heading_title_period_check':
                    output.extend(self._format_period_issues(result))
                elif check_name == 'table_figure_reference_check':
                    output.extend(self._format_reference_issues(result))
                elif check_name in ['caption_check_table', 'caption_check_figure']:
                    output.extend(self._format_caption_issues(result))
                else:
                    # Standard issue formatting
                    formatted_issues = [self._format_standard_issue(issue) for issue in result.issues[:3]]
                    output.extend(formatted_issues)
                    if len(result.issues) > 3:
                        output.append(f"\n  ... and {len(result.issues) - 3} more similar issues.")
        # Summary and Next Steps
        output.append(f"\n{Fore.CYAN}{'='*80}")
        output.append("NEXT STEPS")
        output.append(f"{'='*80}{Style.RESET_ALL}")
        output.append("1. Review each issue category in order of importance:")
        output.append("  - Critical: Heading and terminology issues")
        output.append("  - Important: Acronym definitions and section references")
        output.append("  - Standard: Formatting and spacing issues")
        output.append("\n2. Make corrections using the provided examples as guides")
        output.append("3. Re-run the document check to verify all issues are resolved")
        output.append("\n4. Common tips:")
        output.append("  - Use search/replace for consistent fixes")
        output.append("  - Update your document template to prevent future issues")
        output.append("  - Keep the style manuals and Orders handy while making corrections")
        output.append(f"\n{Fore.CYAN}{'='*80}{Style.RESET_ALL}\n")
        return '\n'.join(output)
def save_report(self, results: Dict[str, Any], filepath: str, doc_type: str) -> None: | |
"""Save the formatted results to a file with proper formatting.""" | |
try: | |
with open(filepath, 'w', encoding='utf-8') as f: | |
# Create a report without color codes | |
report = self.format_results(results, doc_type) | |
# Strip color codes | |
for color in [Fore.CYAN, Fore.GREEN, Fore.YELLOW, Fore.RED, Style.RESET_ALL]: | |
report = report.replace(str(color), '') | |
# Convert markdown-style italics to alternative formatting for plain text | |
report = report.replace('*', '_') | |
f.write(report) | |
except Exception as e: | |
print(f"Error saving report: {e}") | |
def process_document(file_obj, doc_type: str, template_type: Optional[str] = None) -> str:
    """Process document and run all checks.
    Args:
        file_obj: Uploaded document as raw bytes or a binary file-like object.
        doc_type: Document type name (one of DOCUMENT_TYPES).
        template_type: Optional AC template variant; only meaningful for
            Advisory Circulars.
    Returns:
        str: Markdown-formatted check results, or a Markdown error message
        if processing failed.
    """
    try:
        checker = FAADocumentChecker()
        # Gradio may hand us raw bytes; wrap them in a file-like object
        # so python-docx can read them.
        if isinstance(file_obj, bytes):
            file_obj = io.BytesIO(file_obj)
        results = checker.run_all_checks(file_obj, doc_type, template_type)
        return format_markdown_results(results, doc_type)
    except Exception as e:
        # logging.exception records the full traceback through the logging
        # system (replaces logging.error + traceback.print_exc).
        logging.exception(f"Error processing document: {str(e)}")
        return f"""
# β Error Processing Document
**Error Details:** {str(e)}
Please ensure:
1. The file is a valid .docx document
2. The file is not corrupted or password protected
3. The file is properly formatted
Try again after checking these issues. If the problem persists, contact support.
"""
def format_markdown_results(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
    """Format check results into a Markdown string for Gradio display.
    Args:
        results: Mapping of check name to its DocumentCheckResult.
        doc_type: Document type name shown in the report header.
    Returns:
        str: Markdown report with failing checks ordered by fix priority.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lines = [
        f"# Document Check Results - {timestamp}",
        f"## Document Type: {doc_type}",
        "---\n"
    ]
    failed_count = sum(1 for r in results.values() if not r.success)
    if failed_count == 0:
        lines.append("β **All checks passed successfully!**\n")
        return "\n".join(lines)
    lines.append(f"β Found issues in {failed_count} check categories\n")
    # Display metadata per known check: report heading plus fix priority
    # (1 = most critical). Unknown checks fall to the end via the 999 default.
    check_categories = {
        'heading_title_check': {'title': 'π Required Headings', 'priority': 1},
        'heading_title_period_check': {'title': 'π Heading Period Usage', 'priority': 1},
        'acronym_check': {'title': 'π Acronym Definitions', 'priority': 2},
        'terminology_check': {'title': 'π Terminology Usage', 'priority': 2},
        'section_symbol_usage_check': {'title': 'Β§ Section Symbol Usage', 'priority': 2},
        'caption_check_table': {'title': 'π Table Captions', 'priority': 3},
        'caption_check_figure': {'title': 'πΌοΈ Figure Captions', 'priority': 3},
        'table_figure_reference_check': {'title': 'π Table/Figure References', 'priority': 3},
        'document_title_check': {'title': 'π Document Title Format', 'priority': 1},
        'double_period_check': {'title': 'β‘ Double Periods', 'priority': 4},
        'spacing_check': {'title': 'β¨οΈ Spacing Issues', 'priority': 4},
        'abbreviation_usage_check': {'title': 'π Abbreviation Usage', 'priority': 3},
        'date_formats_check': {'title': 'π Date Formats', 'priority': 3},
        'placeholders_check': {'title': 'π© Placeholder Content', 'priority': 1}
    }
    # Stable sort keeps the original relative order within equal priorities.
    ordered = sorted(
        results.items(),
        key=lambda item: check_categories.get(item[0], {'priority': 999})['priority']
    )
    for check_name, result in ordered:
        if result.success:
            continue
        meta = check_categories.get(check_name, {'title': check_name.replace('_', ' ').title()})
        lines.append(f"### {meta['title']}")
        if isinstance(result.issues, list):
            # Show at most five issues per category; dict issues are rendered
            # one bullet per key (list values get one bullet per element).
            for issue in result.issues[:5]:
                if isinstance(issue, dict):
                    for key, value in issue.items():
                        if isinstance(value, list):
                            lines.extend(f"- {item}" for item in value)
                        else:
                            lines.append(f"- {key}: {value}")
                else:
                    lines.append(f"- {issue}")
            if len(result.issues) > 5:
                lines.append(f"\n*...and {len(result.issues) - 5} more similar issues*")
        lines.append("")
    lines.extend([
        "## π Summary and Recommendations",
        "",
        "### Priority Order for Fixes:",
        "1. π΄ Critical: Heading formats, required content, and document structure",
        "2. π‘ Important: Terminology, acronyms, and references",
        "3. π’ Standard: Formatting, spacing, and style consistency",
        "",
        "### Next Steps:",
        "1. Address issues in priority order",
        "2. Use search/replace for consistent fixes",
        "3. Re-run checker after making changes",
        "4. Update your document template if needed",
        ""
    ])
    return "\n".join(lines)
def create_interface():
    """Create and configure the Gradio interface.
    Builds the upload form, document-type selectors, and results pane, and
    wires up the event handlers.
    Returns:
        gr.Blocks: The configured (unlaunched) Gradio application.
    """
    # Reuse the module-level constants instead of re-declaring the lists here,
    # so the UI choices cannot drift from the types the checker supports.
    document_types = DOCUMENT_TYPES
    template_types = TEMPLATE_TYPES
    def format_results_as_html(text_results):
        """Convert the plain-text formatter output into styled HTML.
        Parses the report produced by DocumentCheckResultsFormatter by
        splitting on its section markers; brittle by design, so keep the
        marker strings in sync with _format_example/format_results.
        """
        if not text_results:
            return """
            <div class="p-4 text-gray-600">
                Results will appear here after processing...
            </div>
            """
        # Split into sections
        sections = text_results.split('β ')
        header = sections[0]
        issues = sections[1:]
        # Format header
        header_html = f"""
        <div class="max-w-4xl mx-auto p-4 bg-white rounded-lg shadow-sm mb-6">
            <h1 class="text-2xl font-bold text-gray-800 mb-4">Document Check Results Summary</h1>
            <div class="text-lg {'text-green-600' if 'All checks passed' in header else 'text-amber-600'}">
                {header.strip()}
            </div>
        </div>
        """
        # Format each issue section
        issues_html = ""
        for section in issues:
            if not section.strip():
                continue
            lines = section.strip().split('\n')
            title = lines[0]
            content = '\n'.join(lines[1:])
            # Split content into description, how to fix, and examples
            parts = content.split('Example Fix:')
            description = parts[0]
            examples = parts[1] if len(parts) > 1 else ""
            issues_html += f"""
            <div class="bg-white rounded-lg shadow-sm mb-6 overflow-hidden">
                <div class="bg-gray-50 px-6 py-4 border-b">
                    <h2 class="text-lg font-semibold text-gray-800">{title.strip()}</h2>
                </div>
                <div class="px-6 py-4">
                    <div class="text-gray-600 mb-4">
                        {description.strip()}
                    </div>
                    <div class="bg-green-50 rounded p-4 mb-4">
                        <div class="text-green-800">
                            <span class="font-medium">How to fix:</span>
                            {description.split('How to fix:')[1].strip() if 'How to fix:' in description else ''}
                        </div>
                    </div>
            """
            if examples:
                examples_lines = examples.strip().split('\n')
                issues_html += """
                <div class="mb-4">
                    <h3 class="font-medium text-gray-800 mb-2">Examples:</h3>
                    <div class="space-y-2 ml-4">
                """
                for line in examples_lines:
                    line = line.strip()
                    if line.startswith('β'):
                        issues_html += f"""
                        <div class="text-red-600">
                            <span class="inline-block w-4">β</span>
                            {line.replace('β Incorrect:', '').strip()}
                        </div>
                        """
                    elif line.startswith('β'):
                        issues_html += f"""
                        <div class="text-green-600">
                            <span class="inline-block w-4">β</span>
                            {line.replace('β Correct:', '').strip()}
                        </div>
                        """
                    elif line.startswith('β’'):
                        issues_html += f"""
                        <div class="text-gray-600 ml-4">
                            β’ {line.replace('β’', '').strip()}
                        </div>
                        """
                    elif 'more similar issues' in line:
                        issues_html += f"""
                        <div class="text-gray-500 italic mt-2">
                            {line.strip()}
                        </div>
                        """
                issues_html += """
                    </div>
                </div>
                """
            issues_html += """
                </div>
            </div>
            """
        # Combine all HTML (the inline <style> mimics the Tailwind utility
        # classes used above, since Gradio HTML has no external stylesheet).
        full_html = f"""
        <div class="mx-auto p-4" style="font-family: system-ui, -apple-system, sans-serif;">
            <style>
                .text-2xl {{ font-size: 1.5rem; }}
                .text-lg {{ font-size: 1.125rem; }}
                .font-bold {{ font-weight: 700; }}
                .font-semibold {{ font-weight: 600; }}
                .font-medium {{ font-weight: 500; }}
                .text-gray-800 {{ color: #1f2937; }}
                .text-gray-600 {{ color: #4b5563; }}
                .text-gray-500 {{ color: #6b7280; }}
                .text-green-600 {{ color: #059669; }}
                .text-green-800 {{ color: #065f46; }}
                .text-red-600 {{ color: #dc2626; }}
                .text-amber-600 {{ color: #d97706; }}
                .bg-white {{ background-color: #ffffff; }}
                .bg-gray-50 {{ background-color: #f9fafb; }}
                .bg-green-50 {{ background-color: #ecfdf5; }}
                .rounded-lg {{ border-radius: 0.5rem; }}
                .shadow-sm {{ box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05); }}
                .mb-6 {{ margin-bottom: 1.5rem; }}
                .mb-4 {{ margin-bottom: 1rem; }}
                .mb-2 {{ margin-bottom: 0.5rem; }}
                .ml-4 {{ margin-left: 1rem; }}
                .mt-2 {{ margin-top: 0.5rem; }}
                .p-4 {{ padding: 1rem; }}
                .px-6 {{ padding-left: 1.5rem; padding-right: 1.5rem; }}
                .py-4 {{ padding-top: 1rem; padding-bottom: 1rem; }}
                .space-y-2 > * + * {{ margin-top: 0.5rem; }}
                .italic {{ font-style: italic; }}
                .border-b {{ border-bottom: 1px solid #e5e7eb; }}
                .overflow-hidden {{ overflow: hidden; }}
            </style>
            {header_html}
            {issues_html}
        </div>
        """
        return full_html
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # π Document Checker Tool
            ### Purpose
            This tool checks Word documents for compliance with U.S. federal documentation standards.
            ### How to Use
            1. Upload your Word document (.docx format)
            2. Select the document type
            3. Click "Check Document"
            > **Note:** Please ensure your document is clean (no track changes or comments)
            """
        )
        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="π Upload Word Document (.docx)",
                    file_types=[".docx"],
                    type="binary"
                )
                doc_type = gr.Dropdown(
                    choices=document_types,
                    label="π Document Type",
                    value="Advisory Circular",
                    info="Select the type of document you're checking"
                )
                template_type = gr.Radio(
                    choices=template_types,
                    label="π Template Type",
                    visible=False,
                    info="Only applicable for Advisory Circulars"
                )
                submit_btn = gr.Button(
                    "π Check Document",
                    variant="primary"
                )
            with gr.Column(scale=2):
                results = gr.HTML()
        def process_and_format(file_obj, doc_type, template_type):
            """Process document and format results as HTML."""
            try:
                # Get text results from original process_document function
                checker = FAADocumentChecker()
                if isinstance(file_obj, bytes):
                    file_obj = io.BytesIO(file_obj)
                results = checker.run_all_checks(file_obj, doc_type, template_type)
                # Format results using DocumentCheckResultsFormatter
                formatter = DocumentCheckResultsFormatter()
                text_results = formatter.format_results(results, doc_type)
                # Convert to HTML
                return format_results_as_html(text_results)
            except Exception as e:
                # logging.exception records the traceback via the logging system.
                logging.exception(f"Error processing document: {str(e)}")
                return f"""
                <div style="color: red; padding: 1rem;">
                    β Error processing document: {str(e)}
                    <br><br>
                    Please ensure the file is a valid .docx document and try again.
                </div>
                """
        # Update template type visibility based on document type
        def update_template_visibility(doc_type):
            # The AC template choice is only meaningful for Advisory Circulars.
            return gr.update(visible=doc_type == "Advisory Circular")
        doc_type.change(
            fn=update_template_visibility,
            inputs=[doc_type],
            outputs=[template_type]
        )
        # Handle document processing
        submit_btn.click(
            fn=process_and_format,
            inputs=[file_input, doc_type, template_type],
            outputs=[results]
        )
        gr.Markdown(
            """
            ### π Important Notes
            - This tool is in development; you may encounter false positives
            - For questions or feedback, contact Eric Putnam
            - Results are not stored or saved
            """
        )
    return demo
# Initialize and launch the interface
if __name__ == "__main__":
    # Configure root logging from the module-level defaults so the level and
    # format stay consistent with the rest of the application.
    logging.basicConfig(
        level=DEFAULT_LOG_LEVEL,
        format=DEFAULT_LOG_FORMAT
    )
    # Create and launch the interface
    demo = create_interface()
    demo.launch(
        share=False,  # Set to True if you want to generate a public link
        server_name=DEFAULT_HOST,  # Allows external access
        server_port=DEFAULT_PORT,  # Default Gradio port
        debug=True
    )