import gradio as gr
import logging
import re
import json
import time
from typing import Dict, List, Any, Tuple, Optional
from dataclasses import dataclass
from functools import wraps
from docx import Document
import io
import os
import traceback
@dataclass
class DocumentCheckResult:
    """Structured result for document checks."""
    success: bool
    issues: List[Dict[str, Any]]
    details: Optional[Dict[str, Any]] = None
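# Illustrative sketch (not part of the original app): how a check result is
# typically constructed and consumed by a caller. The field values are made up.
#
#     result = DocumentCheckResult(success=False,
#                                   issues=[{'error': 'Missing heading'}])
#     if not result.success:
#         for issue in result.issues:
#             print(issue)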
def profile_performance(func):
    """Decorator to profile function performance."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        # Get logger from the class instance (first argument), if one exists
        logger = args[0].logger if args and hasattr(args[0], 'logger') else logging.getLogger(__name__)
        logger.info(
            f"Performance: {func.__name__} took {end_time - start_time:.4f} seconds"
        )
        return result
    return wrapper
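# Illustrative sketch (assumed usage, not from the original source): the
# decorator wraps checker methods so each call's runtime is logged through the
# instance's logger. ExampleChecker and slow_check are hypothetical names.
#
#     class ExampleChecker:
#         def __init__(self):
#             self.logger = logging.getLogger(__name__)
#
#         @profile_performance
#         def slow_check(self, doc):
#             time.sleep(0.1)
#             return DocumentCheckResult(success=True, issues=[])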
class DocumentCheckerConfig:
    """Configuration management for document checks."""

    def __init__(self, config_path: Optional[str] = None):
        """Initialize configuration with optional config file."""
        self.config = self._load_config(config_path)
        self.logger = self._setup_logger()

    def _load_config(self, config_path: Optional[str] = None) -> Dict[str, Any]:
        """Load configuration from JSON file or use default settings."""
        default_config = {
            "logging": {
                "level": "INFO",
                "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            },
            "checks": {
                "acronyms": True,
                "terminology_check": True,
                "headings": True
            },
            "document_types": {
                "Advisory Circular": {
                    "required_headings": [
                        "Purpose.",
                        "Applicability.",
                        "Cancellation.",
                        "Related Material.",
                        "Definition of Key Terms."
                    ],
                    "skip_title_check": False
                },
                "Federal Register Notice": {
                    "required_headings": [
                        "Purpose of This Notice",
                        "Audience",
                        "Where can I Find This Notice"
                    ],
                    "skip_title_check": False
                },
                "Order": {
                    "required_headings": [
                        "Purpose of This Order.",
                        "Audience.",
                        "Where to Find This Order."
                    ],
                    "skip_title_check": False
                },
                "Policy Statement": {
                    "required_headings": [
                        "SUMMARY",
                        "CURRENT REGULATORY AND ADVISORY MATERIAL",
                        "RELEVANT PAST PRACTICE",
                        "POLICY",
                        "EFFECT OF POLICY",
                        "CONCLUSION"
                    ],
                    "skip_title_check": False
                },
                "Technical Standard Order": {
                    "required_headings": [
                        "PURPOSE.",
                        "APPLICABILITY.",
                        "REQUIREMENTS.",
                        "MARKING.",
                        "APPLICATION DATA REQUIREMENTS.",
                        "MANUFACTURER DATA REQUIREMENTS.",
                        "FURNISHED DATA REQUIREMENTS.",
                        "HOW TO GET REFERENCED DOCUMENTS."
                    ],
                    "skip_title_check": False
                },
                "Other": {
                    "required_headings": [],
                    "skip_title_check": True
                }
            }
        }
        if config_path and os.path.exists(config_path):
            try:
                with open(config_path, 'r') as f:
                    user_config = json.load(f)
                self._deep_merge(default_config, user_config)
            except (json.JSONDecodeError, IOError) as e:
                logging.warning(f"Error loading config: {e}. Using default config.")
        return default_config
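    # Illustrative sketch (hypothetical file, not shipped with the app): a JSON
    # config passed via config_path only needs the keys it overrides, because
    # _deep_merge folds it into the defaults above, for example:
    #
    #     {
    #       "logging": {"level": "DEBUG"}
    #     }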
    def _deep_merge(self, base: Dict[str, Any], update: Dict[str, Any]) -> Dict[str, Any]:
        """Recursively merge two dictionaries."""
        for key, value in update.items():
            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
                self._deep_merge(base[key], value)
            else:
                base[key] = value
        return base
    def _setup_logger(self) -> logging.Logger:
        """Set up and configure logging based on configuration."""
        logger = logging.getLogger(__name__)
        log_level = getattr(logging, self.config['logging']['level'].upper())
        formatter = logging.Formatter(self.config['logging']['format'])
        # Avoid attaching duplicate handlers when the checker is instantiated more than once
        if not logger.handlers:
            console_handler = logging.StreamHandler()
            console_handler.setFormatter(formatter)
            console_handler.setLevel(log_level)
            logger.addHandler(console_handler)
        logger.setLevel(log_level)
        return logger
class DocumentChecker:
    """Base class for document checking."""

    def __init__(self, config_path: Optional[str] = None):
        self.config_manager = DocumentCheckerConfig(config_path)
        self.logger = self.config_manager.logger

    @staticmethod
    def validate_input(doc: List[str]) -> bool:
        """Validate input document."""
        return doc is not None and isinstance(doc, list) and len(doc) > 0

    @classmethod
    def extract_paragraphs(cls, doc_path: str) -> List[str]:
        """Extract plain text paragraphs from a document."""
        try:
            doc = Document(doc_path)
            return [para.text for para in doc.paragraphs if para.text.strip()]
        except Exception as e:
            logging.error(f"Error extracting paragraphs: {e}")
            return []
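# Illustrative sketch (hypothetical path): extracting paragraphs from a .docx
# file before running the text-based checks.
#
#     checker = DocumentChecker()
#     paragraphs = DocumentChecker.extract_paragraphs("draft_ac.docx")
#     if checker.validate_input(paragraphs):
#         print(f"{len(paragraphs)} non-empty paragraphs extracted")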
class FAADocumentChecker(DocumentChecker):
    def __init__(self, config_path: Optional[str] = None):
        super().__init__(config_path)

    # Apply the profiling decorator directly to individual checks
    @profile_performance
    def heading_title_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
        """Check headings for a specific document type."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        # Use configuration for document-specific headings
        checks = self.config_manager.config['document_types'].get(
            doc_type, {}
        )
        required_headings = checks.get('required_headings', [])
        headings_found = []
        # Create a set for faster lookup
        required_headings_set = set(required_headings)
        for para in doc:
            para_strip = para.strip()
            # Check if the paragraph is in the required headings list
            if para_strip in required_headings_set:
                headings_found.append(para_strip)
        # Check if all required headings are found
        all_headings_present = set(headings_found) == required_headings_set
        issues = []
        if not all_headings_present:
            missing_headings = required_headings_set - set(headings_found)
            issues.append({'missing_headings': list(missing_headings)})
        return DocumentCheckResult(
            success=all_headings_present,
            issues=issues,
            details={
                'found_headings': headings_found,
                'required_headings': required_headings
            }
        )
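    # Illustrative sketch (made-up paragraphs, assuming checker = FAADocumentChecker()):
    # any configured Advisory Circular heading that never appears verbatim is
    # reported as missing.
    #
    #     doc = ["Purpose.", "Applicability.", "This AC describes one acceptable means."]
    #     result = checker.heading_title_check(doc, "Advisory Circular")
    #     # result.success is False; result.issues holds a single dict whose
    #     # 'missing_headings' list names the three headings not found.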
    def heading_title_period_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
        """
        Check if headings end with periods according to document type requirements.

        Args:
            doc (List[str]): List of document paragraphs
            doc_type (str): Type of document being checked

        Returns:
            DocumentCheckResult: Result of the heading period check
        """
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        # Define document types requiring periods in headings
        period_required = {
            "Advisory Circular": True,
            "Airworthiness Criteria": False,
            "Deviation Memo": False,
            "Exemption": False,
            "Federal Register Notice": False,
            "Order": True,
            "Policy Statement": False,
            "Rule": False,
            "Special Condition": False,
            "Technical Standard Order": True,
            "Other": False
        }
        # Get whether periods are required for this document type
        should_have_period = period_required.get(doc_type, False)
        # Get the headings configuration for this document type
        checks = self.config_manager.config['document_types'].get(doc_type, {})
        required_headings = checks.get('required_headings', [])
        required_headings_set = set(required_headings)
        issues = []
        checked_headings = []
        for para in doc:
            para_strip = para.strip()
            # Check only if paragraph is a heading
            if para_strip in required_headings_set:
                ends_with_period = para_strip.endswith('.')
                if should_have_period and not ends_with_period:
                    issues.append({
                        'heading': para_strip,
                        'issue': 'missing_period',
                        'message': f"Heading should end with a period: '{para_strip}'"
                    })
                    checked_headings.append({
                        'heading': para_strip,
                        'has_period': False,
                        'needs_period': True
                    })
                elif not should_have_period and ends_with_period:
                    issues.append({
                        'heading': para_strip,
                        'issue': 'unexpected_period',
                        'message': f"Heading should not end with a period: '{para_strip}'"
                    })
                    checked_headings.append({
                        'heading': para_strip,
                        'has_period': True,
                        'needs_period': False
                    })
                else:
                    checked_headings.append({
                        'heading': para_strip,
                        'has_period': ends_with_period,
                        'needs_period': should_have_period
                    })
        success = len(issues) == 0
        return DocumentCheckResult(
            success=success,
            issues=issues,
            details={
                'document_type': doc_type,
                'periods_required': should_have_period,
                'checked_headings': checked_headings
            }
        )
    def acronym_check(self, doc: List[str]) -> DocumentCheckResult:
        """Check if acronyms are defined at their first use, only flagging the first instance of undefined acronyms."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        defined_acronyms = set()
        first_occurrences = {}  # Track first occurrence of each acronym
        undefined_acronyms = []
        acronym_pattern = re.compile(r'\b[A-Z]{2,}\b')
        defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
        # Predefined acronyms
        defined_acronyms.add("14 CFR")
        for paragraph in doc:
            # Check for definitions first
            defined_matches = defined_pattern.findall(paragraph)
            for full_term, acronym in defined_matches:
                defined_acronyms.add(acronym)
                # If this was previously marked as undefined, remove it since we found its definition
                if acronym in first_occurrences:
                    del first_occurrences[acronym]
            # Check for acronyms in the paragraph
            usage_matches = acronym_pattern.findall(paragraph)
            for acronym in usage_matches:
                if acronym not in defined_acronyms:
                    # Only process if we haven't seen this acronym before
                    if acronym not in first_occurrences:
                        # Find the sentence containing the first undefined acronym
                        sentences = re.split(r'(?<=[.!?])\s+', paragraph)
                        for sentence in sentences:
                            if acronym in sentence:
                                first_occurrences[acronym] = {
                                    'acronym': acronym,
                                    'sentence': sentence.strip()
                                }
                                break
        # Convert first occurrences to list of issues
        undefined_acronyms = list(first_occurrences.values())
        success = len(undefined_acronyms) == 0
        issues = undefined_acronyms if not success else []
        return DocumentCheckResult(success=success, issues=issues)
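    # Illustrative sketch (made-up text, assuming checker = FAADocumentChecker()):
    # "Federal Aviation Administration (FAA)" defines FAA, so later bare uses of
    # FAA are not flagged, while an acronym such as TSO used without a definition
    # is reported once with the first sentence that contains it.
    #
    #     doc = ["The Federal Aviation Administration (FAA) issued guidance.",
    #            "The TSO applies to all FAA projects."]
    #     result = checker.acronym_check(doc)
    #     # result.issues -> [{'acronym': 'TSO',
    #     #                    'sentence': 'The TSO applies to all FAA projects.'}]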
    def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
        """
        Check document terminology for:
        1. Legal reference formatting and preferred terms
        2. Prohibited phrases and constructions
        """
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        # Dictionary of terms that should be replaced with preferred alternatives
        term_replacements = {
            r'\bUSC\b': 'U.S.C.',
            r'\bCFR Part\b': 'CFR part',
            r'\bC\.F\.R\.\b': 'CFR',
            r'\b14 CFR\s*§': '14 CFR',
            r'\bWe\b': 'The FAA',
            r'\bwe\b': 'the FAA',
            r'\bcancelled\b': 'canceled',
            r'\bshall\b': 'must',
            r'\b\&\b': 'and',
            r'\bflight crew\b': 'flightcrew'
        }
        # Prohibited phrases that should be flagged
        prohibited_phrases = [
            r'\babove\b',
            r'\bbelow\b',
            r'(?:^|(?<=[.!?]\s))There\s+(?:is|are)\b'  # Matches 'There is/are' at start of sentences
        ]
        issues = []
        for paragraph in doc:
            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
            for sentence in sentences:
                # Check for incorrect terms that need replacement
                for incorrect_pattern, correct_term in term_replacements.items():
                    matches = re.finditer(incorrect_pattern, sentence)
                    for match in matches:
                        incorrect_term = match.group()
                        issues.append({
                            'type': 'incorrect_term',
                            'incorrect_term': incorrect_term,
                            'correct_term': correct_term,
                            'sentence': sentence.strip()
                        })
                # Check for prohibited phrases
                for phrase_pattern in prohibited_phrases:
                    match = re.search(phrase_pattern, sentence, re.IGNORECASE)
                    if match:
                        issues.append({
                            'type': 'prohibited_phrase',
                            'phrase': match.group().strip(),
                            'sentence': sentence.strip()
                        })
        success = len(issues) == 0
        return DocumentCheckResult(success=success, issues=issues)
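    # Illustrative sketch (made-up sentence): "We shall comply with 14 CFR part 25."
    # would be flagged twice, once for "We" (preferred: "The FAA") and once for
    # "shall" (preferred: "must").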
    def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
        """Check for various section symbol (§) usage issues."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        issues = []
        # Patterns to identify issues
        sentences_starting_with_section_symbol = []
        incorrect_14_CFR_section_symbol_usage = []
        single_section_symbol_multiple_sections = []
        missing_section_symbol_in_multiple_sections = []
        # Pattern to find '14 CFR §25.25'
        pattern_14_CFR_section = re.compile(r'\b14 CFR §\s*\d+\.\d+\b')
        # Patterns for multiple sections with single '§'
        pattern_single_section_symbol_and = re.compile(r'§\s*\d+\.\d+\s+and\s+\d+\.\d+')
        pattern_single_section_symbol_or = re.compile(r'§\s*\d+\.\d+\s+or\s+\d+\.\d+')
        pattern_single_section_symbol_through = re.compile(r'§\s*\d+\.\d+\s+through\s+\d+\.\d+')
        # Pattern for missing '§' before subsequent sections with 'or'
        pattern_missing_section_symbol_or = re.compile(r'§\s*\d+\.\d+\s+or\s+§?\s*\d+\.\d+')
        for paragraph in doc:
            # Check for sentences starting with '§'
            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
            for sentence in sentences:
                if sentence.strip().startswith('§'):
                    sentences_starting_with_section_symbol.append(sentence.strip())
            # Check for '14 CFR §25.25' usage
            matches_14_CFR = pattern_14_CFR_section.findall(paragraph)
            for match in matches_14_CFR:
                incorrect_14_CFR_section_symbol_usage.append(match)
            # Check for single '§' with multiple sections using 'and'
            matches_and = pattern_single_section_symbol_and.findall(paragraph)
            for match in matches_and:
                single_section_symbol_multiple_sections.append(match)
            # Check for single '§' with multiple sections using 'or'
            matches_or = pattern_single_section_symbol_or.findall(paragraph)
            for match in matches_or:
                single_section_symbol_multiple_sections.append(match)
            # Check for single '§' with multiple sections using 'through'
            matches_through = pattern_single_section_symbol_through.findall(paragraph)
            for match in matches_through:
                single_section_symbol_multiple_sections.append(match)
            # Check for missing '§' before subsequent sections with 'or'
            matches_missing_or = pattern_missing_section_symbol_or.findall(paragraph)
            for match in matches_missing_or:
                missing_section_symbol_in_multiple_sections.append(match)
        if sentences_starting_with_section_symbol:
            issues.append({
                'issue': 'sentences_starting_with_section_symbol',
                'sentences': sentences_starting_with_section_symbol
            })
        if incorrect_14_CFR_section_symbol_usage:
            issues.append({
                'issue': 'incorrect_14_CFR_section_symbol_usage',
                'matches': incorrect_14_CFR_section_symbol_usage
            })
        if single_section_symbol_multiple_sections:
            issues.append({
                'issue': 'single_section_symbol_multiple_sections',
                'matches': single_section_symbol_multiple_sections
            })
        if missing_section_symbol_in_multiple_sections:
            issues.append({
                'issue': 'missing_section_symbol_in_multiple_sections',
                'matches': missing_section_symbol_in_multiple_sections
            })
        success = len(issues) == 0
        return DocumentCheckResult(success=success, issues=issues)
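    # Illustrative sketch (made-up references): a phrase like "14 CFR § 25.853"
    # is reported as incorrect 14 CFR section-symbol usage, and "§ 25.853 and
    # 25.855" is reported because a single section symbol precedes multiple
    # section numbers.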
    def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult:
        """Check for correctly formatted captions (Table or Figure)."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        # Determine the caption pattern based on document type
        if doc_type in ["Advisory Circular", "Order"]:
            caption_pattern = re.compile(rf'^{caption_type}\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]', re.IGNORECASE)
            correct_format = f"{caption_type} X-Y"
        else:
            caption_pattern = re.compile(rf'^{caption_type}\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE)
            correct_format = f"{caption_type} X"
        incorrect_captions = []
        in_toc = False
        for paragraph in doc:
            # Check for start or end of Table of Contents (TOC)
            if "Table of Contents" in paragraph or "Contents" in paragraph:
                in_toc = True
                continue
            elif in_toc and paragraph.strip() == "":
                in_toc = False  # Assume blank line marks the end of TOC
            # If within TOC, skip this paragraph
            if in_toc:
                continue
            # Only check paragraphs that start with "Table" or "Figure" for proper caption format
            paragraph_strip = paragraph.strip()
            if paragraph_strip.lower().startswith(caption_type.lower()):
                if not caption_pattern.match(paragraph_strip):
                    incorrect_captions.append({
                        'incorrect_caption': paragraph_strip,
                        'correct_format': correct_format
                    })
        success = len(incorrect_captions) == 0
        return DocumentCheckResult(success=success, issues=incorrect_captions)
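    # Illustrative sketch (made-up captions): in an Advisory Circular or Order,
    # "Table 3-1. Wire Sizes" matches the expected "Table X-Y" pattern, while a
    # caption written as "Table 3. Wire Sizes" would be reported with the
    # suggested "Table X-Y" format.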
    def table_figure_reference_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
        """
        Check for incorrect references to tables and figures in the document.
        References should be lowercase within sentences and capitalized at sentence start.
        """
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        incorrect_references = []
        # Define patterns based on document type
        if doc_type in ["Advisory Circular", "Order"]:
            # Matches both capitalized and lowercase variations
            table_pattern = r'\b[Tt]able\s+\d+-\d+\b'
            figure_pattern = r'\b[Ff]igure\s+\d+-\d+\b'
            correct_mid_table_format = "table X-Y"
            correct_start_table_format = "Table X-Y"
            correct_mid_figure_format = "figure X-Y"
            correct_start_figure_format = "Figure X-Y"
        else:
            table_pattern = r'\b[Tt]able\s+\d+\b'
            figure_pattern = r'\b[Ff]igure\s+\d+\b'
            correct_mid_table_format = "table X"
            correct_start_table_format = "Table X"
            correct_mid_figure_format = "figure X"
            correct_start_figure_format = "Figure X"
        table_ref_pattern = re.compile(table_pattern)
        figure_ref_pattern = re.compile(figure_pattern)
        for paragraph in doc:
            paragraph_strip = paragraph.strip()
            # Exclude captions
            starts_with_table_or_figure = paragraph_strip.lower().startswith('table') or paragraph_strip.lower().startswith('figure')
            if not starts_with_table_or_figure:
                # Split into sentences while preserving the original text
                sentences = re.split(r'(?<=[.!?])\s+', paragraph)
                for sentence in sentences:
                    sentence = sentence.strip()
                    # Check table references
                    matches = table_ref_pattern.finditer(sentence)
                    for match in matches:
                        ref = match.group()
                        # Get the text before the reference
                        text_before = sentence[:match.start()].strip()
                        # Determine if reference is at start of sentence
                        is_sentence_start = text_before == ""
                        # Check if capitalization is correct
                        if is_sentence_start and not ref.startswith('Table'):
                            incorrect_references.append({
                                'incorrect_ref': ref,
                                'correct_format': correct_start_table_format,
                                'sentence': sentence,
                                'issue': "Table reference at sentence start should be capitalized"
                            })
                        elif not is_sentence_start and not ref.startswith('table'):
                            incorrect_references.append({
                                'incorrect_ref': ref,
                                'correct_format': correct_mid_table_format,
                                'sentence': sentence,
                                'issue': "Table reference within sentence should be lowercase"
                            })
                    # Check figure references
                    matches = figure_ref_pattern.finditer(sentence)
                    for match in matches:
                        ref = match.group()
                        # Get the text before the reference
                        text_before = sentence[:match.start()].strip()
                        # Determine if reference is at start of sentence
                        is_sentence_start = text_before == ""
                        # Check if capitalization is correct
                        if is_sentence_start and not ref.startswith('Figure'):
                            incorrect_references.append({
                                'incorrect_ref': ref,
                                'correct_format': correct_start_figure_format,
                                'sentence': sentence,
                                'issue': "Figure reference at sentence start should be capitalized"
                            })
                        elif not is_sentence_start and not ref.startswith('figure'):
                            incorrect_references.append({
                                'incorrect_ref': ref,
                                'correct_format': correct_mid_figure_format,
                                'sentence': sentence,
                                'issue': "Figure reference within sentence should be lowercase"
                            })
        success = len(incorrect_references) == 0
        return DocumentCheckResult(success=success, issues=incorrect_references)
    def document_title_check(self, doc_path, doc_type: str) -> DocumentCheckResult:
        """Check for correct formatting of document titles."""
        try:
            # Handle both file paths and BytesIO objects
            if isinstance(doc_path, (str, bytes, io.BytesIO)):
                doc = Document(doc_path)
            else:
                return DocumentCheckResult(
                    success=False,
                    issues=[{'error': 'Invalid document input type'}]
                )
            # Rest of the method remains the same
            incorrect_titles = []
            # Define formatting rules for different document types
            formatting_rules = {
                "Advisory Circular": {"italics": True, "quotes": False},
                "Airworthiness Criteria": {"italics": False, "quotes": True},
                "Deviation Memo": {"italics": False, "quotes": True},
                "Exemption": {"italics": False, "quotes": True},
                "Federal Register Notice": {"italics": False, "quotes": True},
                "Order": {"italics": False, "quotes": True},
                "Policy Statement": {"italics": False, "quotes": False},
                "Rule": {"italics": False, "quotes": True},
                "Special Condition": {"italics": False, "quotes": True},
                "Technical Standard Order": {"italics": False, "quotes": True},
                "Other": {"italics": False, "quotes": False}
            }
            if doc_type not in formatting_rules:
                self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
                return DocumentCheckResult(success=True, issues=[])
            required_format = formatting_rules[doc_type]
            ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')
            for paragraph in doc.paragraphs:
                text = paragraph.text
                matches = ac_pattern.finditer(text)
                for match in matches:
                    full_match = match.group(0)
                    title_text = match.group(2).strip()
                    title_start = match.start(2)
                    title_end = match.end(2)
                    # Straight and curly quotation marks
                    title_in_quotes = any(q in title_text for q in ['"', "'", '\u201c', '\u201d', '\u2018', '\u2019'])
                    title_is_italicized = False
                    current_pos = 0
                    for run in paragraph.runs:
                        run_length = len(run.text)
                        run_start = current_pos
                        run_end = current_pos + run_length
                        if run_start <= title_start < run_end:
                            title_is_italicized = run.italic
                            break
                        current_pos += run_length
                    formatting_incorrect = False
                    issue_message = []
                    if required_format["italics"] and not title_is_italicized:
                        formatting_incorrect = True
                        issue_message.append("should be italicized")
                    elif not required_format["italics"] and title_is_italicized:
                        formatting_incorrect = True
                        issue_message.append("should not be italicized")
                    if required_format["quotes"] and not title_in_quotes:
                        formatting_incorrect = True
                        issue_message.append("should be in quotes")
                    elif not required_format["quotes"] and title_in_quotes:
                        formatting_incorrect = True
                        issue_message.append("should not be in quotes")
                    if formatting_incorrect:
                        incorrect_titles.append({
                            'text': title_text,
                            'issue': ', '.join(issue_message),
                            'sentence': text.strip()
                        })
            return DocumentCheckResult(
                success=len(incorrect_titles) == 0,
                issues=incorrect_titles
            )
        except Exception as e:
            self.logger.error(f"Error in document_title_check: {e}")
            return DocumentCheckResult(
                success=False,
                issues=[{'error': str(e)}]
            )
    def double_period_check(self, doc: List[str]) -> DocumentCheckResult:
        """Check for sentences that end with two periods."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        incorrect_sentences = []
        for paragraph in doc:
            # Split the paragraph into sentences based on common sentence-ending punctuation
            sentences = re.split(r'(?<=[.!?]) +', paragraph)
            for sentence in sentences:
                if sentence.endswith('..'):
                    incorrect_sentences.append({'sentence': sentence.strip()})
        success = len(incorrect_sentences) == 0
        return DocumentCheckResult(success=success, issues=incorrect_sentences)
    def spacing_check(self, doc: List[str]) -> DocumentCheckResult:
        """Check for correct spacing in the document."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        incorrect_spacing = []
        # Regex patterns to find incorrect spacing
        patterns = [
            (re.compile(r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)', re.IGNORECASE), "Missing space between document type and number"),
            (re.compile(r'(?<!\s)(§|§§)(\d+\.\d+)', re.IGNORECASE), "Missing space after section symbol (§)"),
            (re.compile(r'(?<!\s)Part(\d+)', re.IGNORECASE), "Missing space between 'Part' and number"),
            (re.compile(r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))', re.IGNORECASE), "Missing space before paragraph indication"),
            (re.compile(r'\s{2,}'), "Double spaces between words")
        ]
        for paragraph in doc:
            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
            for sentence in sentences:
                for pattern, issue in patterns:
                    if pattern.search(sentence):
                        incorrect_spacing.append({
                            'issue_description': issue,
                            'sentence': sentence.strip()
                        })
        success = len(incorrect_spacing) == 0
        return DocumentCheckResult(success=success, issues=incorrect_spacing)
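    # Illustrative sketch (made-up text): a sentence with doubled spaces, such
    # as "The rule applies  to transport airplanes.", is reported as
    # "Double spaces between words"; the other patterns flag run-together
    # references, for example a section symbol butted directly against its
    # section number.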
    def check_abbreviation_usage(self, doc: List[str]) -> DocumentCheckResult:
        """Check for abbreviation consistency after first definition."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        abbreviations = {}
        issues = []
        for paragraph in doc:
            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
            for sentence in sentences:
                # Find definitions like "Federal Aviation Administration (FAA)"
                defined_matches = re.findall(r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)', sentence)
                for full_term, acronym in defined_matches:
                    if acronym not in abbreviations:
                        abbreviations[acronym] = {"full_term": full_term.strip(), "defined": True}
                # Check for full term usage after definition
                for acronym, data in abbreviations.items():
                    full_term = data["full_term"]
                    if full_term in sentence:
                        # Ignore first usage where it's defined
                        if data["defined"]:
                            data["defined"] = False  # The defining sentence has been consumed
                        else:
                            # Only flag subsequent spelled-out uses of the term
                            issues.append({
                                'full_term': full_term,
                                'acronym': acronym,
                                'sentence': sentence.strip()
                            })
        success = len(issues) == 0
        return DocumentCheckResult(success=success, issues=issues)
    def check_date_formats(self, doc: List[str]) -> DocumentCheckResult:
        """Check for inconsistent date formats while ignoring aviation reference numbers."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        date_issues = []
        # Patterns to ignore (aviation references)
        ignore_patterns = [
            r'\bAD \d{4}-\d{2}-\d{2}\b',         # Airworthiness Directive references
            r'\bSWPM \d{2}-\d{2}-\d{2}\b',       # Standard Wiring Practices Manual references
            r'\bAMM \d{2}-\d{2}-\d{2}\b',        # Aircraft Maintenance Manual references
            r'\bSOPM \d{2}-\d{2}-\d{2}\b',       # Standard Operating Procedure references
            r'\b[A-Z]{2,4} \d{2}-\d{2}-\d{2}\b'  # Generic manual reference pattern
        ]
        # Combine ignore patterns into one
        ignore_regex = '|'.join(ignore_patterns)
        ignore_pattern = re.compile(ignore_regex)
        # Correct date pattern: 'Month Day, Year' e.g., 'January 1, 2020'
        correct_date_pattern = re.compile(r'\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b')
        # Incorrect date patterns
        date_patterns = [
            (re.compile(r'(?<![\w/-])\d{1,2}/\d{1,2}/\d{2,4}(?![\w/-])'), "Use 'Month Day, Year' format instead of 'MM/DD/YYYY'"),
            (re.compile(r'(?<![\w/-])\d{1,2}-\d{1,2}-\d{2,4}(?![\w/-])'), "Use 'Month Day, Year' format instead of 'MM-DD-YYYY'"),
            (re.compile(r'(?<![\w/-])\d{4}-\d{1,2}-\d{1,2}(?![\w/-])'), "Use 'Month Day, Year' format instead of 'YYYY-MM-DD'")
        ]
        for paragraph in doc:
            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
            for sentence in sentences:
                # First, identify and temporarily remove text that should be ignored
                ignored_matches = list(ignore_pattern.finditer(sentence))
                working_sentence = sentence
                # Replace ignored patterns with placeholders
                for match in reversed(ignored_matches):
                    start, end = match.span()
                    working_sentence = working_sentence[:start] + 'X' * (end - start) + working_sentence[end:]
                # Now check for date patterns in the modified sentence
                for pattern, issue in date_patterns:
                    matches = pattern.finditer(working_sentence)
                    for match in matches:
                        # Get the original text from the match position
                        original_date = sentence[match.start():match.end()]
                        date_issues.append({
                            'date': original_date,
                            'issue': issue,
                            'sentence': sentence.strip()
                        })
        success = len(date_issues) == 0
        return DocumentCheckResult(success=success, issues=date_issues)
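    # Illustrative sketch (made-up text): "The rule was issued on 3/15/2024."
    # is flagged with a suggestion to write the date as 'Month Day, Year', while
    # a reference such as "AD 2024-03-15" is masked by the ignore patterns and
    # left alone.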
    def check_placeholders(self, doc: List[str]) -> DocumentCheckResult:
        """Check for placeholders that should be removed."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        placeholder_phrases = [
            r'\bTBD\b',
            r'\bTo be determined\b',
            r'\bTo be added\b'
        ]
        issues = []
        for paragraph in doc:
            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
            for sentence in sentences:
                for phrase in placeholder_phrases:
                    match = re.search(phrase, sentence, re.IGNORECASE)
                    if match:
                        issues.append({
                            'placeholder': match.group().strip(),
                            'sentence': sentence.strip()
                        })
        success = len(issues) == 0
        return DocumentCheckResult(success=success, issues=issues)
    def run_all_checks(self, doc_path: str, doc_type: str, template_type: Optional[str] = None) -> Dict[str, DocumentCheckResult]:
        """
        Run all checks on the document.

        Args:
            doc_path (str): Path to the document.
            doc_type (str): Type of the document.
            template_type (str, optional): Template type, if applicable.

        Returns:
            Dict[str, DocumentCheckResult]: Dictionary of check names to results.
        """
        # Read the document
        doc = self.extract_paragraphs(doc_path)
        # Retrieve any specific flags
        checks_config = self.config_manager.config['document_types'].get(doc_type, {})
        skip_title_check = checks_config.get('skip_title_check', False)
        # Run checks
        results = {}
        results['heading_title_check'] = self.heading_title_check(doc, doc_type)
        results['heading_title_period_check'] = self.heading_title_period_check(doc, doc_type)
        results['acronym_check'] = self.acronym_check(doc)
        results['terminology_check'] = self.check_terminology(doc)
        results['section_symbol_usage_check'] = self.check_section_symbol_usage(doc)
        results['caption_check_table'] = self.caption_check(doc, doc_type, 'Table')
        results['caption_check_figure'] = self.caption_check(doc, doc_type, 'Figure')
        results['table_figure_reference_check'] = self.table_figure_reference_check(doc, doc_type)
        if not skip_title_check:
            results['document_title_check'] = self.document_title_check(doc_path, doc_type)
        else:
            results['document_title_check'] = DocumentCheckResult(success=True, issues=[])
        results['double_period_check'] = self.double_period_check(doc)
        results['spacing_check'] = self.spacing_check(doc)
        results['abbreviation_usage_check'] = self.check_abbreviation_usage(doc)
        results['date_formats_check'] = self.check_date_formats(doc)
        results['placeholders_check'] = self.check_placeholders(doc)
        return results
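# Illustrative sketch (hypothetical file path): running every configured check
# on a saved .docx file outside of the Gradio UI.
#
#     checker = FAADocumentChecker()
#     results = checker.run_all_checks("draft_ac.docx", "Advisory Circular")
#     for name, result in results.items():
#         status = "OK" if result.success else f"{len(result.issues)} issue(s)"
#         print(f"{name}: {status}")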
def process_document(file_obj, doc_type, template_type):
    """Process the document and run all checks."""
    try:
        # Convert file object to BytesIO
        if isinstance(file_obj, bytes):
            file_obj = io.BytesIO(file_obj)
        checker = FAADocumentChecker()
        doc = Document(file_obj)
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        # Rewind the file object for additional processing
        file_obj.seek(0)
        # Run all checks
        results = {}
        results['heading_check'] = checker.heading_title_check(paragraphs, doc_type)
        results['heading_period_check'] = checker.heading_title_period_check(paragraphs, doc_type)
        results['acronym_check'] = checker.acronym_check(paragraphs)
        results['terminology_check'] = checker.check_terminology(paragraphs)
        results['section_symbol_check'] = checker.check_section_symbol_usage(paragraphs)
        results['table_caption_check'] = checker.caption_check(paragraphs, doc_type, 'Table')
        results['figure_caption_check'] = checker.caption_check(paragraphs, doc_type, 'Figure')
        results['references_check'] = checker.table_figure_reference_check(paragraphs, doc_type)
        results['title_check'] = checker.document_title_check(file_obj, doc_type)
        results['double_period_check'] = checker.double_period_check(paragraphs)
        results['spacing_check'] = checker.spacing_check(paragraphs)
        results['abbreviation_check'] = checker.check_abbreviation_usage(paragraphs)
        results['date_check'] = checker.check_date_formats(paragraphs)
        results['placeholder_check'] = checker.check_placeholders(paragraphs)
        return format_results_for_gradio(results, doc_type)
    except Exception as e:
        print(f"Error in process_document: {str(e)}")
        traceback.print_exc()  # This will print the full traceback
        return f"An error occurred while processing the document: {str(e)}"
def format_results_for_gradio(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
    """Format the results for display in Gradio."""
    output = ["# Document Check Results\n"]
    # Map check names to display titles
    check_titles = {
        'heading_check': "Required Headings Check",
        'heading_period_check': "Heading Period Check",
        'acronym_check': "Acronym Check",
        'terminology_check': "Terminology Check",
        'section_symbol_check': "Section Symbol Usage",
        'table_caption_check': "Table Caption Format",
        'figure_caption_check': "Figure Caption Format",
        'references_check': "Table and Figure References",
        'title_check': "Document Title Style",
        'double_period_check': "Double Period Check",
        'spacing_check': "Spacing Check",
        'abbreviation_check': "Abbreviation Usage",
        'date_check': "Date Format Check",
        'placeholder_check': "Placeholder Check"
    }
    for check_name, result in results.items():
        title = check_titles.get(check_name, check_name.replace('_', ' ').title())
        output.append(f"## {title}")
        if result.success:
            output.append("✅ All checks passed.\n")
        else:
            output.append("❌ Issues found:")
            for issue in result.issues:
                if isinstance(issue, dict):
                    for key, value in issue.items():
                        if isinstance(value, list):
                            for item in value:
                                output.append(f"- {item}")
                        else:
                            output.append(f"- {key}: {value}")
                else:
                    output.append(f"- {issue}")
            output.append("")
        if result.details:
            output.append("Additional Details:")
            for key, value in result.details.items():
                if isinstance(value, list):
                    output.append(f"- {key}:")
                    for item in value:
                        output.append(f"  - {item}")
                else:
                    output.append(f"- {key}: {value}")
            output.append("")
    return "\n".join(output)
# Create the Gradio interface
demo = gr.Blocks(theme='JohnSmith9982/small_and_pretty')
with demo:
    gr.Markdown("# Document Checker Tool")
    gr.Markdown("Upload a Word (.docx) document to check it for compliance with U.S. federal documentation standards.")
    gr.Markdown("*This tool is still in development, so results may include false positives.*")
    gr.Markdown("Contact Eric Putnam with questions and comments.")
    gr.Markdown("""
    1. Upload a clean Word file (no tracked changes or comments).
    2. Choose **Check Document**.""")
    document_types = [
        "Advisory Circular", "Airworthiness Criteria", "Deviation Memo", "Exemption",
        "Federal Register Notice", "Order", "Policy Statement",
        "Rule", "Special Condition", "Technical Standard Order", "Other"
    ]
    template_types = ["Short AC template AC", "Long AC template AC"]
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload Word Document (.docx)",
                file_types=[".docx"],
                type="binary"
            )
            doc_type = gr.Dropdown(
                choices=document_types,
                label="Document Type",
                value="Advisory Circular"
            )
            template_type = gr.Radio(
                choices=template_types,
                label="Template Type (Only for Advisory Circular)",
                visible=True,
                value="Short AC template AC"
            )
            submit_btn = gr.Button("Check Document", variant="primary")
        with gr.Column(scale=2):
            output = gr.Markdown(
                label="Check Results",
                value="Results will appear here after processing..."
            )

    def update_template_visibility(doc_type):
        return gr.update(visible=doc_type == "Advisory Circular")

    doc_type.change(
        fn=update_template_visibility,
        inputs=[doc_type],
        outputs=[template_type]
    )
    submit_btn.click(
        fn=process_document,
        inputs=[file_input, doc_type, template_type],
        outputs=[output]
    )

# Launch the demo
if __name__ == "__main__":
    demo.launch()