# FAA Document Checker — Gradio app (Hugging Face Space).
# NOTE: lines above were web-page residue ("Spaces: / Running"), not source code.
import io
import json
import logging
import os
import re
import time
import traceback
from dataclasses import dataclass
from datetime import datetime
from functools import wraps
from typing import Any, Dict, List, Optional, Tuple

import gradio as gr
from docx import Document
# Core data structures | |
class DocumentCheckResult: | |
"""Structured result for document checks.""" | |
success: bool | |
issues: List[Dict[str, Any]] | |
details: Optional[Dict[str, Any]] = None | |
def profile_performance(func): | |
"""Decorator to profile function performance.""" | |
def wrapper(*args, **kwargs): | |
start_time = time.time() | |
result = func(*args, **kwargs) | |
end_time = time.time() | |
logger = args[0].logger if hasattr(args[0], 'logger') else logging.getLogger(__name__) | |
logger.info( | |
f"Performance: {func.__name__} took {end_time - start_time:.4f} seconds" | |
) | |
return result | |
return wrapper | |
class DocumentCheckerConfig: | |
"""Configuration management for document checks.""" | |
def __init__(self, config_path: Optional[str] = None): | |
self.config = self._load_config(config_path) | |
self.logger = self._setup_logger() | |
def _load_config(self, config_path: Optional[str] = None) -> Dict[str, Any]: | |
"""Load configuration from JSON file or use default settings.""" | |
default_config = { | |
"logging": { | |
"level": "INFO", | |
"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" | |
}, | |
"checks": { | |
"acronyms": True, | |
"terminology_check": True, | |
"headings": True | |
}, | |
"document_types": { | |
"Advisory Circular": { | |
"required_headings": [ | |
"Purpose.", | |
"Applicability.", | |
"Cancellation.", | |
"Related Material.", | |
"Definition of Key Terms." | |
], | |
"skip_title_check": False | |
}, | |
"Federal Register Notice": { | |
"required_headings": [ | |
"Purpose of This Notice", | |
"Audience", | |
"Where can I Find This Notice" | |
], | |
"skip_title_check": False | |
}, | |
"Order": { | |
"required_headings": [ | |
"Purpose of This Order.", | |
"Audience.", | |
"Where to Find This Order." | |
], | |
"skip_title_check": False | |
}, | |
"Policy Statement": { | |
"required_headings": [ | |
"SUMMARY", | |
"CURRENT REGULATORY AND ADVISORY MATERIAL", | |
"RELEVANT PAST PRACTICE", | |
"POLICY", | |
"EFFECT OF POLICY", | |
"CONCLUSION" | |
], | |
"skip_title_check": False | |
}, | |
"Technical Standard Order": { | |
"required_headings": [ | |
"PURPOSE.", | |
"APPLICABILITY.", | |
"REQUIREMENTS.", | |
"MARKING.", | |
"APPLICATION DATA REQUIREMENTS.", | |
"MANUFACTURER DATA REQUIREMENTS.", | |
"FURNISHED DATA REQUIREMENTS.", | |
"HOW TO GET REFERENCED DOCUMENTS." | |
], | |
"skip_title_check": False | |
}, | |
"Airworthiness Criteria": { | |
"required_headings": [], | |
"skip_title_check": True | |
}, | |
"Deviation Memo": { | |
"required_headings": [], | |
"skip_title_check": True | |
}, | |
"Exemption": { | |
"required_headings": [], | |
"skip_title_check": True | |
}, | |
"Rule": { | |
"required_headings": [], | |
"skip_title_check": True | |
}, | |
"Special Condition": { | |
"required_headings": [], | |
"skip_title_check": True | |
}, | |
"Other": { | |
"required_headings": [], | |
"skip_title_check": True | |
} | |
} | |
} | |
if config_path and os.path.exists(config_path): | |
try: | |
with open(config_path, 'r') as f: | |
user_config = json.load(f) | |
self._deep_merge(default_config, user_config) | |
except (json.JSONDecodeError, IOError) as e: | |
logging.warning(f"Error loading config: {e}. Using default config.") | |
return default_config | |
def _deep_merge(self, base: Dict[str, Any], update: Dict[str, Any]) -> Dict[str, Any]: | |
"""Recursively merge two dictionaries.""" | |
for key, value in update.items(): | |
if isinstance(value, dict) and key in base and isinstance(base[key], dict): | |
self._deep_merge(base[key], value) | |
else: | |
base[key] = value | |
return base | |
def _setup_logger(self) -> logging.Logger: | |
"""Set up and configure logging based on configuration.""" | |
logger = logging.getLogger(__name__) | |
log_level = getattr(logging, self.config['logging']['level'].upper()) | |
formatter = logging.Formatter(self.config['logging']['format']) | |
console_handler = logging.StreamHandler() | |
console_handler.setFormatter(formatter) | |
console_handler.setLevel(log_level) | |
logger.addHandler(console_handler) | |
logger.setLevel(log_level) | |
return logger | |
class DocumentChecker: | |
"""Base class for document checking.""" | |
def __init__(self, config_path: Optional[str] = None): | |
self.config_manager = DocumentCheckerConfig(config_path) | |
self.logger = self.config_manager.logger | |
def validate_input(doc: List[str]) -> bool: | |
"""Validate input document.""" | |
return doc is not None and isinstance(doc, list) and len(doc) > 0 | |
def extract_paragraphs(cls, doc_path: str) -> List[str]: | |
"""Extract plain text paragraphs from a document.""" | |
try: | |
doc = Document(doc_path) | |
return [para.text for para in doc.paragraphs if para.text.strip()] | |
except Exception as e: | |
logging.error(f"Error extracting paragraphs: {e}") | |
return [] | |
class FAADocumentChecker(DocumentChecker): | |
"""Main document checker implementation.""" | |
def __init__(self, config_path: Optional[str] = None): | |
super().__init__(config_path) | |
def heading_title_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult: | |
"""Check headings for a specific document type.""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
checks = self.config_manager.config['document_types'].get(doc_type, {}) | |
required_headings = checks.get('required_headings', []) | |
headings_found = [] | |
required_headings_set = set(required_headings) | |
for para in doc: | |
para_strip = para.strip() | |
if para_strip in required_headings_set: | |
headings_found.append(para_strip) | |
all_headings_present = set(headings_found) == required_headings_set | |
issues = [] | |
if not all_headings_present: | |
missing_headings = required_headings_set - set(headings_found) | |
issues.append({'missing_headings': list(missing_headings)}) | |
return DocumentCheckResult( | |
success=all_headings_present, | |
issues=issues, | |
details={ | |
'found_headings': headings_found, | |
'required_headings': required_headings | |
} | |
) | |
def heading_title_period_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult: | |
"""Check if headings end with periods according to document type requirements.""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
period_required = { | |
"Advisory Circular": True, | |
"Airworthiness Criteria": False, | |
"Deviation Memo": False, | |
"Exemption": False, | |
"Federal Register Notice": False, | |
"Order": True, | |
"Policy Statement": False, | |
"Rule": False, | |
"Special Condition": False, | |
"Technical Standard Order": True, | |
"Other": False | |
} | |
should_have_period = period_required.get(doc_type, False) | |
checks = self.config_manager.config['document_types'].get(doc_type, {}) | |
required_headings = checks.get('required_headings', []) | |
required_headings_set = set(required_headings) | |
issues = [] | |
checked_headings = [] | |
for para in doc: | |
para_strip = para.strip() | |
if para_strip in required_headings_set: | |
ends_with_period = para_strip.endswith('.') | |
if should_have_period and not ends_with_period: | |
issues.append({ | |
'heading': para_strip, | |
'issue': 'missing_period', | |
'message': f"Heading should end with a period: '{para_strip}'" | |
}) | |
checked_headings.append({ | |
'heading': para_strip, | |
'has_period': False, | |
'needs_period': True | |
}) | |
elif not should_have_period and ends_with_period: | |
issues.append({ | |
'heading': para_strip, | |
'issue': 'unexpected_period', | |
'message': f"Heading should not end with a period: '{para_strip}'" | |
}) | |
checked_headings.append({ | |
'heading': para_strip, | |
'has_period': True, | |
'needs_period': False | |
}) | |
else: | |
checked_headings.append({ | |
'heading': para_strip, | |
'has_period': ends_with_period, | |
'needs_period': should_have_period | |
}) | |
return DocumentCheckResult( | |
success=len(issues) == 0, | |
issues=issues, | |
details={ | |
'document_type': doc_type, | |
'periods_required': should_have_period, | |
'checked_headings': checked_headings | |
} | |
) | |
def acronym_check(self, doc: List[str]) -> DocumentCheckResult: | |
"""Check if acronyms are defined at their first use.""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
defined_acronyms = set() | |
first_occurrences = {} | |
heading_words = { | |
'INFORMATION', 'GENERAL', 'SUMMARY', 'INTRODUCTION', 'BACKGROUND', | |
'DISCUSSION', 'CONCLUSION', 'APPENDIX', 'CHAPTER', 'SECTION', | |
'PURPOSE', 'APPLICABILITY', 'CANCELLATION', 'DEFINITION', 'REQUIREMENTS', | |
'AUTHORITY', 'POLICY', 'SCOPE', 'RELATED', 'MATERIAL', 'DISTRIBUTION', | |
'EXPLANATION', 'PROCEDURES', 'NOTE', 'WARNING', 'CAUTION', 'EXCEPTION', | |
'GROUPS', 'PARTS', 'TABLE', 'FIGURE', 'REFERENCES', 'DEFINITIONS' | |
} | |
predefined_acronyms = { | |
'CFR', 'U.S.', 'USA', 'US', 'U.S.C', 'e.g.', 'i.e.', 'FAQ', 'No.', 'ZIP', 'PDF', 'SSN', | |
'DC', 'MA', 'WA', 'TX', 'MO' | |
} | |
defined_acronyms.update(predefined_acronyms) | |
defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)') | |
acronym_pattern = re.compile(r'\b[A-Z]{2,}\b(?!\s*[:.]\s*)') | |
for paragraph in doc: | |
words = paragraph.strip().split() | |
if all(word.isupper() for word in words) and any(word in heading_words for word in words): | |
continue | |
defined_matches = defined_pattern.findall(paragraph) | |
for full_term, acronym in defined_matches: | |
defined_acronyms.add(acronym) | |
if acronym in first_occurrences: | |
del first_occurrences[acronym] | |
usage_matches = acronym_pattern.finditer(paragraph) | |
for match in usage_matches: | |
acronym = match.group() | |
if (acronym not in defined_acronyms and | |
acronym not in heading_words and | |
not any(not c.isalpha() for c in acronym) and | |
len(acronym) <= 10): | |
if acronym not in first_occurrences: | |
sentences = re.split(r'(?<=[.!?])\s+', paragraph) | |
for sentence in sentences: | |
if acronym in sentence: | |
if not (sentence.isupper() and any(word in heading_words for word in sentence.split())): | |
first_occurrences[acronym] = { | |
'acronym': acronym, | |
'sentence': sentence.strip() | |
} | |
break | |
undefined_acronyms = list(first_occurrences.values()) | |
success = len(undefined_acronyms) == 0 | |
issues = undefined_acronyms if not success else [] | |
return DocumentCheckResult(success=success, issues=issues) | |
def check_terminology(self, doc: List[str]) -> DocumentCheckResult: | |
"""Check document terminology for consistency and preferred terms.""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
term_replacements = { | |
r'\bUSC\b': 'U.S.C.', | |
r'\bCFR Part\b': 'CFR part', | |
r'\bC\.F\.R\.\b': 'CFR', | |
r'\b14 CFR\s*Β§': '14 CFR', | |
r'\bWe\b': 'The FAA', | |
r'\bwe\b': 'the FAA', | |
r'\bcancelled\b': 'canceled', | |
r'\bshall\b': 'must', | |
r'\b\&\b': 'and', | |
r'\bflight crew\b': 'flightcrew' | |
} | |
prohibited_phrases = [ | |
r'\babove\b', | |
r'\bbelow\b', | |
r'(?:^|(?<=[.!?]\s))There\s+(?:is|are)\b' | |
] | |
issues = [] | |
for paragraph in doc: | |
sentences = re.split(r'(?<=[.!?])\s+', paragraph) | |
for sentence in sentences: | |
for incorrect_pattern, correct_term in term_replacements.items(): | |
matches = re.finditer(incorrect_pattern, sentence) | |
for match in matches: | |
incorrect_term = match.group() | |
issues.append({ | |
'type': 'incorrect_term', | |
'incorrect_term': incorrect_term, | |
'correct_term': correct_term, | |
'sentence': sentence.strip() | |
}) | |
for phrase_pattern in prohibited_phrases: | |
match = re.search(phrase_pattern, sentence, re.IGNORECASE) | |
if match: | |
issues.append({ | |
'type': 'prohibited_phrase', | |
'phrase': match.group().strip(), | |
'sentence': sentence.strip() | |
}) | |
return DocumentCheckResult(success=len(issues) == 0, issues=issues) | |
def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult: | |
"""Check for correct usage of section symbols (Β§).""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
issues = [] | |
sentences_starting_with_section_symbol = [] | |
incorrect_14_CFR_section_symbol_usage = [] | |
single_section_symbol_multiple_sections = [] | |
missing_section_symbol_in_multiple_sections = [] | |
pattern_14_CFR_section = re.compile(r'\b14 CFR Β§\s*\d+\.\d+\b') | |
pattern_single_section_symbol_and = re.compile(r'Β§\s*\d+\.\d+\s+and\s+\d+\.\d+') | |
pattern_single_section_symbol_or = re.compile(r'Β§\s*\d+\.\d+\s+or\s+\d+\.\d+') | |
pattern_single_section_symbol_through = re.compile(r'Β§\s*\d+\.\d+\s+through\s+\d+\.\d+') | |
pattern_missing_section_symbol_or = re.compile(r'Β§\s*\d+\.\d+\s+or\s+Β§?\s*\d+\.\d+') | |
for paragraph in doc: | |
sentences = re.split(r'(?<=[.!?])\s+', paragraph) | |
for sentence in sentences: | |
if sentence.strip().startswith('Β§'): | |
sentences_starting_with_section_symbol.append(sentence.strip()) | |
matches_14_CFR = pattern_14_CFR_section.findall(paragraph) | |
incorrect_14_CFR_section_symbol_usage.extend(matches_14_CFR) | |
matches_and = pattern_single_section_symbol_and.findall(paragraph) | |
single_section_symbol_multiple_sections.extend(matches_and) | |
matches_or = pattern_single_section_symbol_or.findall(paragraph) | |
single_section_symbol_multiple_sections.extend(matches_or) | |
matches_through = pattern_single_section_symbol_through.findall(paragraph) | |
single_section_symbol_multiple_sections.extend(matches_through) | |
matches_missing_or = pattern_missing_section_symbol_or.findall(paragraph) | |
missing_section_symbol_in_multiple_sections.extend(matches_missing_or) | |
if sentences_starting_with_section_symbol: | |
issues.append({ | |
'issue': 'sentences_starting_with_section_symbol', | |
'sentences': sentences_starting_with_section_symbol | |
}) | |
if incorrect_14_CFR_section_symbol_usage: | |
issues.append({ | |
'issue': 'incorrect_14_CFR_section_symbol_usage', | |
'matches': incorrect_14_CFR_section_symbol_usage | |
}) | |
if single_section_symbol_multiple_sections: | |
issues.append({ | |
'issue': 'single_section_symbol_multiple_sections', | |
'matches': single_section_symbol_multiple_sections | |
}) | |
if missing_section_symbol_in_multiple_sections: | |
issues.append({ | |
'issue': 'missing_section_symbol_in_multiple_sections', | |
'matches': missing_section_symbol_in_multiple_sections | |
}) | |
return DocumentCheckResult(success=len(issues) == 0, issues=issues) | |
def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult: | |
"""Check for correctly formatted captions.""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
if doc_type in ["Advisory Circular", "Order"]: | |
caption_pattern = re.compile(rf'^{caption_type}\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]', re.IGNORECASE) | |
correct_format = f"{caption_type} X-Y" | |
else: | |
caption_pattern = re.compile(rf'^{caption_type}\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE) | |
correct_format = f"{caption_type} X" | |
incorrect_captions = [] | |
in_toc = False | |
for paragraph in doc: | |
if "Table of Contents" in paragraph or "Contents" in paragraph: | |
in_toc = True | |
continue | |
elif in_toc and paragraph.strip() == "": | |
in_toc = False | |
if in_toc: | |
continue | |
paragraph_strip = paragraph.strip() | |
if paragraph_strip.lower().startswith(caption_type.lower()): | |
if not caption_pattern.match(paragraph_strip): | |
incorrect_captions.append({ | |
'incorrect_caption': paragraph_strip, | |
'correct_format': correct_format | |
}) | |
return DocumentCheckResult(success=len(incorrect_captions) == 0, issues=incorrect_captions) | |
def table_figure_reference_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult: | |
"""Check for correct references to tables and figures.""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
incorrect_references = [] | |
if doc_type in ["Advisory Circular", "Order"]: | |
table_pattern = r'\b[Tt]able\s+\d+-\d+\b' | |
figure_pattern = r'\b[Ff]igure\s+\d+-\d+\b' | |
correct_mid_table_format = "table X-Y" | |
correct_start_table_format = "Table X-Y" | |
correct_mid_figure_format = "figure X-Y" | |
correct_start_figure_format = "Figure X-Y" | |
else: | |
table_pattern = r'\b[Tt]able\s+\d+\b' | |
figure_pattern = r'\b[Ff]igure\s+\d+\b' | |
correct_mid_table_format = "table X" | |
correct_start_table_format = "Table X" | |
correct_mid_figure_format = "figure X" | |
correct_start_figure_format = "Figure X" | |
table_ref_pattern = re.compile(table_pattern) | |
figure_ref_pattern = re.compile(figure_pattern) | |
for paragraph in doc: | |
paragraph_strip = paragraph.strip() | |
starts_with_table_or_figure = paragraph_strip.lower().startswith('table') or paragraph_strip.lower().startswith('figure') | |
if not starts_with_table_or_figure: | |
sentences = re.split(r'(?<=[.!?])\s+', paragraph) | |
for sentence in sentences: | |
sentence = sentence.strip() | |
# Check table references | |
matches = table_ref_pattern.finditer(sentence) | |
for match in matches: | |
ref = match.group() | |
text_before = sentence[:match.start()].strip() | |
is_sentence_start = text_before == "" | |
if is_sentence_start and not ref.startswith('Table'): | |
incorrect_references.append({ | |
'incorrect_ref': ref, | |
'correct_format': correct_start_table_format, | |
'sentence': sentence, | |
'issue': "Table reference at sentence start should be capitalized" | |
}) | |
elif not is_sentence_start and not ref.startswith('table'): | |
incorrect_references.append({ | |
'incorrect_ref': ref, | |
'correct_format': correct_mid_table_format, | |
'sentence': sentence, | |
'issue': "Table reference within sentence should be lowercase" | |
}) | |
# Check figure references | |
matches = figure_ref_pattern.finditer(sentence) | |
for match in matches: | |
ref = match.group() | |
text_before = sentence[:match.start()].strip() | |
is_sentence_start = text_before == "" | |
if is_sentence_start and not ref.startswith('Figure'): | |
incorrect_references.append({ | |
'incorrect_ref': ref, | |
'correct_format': correct_start_figure_format, | |
'sentence': sentence, | |
'issue': "Figure reference at sentence start should be capitalized" | |
}) | |
elif not is_sentence_start and not ref.startswith('figure'): | |
incorrect_references.append({ | |
'incorrect_ref': ref, | |
'correct_format': correct_mid_figure_format, | |
'sentence': sentence, | |
'issue': "Figure reference within sentence should be lowercase" | |
}) | |
return DocumentCheckResult(success=len(incorrect_references) == 0, issues=incorrect_references) | |
def document_title_check(self, doc_path: str, doc_type: str) -> DocumentCheckResult: | |
"""Check for correct formatting of document titles.""" | |
try: | |
if isinstance(doc_path, (str, bytes, io.BytesIO)): | |
doc = Document(doc_path) | |
else: | |
return DocumentCheckResult( | |
success=False, | |
issues=[{'error': 'Invalid document input type'}] | |
) | |
incorrect_titles = [] | |
formatting_rules = { | |
"Advisory Circular": {"italics": True, "quotes": False}, | |
"Airworthiness Criteria": {"italics": False, "quotes": True}, | |
"Deviation Memo": {"italics": False, "quotes": True}, | |
"Exemption": {"italics": False, "quotes": True}, | |
"Federal Register Notice": {"italics": False, "quotes": True}, | |
"Order": {"italics": False, "quotes": True}, | |
"Policy Statement": {"italics": False, "quotes": False}, | |
"Rule": {"italics": False, "quotes": True}, | |
"Special Condition": {"italics": False, "quotes": True}, | |
"Technical Standard Order": {"italics": False, "quotes": True}, | |
"Other": {"italics": False, "quotes": False} | |
} | |
if doc_type not in formatting_rules: | |
self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.") | |
return DocumentCheckResult(success=True, issues=[]) | |
required_format = formatting_rules[doc_type] | |
ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)') | |
for paragraph in doc.paragraphs: | |
text = paragraph.text | |
matches = ac_pattern.finditer(text) | |
for match in matches: | |
title_text = match.group(2).strip() | |
title_start = match.start(2) | |
title_in_quotes = any(q in title_text for q in ['"', "'", '"', '"', ''', ''']) | |
title_is_italicized = False | |
current_pos = 0 | |
for run in paragraph.runs: | |
run_length = len(run.text) | |
run_start = current_pos | |
run_end = current_pos + run_length | |
if run_start <= title_start < run_end: | |
title_is_italicized = run.italic | |
break | |
current_pos += run_length | |
formatting_incorrect = False | |
issue_message = [] | |
if required_format["italics"] and not title_is_italicized: | |
formatting_incorrect = True | |
issue_message.append("should be italicized") | |
elif not required_format["italics"] and title_is_italicized: | |
formatting_incorrect = True | |
issue_message.append("should not be italicized") | |
if required_format["quotes"] and not title_in_quotes: | |
formatting_incorrect = True | |
issue_message.append("should be in quotes") | |
elif not required_format["quotes"] and title_in_quotes: | |
formatting_incorrect = True | |
issue_message.append("should not be in quotes") | |
if formatting_incorrect: | |
incorrect_titles.append({ | |
'text': title_text, | |
'issue': ', '.join(issue_message), | |
'sentence': text.strip() | |
}) | |
return DocumentCheckResult( | |
success=len(incorrect_titles) == 0, | |
issues=incorrect_titles | |
) | |
except Exception as e: | |
self.logger.error(f"Error in document_title_check: {e}") | |
return DocumentCheckResult( | |
success=False, | |
issues=[{'error': str(e)}] | |
) | |
def double_period_check(self, doc: List[str]) -> DocumentCheckResult: | |
"""Check for sentences that end with two periods.""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
incorrect_sentences = [] | |
for paragraph in doc: | |
sentences = re.split(r'(?<=[.!?]) +', paragraph) | |
for sentence in sentences: | |
if sentence.endswith('..'): | |
incorrect_sentences.append({'sentence': sentence.strip()}) | |
success = len(incorrect_sentences) == 0 | |
return DocumentCheckResult(success=success, issues=incorrect_sentences) | |
def spacing_check(self, doc: List[str]) -> DocumentCheckResult: | |
"""Check for correct spacing in the document.""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
incorrect_spacing = [] | |
patterns = [ | |
(re.compile(r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)', re.IGNORECASE), | |
"Missing space between document type and number"), | |
(re.compile(r'(?<!\s)(Β§|Β§Β§)(\d+\.\d+)', re.IGNORECASE), | |
"Missing space after section symbol (Β§)"), | |
(re.compile(r'(?<!\s)Part(\d+)', re.IGNORECASE), | |
"Missing space between 'Part' and number"), | |
(re.compile(r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))', re.IGNORECASE), | |
"Missing space before paragraph indication"), | |
(re.compile(r'\s{2,}'), | |
"Double spaces between words") | |
] | |
for paragraph in doc: | |
sentences = re.split(r'(?<=[.!?])\s+', paragraph) | |
for sentence in sentences: | |
for pattern, issue in patterns: | |
if pattern.search(sentence): | |
incorrect_spacing.append({ | |
'issue_description': issue, | |
'sentence': sentence.strip() | |
}) | |
success = len(incorrect_spacing) == 0 | |
return DocumentCheckResult(success=success, issues=incorrect_spacing) | |
def check_abbreviation_usage(self, doc: List[str]) -> DocumentCheckResult: | |
"""Check for abbreviation consistency after first definition.""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
abbreviations = {} | |
issues = [] | |
for paragraph in doc: | |
sentences = re.split(r'(?<=[.!?])\s+', paragraph) | |
for sentence in sentences: | |
defined_matches = re.findall(r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)', sentence) | |
for full_term, acronym in defined_matches: | |
if acronym not in abbreviations: | |
abbreviations[acronym] = {"full_term": full_term.strip(), "defined": True} | |
for acronym, data in abbreviations.items(): | |
full_term = data["full_term"] | |
if full_term in sentence: | |
if data["defined"]: | |
data["defined"] = False | |
else: | |
issues.append({ | |
'full_term': full_term, | |
'acronym': acronym, | |
'sentence': sentence.strip() | |
}) | |
return DocumentCheckResult(success=len(issues) == 0, issues=issues) | |
def check_date_formats(self, doc: List[str]) -> DocumentCheckResult: | |
"""Check for inconsistent date formats.""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
date_issues = [] | |
ignore_patterns = [ | |
r'\bAD \d{4}-\d{2}-\d{2}\b', | |
r'\bSWPM \d{2}-\d{2}-\d{2}\b', | |
r'\bAMM \d{2}-\d{2}-\d{2}\b', | |
r'\bSOPM \d{2}-\d{2}-\d{2}\b', | |
r'\b[A-Z]{2,4} \d{2}-\d{2}-\d{2}\b' | |
] | |
ignore_regex = '|'.join(ignore_patterns) | |
ignore_pattern = re.compile(ignore_regex) | |
correct_date_pattern = re.compile(r'\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b') | |
date_patterns = [ | |
(re.compile(r'(?<![\w/-])\d{1,2}/\d{1,2}/\d{2,4}(?![\w/-])'), | |
"Use 'Month Day, Year' format instead of 'MM/DD/YYYY'"), | |
(re.compile(r'(?<![\w/-])\d{1,2}-\d{1,2}-\d{2,4}(?![\w/-])'), | |
"Use 'Month Day, Year' format instead of 'MM-DD-YYYY'"), | |
(re.compile(r'(?<![\w/-])\d{4}-\d{1,2}-\d{1,2}(?![\w/-])'), | |
"Use 'Month Day, Year' format instead of 'YYYY-MM-DD'") | |
] | |
for paragraph in doc: | |
sentences = re.split(r'(?<=[.!?])\s+', paragraph) | |
for sentence in sentences: | |
ignored_matches = list(ignore_pattern.finditer(sentence)) | |
working_sentence = sentence | |
for match in reversed(ignored_matches): | |
start, end = match.span() | |
working_sentence = working_sentence[:start] + 'X' * (end - start) + working_sentence[end:] | |
for pattern, issue in date_patterns: | |
matches = pattern.finditer(working_sentence) | |
for match in matches: | |
original_date = sentence[match.start():match.end()] | |
date_issues.append({ | |
'date': original_date, | |
'issue': issue, | |
'sentence': sentence.strip() | |
}) | |
return DocumentCheckResult(success=len(date_issues) == 0, issues=date_issues) | |
def check_placeholders(self, doc: List[str]) -> DocumentCheckResult: | |
"""Check for placeholders that should be removed.""" | |
if not self.validate_input(doc): | |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}]) | |
placeholder_phrases = [ | |
r'\bTBD\b', | |
r'\bTo be determined\b', | |
r'\bTo be added\b' | |
] | |
issues = [] | |
for paragraph in doc: | |
sentences = re.split(r'(?<=[.!?])\s+', paragraph) | |
for sentence in sentences: | |
for phrase in placeholder_phrases: | |
match = re.search(phrase, sentence, re.IGNORECASE) | |
if match: | |
issues.append({ | |
'placeholder': match.group().strip(), | |
'sentence': sentence.strip() | |
}) | |
return DocumentCheckResult(success=len(issues) == 0, issues=issues) | |
def run_all_checks(self, doc_path: str, doc_type: str, template_type: Optional[str] = None) -> Dict[str, DocumentCheckResult]: | |
"""Run all document checks.""" | |
# Read the document | |
doc = self.extract_paragraphs(doc_path) | |
# Get configuration flags | |
checks_config = self.config_manager.config['document_types'].get(doc_type, {}) | |
skip_title_check = checks_config.get('skip_title_check', False) | |
# Run all checks | |
results = {} | |
results['heading_title_check'] = self.heading_title_check(doc, doc_type) | |
results['heading_title_period_check'] = self.heading_title_period_check(doc, doc_type) | |
results['acronym_check'] = self.acronym_check(doc) | |
results['terminology_check'] = self.check_terminology(doc) | |
results['section_symbol_usage_check'] = self.check_section_symbol_usage(doc) | |
results['caption_check_table'] = self.caption_check(doc, doc_type, 'Table') | |
results['caption_check_figure'] = self.caption_check(doc, doc_type, 'Figure') | |
results['table_figure_reference_check'] = self.table_figure_reference_check(doc, doc_type) | |
if not skip_title_check: | |
results['document_title_check'] = self.document_title_check(doc_path, doc_type) | |
else: | |
results['document_title_check'] = DocumentCheckResult(success=True, issues=[]) | |
results['double_period_check'] = self.double_period_check(doc) | |
results['spacing_check'] = self.spacing_check(doc) | |
results['abbreviation_usage_check'] = self.check_abbreviation_usage(doc) | |
results['date_formats_check'] = self.check_date_formats(doc) | |
results['placeholders_check'] = self.check_placeholders(doc) | |
return results | |
def process_document(file_obj, doc_type: str, template_type: Optional[str] = None) -> str: | |
"""Process document and run all checks.""" | |
try: | |
# Initialize checker | |
checker = FAADocumentChecker() | |
# Convert file object to BytesIO if needed | |
if isinstance(file_obj, bytes): | |
file_obj = io.BytesIO(file_obj) | |
# Run all checks | |
results = checker.run_all_checks(file_obj, doc_type, template_type) | |
# Format results using DocumentCheckResultsFormatter | |
formatter = DocumentCheckResultsFormatter() | |
formatted_results = formatter.format_results(results, doc_type) | |
# Convert the formatted results to HTML | |
html_content = f""" | |
<div id="document-checker-results"></div> | |
<script type="module"> | |
import DocumentCheckerResults from './components/DocumentCheckerResults.jsx'; | |
const results = {json.dumps(formatted_results)}; | |
const root = document.getElementById('document-checker-results'); | |
ReactDOM.render(React.createElement(DocumentCheckerResults, {{ results }}), root); | |
</script> | |
""" | |
return html_content | |
except Exception as e: | |
logging.error(f"Error processing document: {str(e)}") | |
traceback.print_exc() | |
error_html = f""" | |
<div class="error-message" style="color: red; padding: 1rem;"> | |
β Error processing document: {str(e)} | |
<br><br> | |
Please ensure the file is a valid .docx document and try again. | |
</div> | |
""" | |
return error_html | |
def format_markdown_results(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
    """Format check results into a Markdown string for Gradio display.

    Args:
        results: Mapping of check name -> DocumentCheckResult for that check.
        doc_type: Human-readable document type, echoed in the report header.

    Returns:
        A Markdown report: timestamped header, per-check issue lists (capped
        at 5 items per failing check, highest-priority checks first), and a
        fixed summary/recommendations section.
    """
    output = []
    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    output.extend([
        f"# Document Check Results - {current_time}",
        f"## Document Type: {doc_type}",
        "---\n"
    ])
    # Count failing check categories (not individual issues).
    total_issues = sum(1 for r in results.values() if not r.success)
    if total_issues == 0:
        output.append("β **All checks passed successfully!**\n")
        return "\n".join(output)
    output.append(f"β Found issues in {total_issues} check categories\n")
    # Display metadata per check: section title and sort priority
    # (1 = most critical, shown first).
    check_categories = {
        'heading_title_check': {'title': 'π Required Headings', 'priority': 1},
        'heading_title_period_check': {'title': 'π Heading Period Usage', 'priority': 1},
        'acronym_check': {'title': 'π Acronym Definitions', 'priority': 2},
        'terminology_check': {'title': 'π Terminology Usage', 'priority': 2},
        'section_symbol_usage_check': {'title': 'Β§ Section Symbol Usage', 'priority': 2},
        'caption_check_table': {'title': 'π Table Captions', 'priority': 3},
        'caption_check_figure': {'title': 'πΌοΈ Figure Captions', 'priority': 3},
        'table_figure_reference_check': {'title': 'π Table/Figure References', 'priority': 3},
        'document_title_check': {'title': 'π Document Title Format', 'priority': 1},
        'double_period_check': {'title': 'β‘ Double Periods', 'priority': 4},
        'spacing_check': {'title': 'β¨οΈ Spacing Issues', 'priority': 4},
        'abbreviation_usage_check': {'title': 'π Abbreviation Usage', 'priority': 3},
        'date_formats_check': {'title': 'π Date Formats', 'priority': 3},
        'placeholders_check': {'title': 'π© Placeholder Content', 'priority': 1}
    }
    # Order checks by priority; unknown check names sort last.
    sorted_checks = sorted(
        results.items(),
        key=lambda item: check_categories.get(item[0], {'priority': 999})['priority']
    )
    for check_name, result in sorted_checks:
        if result.success:
            continue  # Only failing checks get a report section.
        # Unknown checks get a title derived from their snake_case name.
        category = check_categories.get(
            check_name, {'title': check_name.replace('_', ' ').title()}
        )
        output.append(f"### {category['title']}")
        if isinstance(result.issues, list):
            # Cap output at 5 issues per check to keep the report readable.
            for issue in result.issues[:5]:
                if isinstance(issue, dict):
                    for key, value in issue.items():
                        if isinstance(value, list):
                            output.extend(f"- {item}" for item in value)
                        else:
                            output.append(f"- {key}: {value}")
                else:
                    output.append(f"- {issue}")
            if len(result.issues) > 5:
                output.append(f"\n*...and {len(result.issues) - 5} more similar issues*")
        output.append("")
    output.extend([
        "## π Summary and Recommendations",
        "",
        "### Priority Order for Fixes:",
        "1. π΄ Critical: Heading formats, required content, and document structure",
        "2. π‘ Important: Terminology, acronyms, and references",
        "3. π’ Standard: Formatting, spacing, and style consistency",
        "",
        "### Next Steps:",
        "1. Address issues in priority order",
        "2. Use search/replace for consistent fixes",
        "3. Re-run checker after making changes",
        "4. Update your document template if needed",
        ""
    ])
    return "\n".join(output)
def create_interface():
    """Create and configure the Gradio interface.

    Returns:
        A gr.Blocks app: upload/type controls in the left column, a Markdown
        results pane in the right column.
    """
    document_types = [
        "Advisory Circular",
        "Airworthiness Criteria",
        "Deviation Memo",
        "Exemption",
        "Federal Register Notice",
        "Order",
        "Policy Statement",
        "Rule",
        "Special Condition",
        "Technical Standard Order",
        "Other"
    ]
    template_types = ["Short AC template AC", "Long AC template AC"]
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # π Document Checker Tool
            ### Purpose
            This tool checks Word documents for compliance with U.S. federal documentation standards.
            ### How to Use
            1. Upload your Word document (.docx format)
            2. Select the document type
            3. Click "Check Document"
            > **Note:** Please ensure your document is clean (no track changes or comments)
            """
        )
        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="π Upload Word Document (.docx)",
                    file_types=[".docx"],
                    type="binary"
                )
                doc_type = gr.Dropdown(
                    choices=document_types,
                    label="π Document Type",
                    value="Advisory Circular",
                    info="Select the type of document you're checking"
                )
                template_type = gr.Radio(
                    choices=template_types,
                    label="π Template Type",
                    visible=False,
                    info="Only applicable for Advisory Circulars"
                )
                submit_btn = gr.Button(
                    "π Check Document",
                    variant="primary"
                )
            with gr.Column(scale=2):
                # BUG FIX: gr.Component is an abstract base class, not a
                # renderable output widget, and render=False meant the
                # component was never placed in the layout — so the click
                # handler had no working output. process_document returns a
                # string report, so display it in a Markdown pane.
                results = gr.Markdown(label="π Results")
        # Update template type visibility based on document type.
        def update_template_visibility(doc_type):
            return gr.update(visible=doc_type == "Advisory Circular")
        doc_type.change(
            fn=update_template_visibility,
            inputs=[doc_type],
            outputs=[template_type]
        )
        # Handle document processing.
        submit_btn.click(
            fn=process_document,
            inputs=[file_input, doc_type, template_type],
            outputs=[results]
        )
        gr.Markdown(
            """
            ### π Important Notes
            - This tool is in development; you may encounter false positives
            - For questions or feedback, contact Eric Putnam
            - Results are not stored or saved
            """
        )
    return demo
# Script entry point: build the app and serve it.
if __name__ == "__main__":
    # Configure root logging for the whole application run.
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )
    # Build the Gradio app and start the web server.
    app = create_interface()
    app.launch(
        debug=True,
        server_name="0.0.0.0",  # bind all interfaces so the container/host can reach it
        server_port=7860,       # default Gradio port
        share=False,            # set True to generate a public gradio.live link
    )