# Page header captured along with this file (not part of the program):
#   Hoctar77's picture | Update app.py | 736fba7 verified | raw | history blame | 48 kB
import gradio as gr
import logging
import re
import json
import time
from typing import Dict, List, Any, Tuple, Optional
from dataclasses import dataclass
from functools import wraps
from docx import Document
import io
import os
import traceback
@dataclass
class DocumentCheckResult:
    """Outcome of a single document check.

    Attributes:
        success: True when the check found nothing to report.
        issues: One dictionary per reported problem; empty on success.
        details: Optional supplementary data about the check (defaults to None).
    """

    success: bool
    issues: List[Dict[str, Any]]
    details: Optional[Dict[str, Any]] = None
def profile_performance(func):
    """Decorator that logs how long each call to ``func`` takes.

    The message is written at INFO level to the ``logger`` attribute of the
    first positional argument (normally ``self``) when that attribute exists
    and is not None; otherwise the module logger is used.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same signature and return value as ``func``.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        try:
            return func(*args, **kwargs)
        finally:
            # Runs on both success and failure so slow failing calls are
            # visible too; the exception (if any) still propagates.
            elapsed = time.perf_counter() - start_time
            # Guard against calls without positional arguments: indexing
            # args[0] unconditionally raised IndexError in that case.
            owner_logger = getattr(args[0], 'logger', None) if args else None
            logger = owner_logger if owner_logger is not None else logging.getLogger(__name__)
            logger.info(
                "Performance: %s took %.4f seconds", func.__name__, elapsed
            )
    return wrapper
class DocumentCheckerConfig:
    """Configuration management for document checks.

    Attributes:
        config: Effective settings: the built-in defaults, overlaid with the
            contents of the optional JSON config file.
        logger: Module logger configured from ``config['logging']``.
    """

    def __init__(self, config_path: Optional[str] = None):
        """Initialize configuration with optional config file.

        Args:
            config_path: Path to a JSON file whose keys override the defaults.
        """
        self.config = self._load_config(config_path)
        self.logger = self._setup_logger()

    def _load_config(self, config_path: Optional[str] = None) -> Dict[str, Any]:
        """Load configuration from JSON file or use default settings.

        Args:
            config_path: Optional path to a JSON file. It is ignored when it
                is falsy or does not exist.

        Returns:
            The default configuration with any user overrides merged in. When
            the file cannot be read, is not valid JSON, or does not contain a
            JSON object, a warning is logged and the defaults are returned.
        """
        default_config = {
            "logging": {
                "level": "INFO",
                "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            },
            "checks": {
                "acronyms": True,
                "terminology_check": True,
                "headings": True
            },
            "document_types": {
                "Advisory Circular": {
                    "required_headings": [
                        "Purpose.",
                        "Applicability.",
                        "Cancellation.",
                        "Related Material.",
                        "Definition of Key Terms."
                    ],
                    "skip_title_check": False
                },
                "Federal Register Notice": {
                    "required_headings": [
                        "Purpose of This Notice",
                        "Audience",
                        "Where can I Find This Notice"
                    ],
                    "skip_title_check": False
                },
                "Order": {
                    "required_headings": [
                        "Purpose of This Order.",
                        "Audience.",
                        "Where to Find This Order."
                    ],
                    "skip_title_check": False
                },
                "Policy Statement": {
                    "required_headings": [
                        "SUMMARY",
                        "CURRENT REGULATORY AND ADVISORY MATERIAL",
                        "RELEVANT PAST PRACTICE",
                        "POLICY",
                        "EFFECT OF POLICY",
                        "CONCLUSION"
                    ],
                    "skip_title_check": False
                },
                "Technical Standard Order": {
                    "required_headings": [
                        "PURPOSE.",
                        "APPLICABILITY.",
                        "REQUIREMENTS.",
                        "MARKING.",
                        "APPLICATION DATA REQUIREMENTS.",
                        "MANUFACTURER DATA REQUIREMENTS.",
                        "FURNISHED DATA REQUIREMENTS.",
                        "HOW TO GET REFERENCED DOCUMENTS."
                    ],
                    "skip_title_check": False
                },
                "Other": {
                    "required_headings": [],
                    "skip_title_check": True
                }
            }
        }
        if config_path and os.path.exists(config_path):
            try:
                with open(config_path, 'r', encoding='utf-8') as f:
                    user_config = json.load(f)
            except (ValueError, OSError) as e:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                logging.warning(f"Error loading config: {e}. Using default config.")
            else:
                if isinstance(user_config, dict):
                    self._deep_merge(default_config, user_config)
                else:
                    # A top-level list/scalar has no .items() and would crash the merge.
                    logging.warning(
                        "Error loading config: top-level JSON value is not an object. "
                        "Using default config."
                    )
        return default_config

    def _deep_merge(self, base: Dict[str, Any], update: Dict[str, Any]) -> Dict[str, Any]:
        """Recursively merge ``update`` into ``base`` in place.

        Nested dictionaries present on both sides are merged key by key; any
        other value in ``update`` replaces the one in ``base``.

        Returns:
            ``base``, after modification.
        """
        for key, value in update.items():
            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
                self._deep_merge(base[key], value)
            else:
                base[key] = value
        return base

    def _setup_logger(self) -> logging.Logger:
        """Set up and configure logging based on configuration.

        The logger is the shared module-level logger, so a console handler is
        attached only once and reused afterwards. Attaching a new handler for
        every instance made each message print once per instance created.

        Returns:
            The configured module logger.
        """
        logger = logging.getLogger(__name__)

        raw_level = self.config['logging']['level']
        if isinstance(raw_level, int):
            log_level = raw_level
        else:
            log_level = getattr(logging, str(raw_level).upper(), None)
        if not isinstance(log_level, int):
            logging.warning(f"Unknown log level {raw_level!r}. Using INFO.")
            log_level = logging.INFO

        # Find the handler installed by a previous instance, if any.
        console_handler = next(
            (h for h in logger.handlers if getattr(h, '_document_checker_console', False)),
            None
        )
        if console_handler is None:
            console_handler = logging.StreamHandler()
            console_handler._document_checker_console = True
            logger.addHandler(console_handler)

        # Always refresh format and level so the latest configuration applies.
        console_handler.setFormatter(logging.Formatter(self.config['logging']['format']))
        console_handler.setLevel(log_level)
        logger.setLevel(log_level)
        return logger
class DocumentChecker:
    """Common foundation for document checkers: configuration, logger, input helpers."""

    def __init__(self, config_path: Optional[str] = None):
        """Create the configuration manager and reuse its logger."""
        manager = DocumentCheckerConfig(config_path)
        self.config_manager = manager
        self.logger = manager.logger

    @staticmethod
    def validate_input(doc: List[str]) -> bool:
        """Return True only for a non-empty list."""
        if not isinstance(doc, list):
            return False
        return len(doc) > 0

    @classmethod
    def extract_paragraphs(cls, doc_path: str) -> List[str]:
        """Return the text of every paragraph that is not blank.

        Any failure while opening or reading the document is logged through
        the root logger and yields an empty list.
        """
        try:
            texts = []
            for paragraph in Document(doc_path).paragraphs:
                content = paragraph.text
                if content.strip():
                    texts.append(content)
            return texts
        except Exception as e:
            logging.error(f"Error extracting paragraphs: {e}")
            return []
class FAADocumentChecker(DocumentChecker):
    """Checker that applies FAA style rules to a Word document.

    Most checks take the document as a list of paragraph strings;
    ``document_title_check`` takes the .docx itself because it inspects run
    formatting. Every check returns a ``DocumentCheckResult`` whose ``issues``
    list is empty on success. Checks given anything other than a non-empty
    list return a failed result carrying ``{'error': 'Invalid document input'}``.
    """

    # Sentence boundary: whitespace that follows '.', '!' or '?'.
    _SENTENCE_SPLIT = re.compile(r'(?<=[.!?])\s+')

    # Whether the required headings of a document type must end with a period.
    _HEADING_PERIOD_REQUIRED = {
        "Advisory Circular": True,
        "Airworthiness Criteria": False,
        "Deviation Memo": False,
        "Exemption": False,
        "Federal Register Notice": False,
        "Order": True,
        "Policy Statement": False,
        "Rule": False,
        "Special Condition": False,
        "Technical Standard Order": True,
        "Other": False
    }

    # How a referenced AC title must be formatted, per document type.
    _TITLE_FORMATTING_RULES = {
        "Advisory Circular": {"italics": True, "quotes": False},
        "Airworthiness Criteria": {"italics": False, "quotes": True},
        "Deviation Memo": {"italics": False, "quotes": True},
        "Exemption": {"italics": False, "quotes": True},
        "Federal Register Notice": {"italics": False, "quotes": True},
        "Order": {"italics": False, "quotes": True},
        "Policy Statement": {"italics": False, "quotes": False},
        "Rule": {"italics": False, "quotes": True},
        "Special Condition": {"italics": False, "quotes": True},
        "Technical Standard Order": {"italics": False, "quotes": True},
        "Other": {"italics": False, "quotes": False}
    }

    # Straight and curly quote characters. Written as escapes so the curly
    # variants cannot be silently normalized to straight quotes (which had
    # turned the last two entries into the single string ', ').
    _QUOTE_CHARS = ('"', "'", '\u201c', '\u201d', '\u2018', '\u2019')

    def __init__(self, config_path: Optional[str] = None):
        """Initialize the checker; ``config_path`` is forwarded to ``DocumentChecker``."""
        super().__init__(config_path)

    @staticmethod
    def _invalid_input_result() -> DocumentCheckResult:
        """Build the failed result returned when the paragraph list is unusable."""
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])

    def _required_headings(self, doc_type: str) -> List[str]:
        """Return the configured required headings for ``doc_type`` (empty if none)."""
        checks = self.config_manager.config['document_types'].get(doc_type, {})
        return checks.get('required_headings', [])

    @profile_performance
    def heading_title_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
        """Check that every heading required for ``doc_type`` is present.

        A heading counts as present only when a whole paragraph, stripped of
        surrounding whitespace, equals the configured heading text exactly.

        Args:
            doc: List of document paragraphs.
            doc_type: Type of document being checked.

        Returns:
            Result with one ``missing_headings`` issue (listed in configured
            order) when any heading is absent; ``details`` holds the headings
            found and the headings required.
        """
        if not self.validate_input(doc):
            return self._invalid_input_result()

        required_headings = self._required_headings(doc_type)
        required_set = set(required_headings)
        headings_found = [para.strip() for para in doc if para.strip() in required_set]

        found_set = set(headings_found)
        # Iterate the configured list (deduplicated) rather than a set
        # difference so the reported order is stable between runs.
        missing_headings = [h for h in dict.fromkeys(required_headings) if h not in found_set]

        issues = [{'missing_headings': missing_headings}] if missing_headings else []
        return DocumentCheckResult(
            success=not missing_headings,
            issues=issues,
            details={
                'found_headings': headings_found,
                'required_headings': required_headings
            }
        )

    @profile_performance
    def heading_title_period_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
        """Check if headings end with periods according to document type requirements.

        Only paragraphs that exactly match a configured required heading are
        examined.

        Args:
            doc: List of document paragraphs.
            doc_type: Type of document being checked. Types without an entry
                in the period table are treated as not requiring periods.

        Returns:
            Result with one issue per heading whose trailing period is missing
            or unexpected; ``details`` lists every heading examined.
        """
        if not self.validate_input(doc):
            return self._invalid_input_result()

        should_have_period = self._HEADING_PERIOD_REQUIRED.get(doc_type, False)
        required_set = set(self._required_headings(doc_type))

        issues = []
        checked_headings = []
        for para in doc:
            heading = para.strip()
            if heading not in required_set:
                continue
            ends_with_period = heading.endswith('.')
            if should_have_period and not ends_with_period:
                issues.append({
                    'heading': heading,
                    'issue': 'missing_period',
                    'message': f"Heading should end with a period: '{heading}'"
                })
            elif ends_with_period and not should_have_period:
                issues.append({
                    'heading': heading,
                    'issue': 'unexpected_period',
                    'message': f"Heading should not end with a period: '{heading}'"
                })
            checked_headings.append({
                'heading': heading,
                'has_period': ends_with_period,
                'needs_period': should_have_period
            })

        return DocumentCheckResult(
            success=not issues,
            issues=issues,
            details={
                'document_type': doc_type,
                'periods_required': should_have_period,
                'checked_headings': checked_headings
            }
        )

    @profile_performance
    def acronym_check(self, doc: List[str]) -> DocumentCheckResult:
        """Report acronyms that are used but never defined as "Full Term (ACR)".

        An acronym is a run of two or more capital letters. Only the first
        sentence containing each undefined acronym is reported, and a
        definition found in a later paragraph clears an earlier report.

        Args:
            doc: List of document paragraphs.

        Returns:
            Result with one ``{'acronym', 'sentence'}`` issue per undefined acronym.
        """
        if not self.validate_input(doc):
            return self._invalid_input_result()

        acronym_pattern = re.compile(r'\b[A-Z]{2,}\b')
        defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')

        # Predefined acronyms. "14 CFR" alone had no effect: acronym_pattern
        # yields letters-only tokens, so the token seen for "14 CFR" is "CFR".
        defined_acronyms = {"14 CFR", "CFR"}
        first_occurrences = {}  # acronym -> issue for its first undefined use

        for paragraph in doc:
            # Definitions are processed first so an acronym defined in this
            # paragraph is not reported for its use in the same paragraph.
            for _full_term, acronym in defined_pattern.findall(paragraph):
                defined_acronyms.add(acronym)
                first_occurrences.pop(acronym, None)

            sentences = None  # split lazily; most paragraphs need no report
            for acronym in acronym_pattern.findall(paragraph):
                if acronym in defined_acronyms or acronym in first_occurrences:
                    continue
                if sentences is None:
                    sentences = self._SENTENCE_SPLIT.split(paragraph)
                sentence = next((s for s in sentences if acronym in s), None)
                if sentence is not None:
                    first_occurrences[acronym] = {
                        'acronym': acronym,
                        'sentence': sentence.strip()
                    }

        undefined_acronyms = list(first_occurrences.values())
        return DocumentCheckResult(success=not undefined_acronyms, issues=undefined_acronyms)

    @profile_performance
    def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
        """Check terminology for discouraged terms and prohibited phrases.

        1. Terms with a preferred replacement (every occurrence is reported).
        2. Prohibited phrases (at most one report per phrase per sentence).

        Args:
            doc: List of document paragraphs.

        Returns:
            Result with ``incorrect_term`` and ``prohibited_phrase`` issues.
        """
        if not self.validate_input(doc):
            return self._invalid_input_result()

        # Pattern -> preferred replacement.
        term_replacements = {
            r'\bUSC\b': 'U.S.C.',
            r'\bCFR Part\b': 'CFR part',
            # No trailing \b: after the final '.', a following space or end of
            # text is not a word boundary, so "C.F.R. part" never matched.
            r'\bC\.F\.R\.': 'CFR',
            r'\b14 CFR\s*§': '14 CFR',
            r'\bWe\b': 'The FAA',
            r'\bwe\b': 'the FAA',
            r'\bcancelled\b': 'canceled',
            r'\bshall\b': 'must',
            # NOTE(review): matches '&' only directly between word characters
            # (e.g. "R&D"), not " & " - confirm that this is intended.
            r'\b\&\b': 'and',
            r'\bflight crew\b': 'flightcrew'
        }
        prohibited_phrases = [
            r'\babove\b',
            r'\bbelow\b',
            r'(?:^|(?<=[.!?]\s))There\s+(?:is|are)\b'  # 'There is/are' at start of sentences
        ]
        term_patterns = [(re.compile(p), correct) for p, correct in term_replacements.items()]
        phrase_patterns = [re.compile(p, re.IGNORECASE) for p in prohibited_phrases]

        issues = []
        for paragraph in doc:
            for sentence in self._SENTENCE_SPLIT.split(paragraph):
                for pattern, correct_term in term_patterns:
                    for match in pattern.finditer(sentence):
                        issues.append({
                            'type': 'incorrect_term',
                            'incorrect_term': match.group(),
                            'correct_term': correct_term,
                            'sentence': sentence.strip()
                        })
                for pattern in phrase_patterns:
                    match = pattern.search(sentence)
                    if match:
                        issues.append({
                            'type': 'prohibited_phrase',
                            'phrase': match.group().strip(),
                            'sentence': sentence.strip()
                        })

        return DocumentCheckResult(success=not issues, issues=issues)

    @profile_performance
    def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
        """Check for various section symbol (§) usage issues.

        Reports sentences starting with '§', '14 CFR §' followed by a section
        number, a single '§' covering several sections joined by
        'and'/'or'/'through', and a missing '§' before the section after 'or'.

        Args:
            doc: List of document paragraphs.

        Returns:
            Result with at most one issue per category, each carrying the
            matching sentences or text fragments.
        """
        if not self.validate_input(doc):
            return self._invalid_input_result()

        # Pattern to find '14 CFR §25.25'
        pattern_14_cfr_section = re.compile(r'\b14 CFR §\s*\d+\.\d+\b')
        # (?<!§) keeps the second symbol of a correct '§§' from being read as
        # a lone '§' in front of a list of sections.
        single_symbol_patterns = [
            re.compile(r'(?<!§)§\s*\d+\.\d+\s+and\s+\d+\.\d+'),
            re.compile(r'(?<!§)§\s*\d+\.\d+\s+or\s+\d+\.\d+'),
            re.compile(r'(?<!§)§\s*\d+\.\d+\s+through\s+\d+\.\d+'),
        ]
        # The second section must NOT carry its own '§'. The former optional
        # '§?' also matched the correct form "§ 25.1 or § 25.2".
        pattern_missing_symbol_or = re.compile(r'§\s*\d+\.\d+\s+or\s+\d+\.\d+')

        sentences_starting_with_section_symbol = []
        incorrect_14_cfr_section_symbol_usage = []
        single_section_symbol_multiple_sections = []
        missing_section_symbol_in_multiple_sections = []

        for paragraph in doc:
            for sentence in self._SENTENCE_SPLIT.split(paragraph):
                stripped = sentence.strip()
                if stripped.startswith('§'):
                    sentences_starting_with_section_symbol.append(stripped)

            incorrect_14_cfr_section_symbol_usage.extend(
                pattern_14_cfr_section.findall(paragraph))
            for pattern in single_symbol_patterns:
                single_section_symbol_multiple_sections.extend(pattern.findall(paragraph))
            missing_section_symbol_in_multiple_sections.extend(
                pattern_missing_symbol_or.findall(paragraph))

        issues = []
        if sentences_starting_with_section_symbol:
            issues.append({
                'issue': 'sentences_starting_with_section_symbol',
                'sentences': sentences_starting_with_section_symbol
            })
        if incorrect_14_cfr_section_symbol_usage:
            issues.append({
                'issue': 'incorrect_14_CFR_section_symbol_usage',
                'matches': incorrect_14_cfr_section_symbol_usage
            })
        if single_section_symbol_multiple_sections:
            issues.append({
                'issue': 'single_section_symbol_multiple_sections',
                'matches': single_section_symbol_multiple_sections
            })
        if missing_section_symbol_in_multiple_sections:
            issues.append({
                'issue': 'missing_section_symbol_in_multiple_sections',
                'matches': missing_section_symbol_in_multiple_sections
            })

        return DocumentCheckResult(success=not issues, issues=issues)

    @profile_performance
    def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult:
        """Check for correctly formatted captions (Table or Figure).

        Advisory Circulars and Orders use "<caption_type> X-Y"; every other
        document type uses "<caption_type> X".

        Args:
            doc: List of document paragraphs.
            doc_type: Type of document being checked.
            caption_type: Caption keyword to look for, e.g. 'Table' or 'Figure'.

        Returns:
            Result with one issue per paragraph that starts with
            ``caption_type`` but does not follow the expected numbering.
        """
        if not self.validate_input(doc):
            return self._invalid_input_result()

        # Escape so the keyword is always matched literally.
        keyword = re.escape(caption_type)
        if doc_type in ["Advisory Circular", "Order"]:
            caption_pattern = re.compile(rf'^{keyword}\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]', re.IGNORECASE)
            correct_format = f"{caption_type} X-Y"
        else:
            caption_pattern = re.compile(rf'^{keyword}\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE)
            correct_format = f"{caption_type} X"

        incorrect_captions = []
        in_toc = False
        caption_prefix = caption_type.lower()
        for paragraph in doc:
            # "Contents" also covers "Table of Contents".
            if "Contents" in paragraph:
                in_toc = True
                continue
            # NOTE(review): a blank paragraph is the only thing that ends the
            # table of contents, but extract_paragraphs() and process_document()
            # drop blank paragraphs, so for their output every caption after
            # the first "Contents" paragraph is skipped - confirm intent.
            if in_toc and paragraph.strip() == "":
                in_toc = False
            if in_toc:
                continue

            paragraph_strip = paragraph.strip()
            if paragraph_strip.lower().startswith(caption_prefix):
                if not caption_pattern.match(paragraph_strip):
                    incorrect_captions.append({
                        'incorrect_caption': paragraph_strip,
                        'correct_format': correct_format
                    })

        return DocumentCheckResult(success=not incorrect_captions, issues=incorrect_captions)

    @staticmethod
    def _reference_case_issues(sentence: str, pattern, label: str,
                               start_format: str, mid_format: str) -> List[Dict[str, Any]]:
        """Return capitalization issues for ``label`` references found in ``sentence``.

        ``label`` is the capitalized keyword ('Table' or 'Figure'). A reference
        must be capitalized when nothing but whitespace precedes it in the
        sentence and lowercase otherwise.
        """
        found = []
        lowercase_label = label.lower()
        for match in pattern.finditer(sentence):
            ref = match.group()
            is_sentence_start = sentence[:match.start()].strip() == ""
            if is_sentence_start and not ref.startswith(label):
                found.append({
                    'incorrect_ref': ref,
                    'correct_format': start_format,
                    'sentence': sentence,
                    'issue': f"{label} reference at sentence start should be capitalized"
                })
            elif not is_sentence_start and not ref.startswith(lowercase_label):
                found.append({
                    'incorrect_ref': ref,
                    'correct_format': mid_format,
                    'sentence': sentence,
                    'issue': f"{label} reference within sentence should be lowercase"
                })
        return found

    @profile_performance
    def table_figure_reference_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
        """Check capitalization of references to tables and figures.

        References should be lowercase within sentences and capitalized at
        sentence start. Paragraphs that begin with "table" or "figure"
        (captions) are skipped.

        Args:
            doc: List of document paragraphs.
            doc_type: Type of document; Advisory Circulars and Orders use
                "X-Y" numbering, all other types plain "X".

        Returns:
            Result with one issue per wrongly capitalized reference.
        """
        if not self.validate_input(doc):
            return self._invalid_input_result()

        if doc_type in ["Advisory Circular", "Order"]:
            number_regex, number_label = r'\d+-\d+', "X-Y"
        else:
            number_regex, number_label = r'\d+', "X"

        table_ref_pattern = re.compile(rf'\b[Tt]able\s+{number_regex}\b')
        figure_ref_pattern = re.compile(rf'\b[Ff]igure\s+{number_regex}\b')

        incorrect_references = []
        for paragraph in doc:
            # Exclude captions
            if paragraph.strip().lower().startswith(('table', 'figure')):
                continue
            for sentence in self._SENTENCE_SPLIT.split(paragraph):
                sentence = sentence.strip()
                incorrect_references.extend(self._reference_case_issues(
                    sentence, table_ref_pattern, 'Table',
                    f"Table {number_label}", f"table {number_label}"))
                incorrect_references.extend(self._reference_case_issues(
                    sentence, figure_ref_pattern, 'Figure',
                    f"Figure {number_label}", f"figure {number_label}"))

        return DocumentCheckResult(success=not incorrect_references, issues=incorrect_references)

    @profile_performance
    def document_title_check(self, doc_path, doc_type: str) -> DocumentCheckResult:
        """Check for correct formatting of document titles.

        Looks for "AC <number> <title>" references and checks whether the
        title is italicized and/or quoted as required for ``doc_type``.

        Args:
            doc_path: Path to the .docx file, an ``io.BytesIO`` holding it, or
                its raw ``bytes``.
            doc_type: Type of document being checked. Unsupported types are
                logged and reported as passing.

        Returns:
            Result with one issue per wrongly formatted title. Any exception
            raised while reading the document is logged and returned as a
            failed result carrying the error text.
        """
        try:
            # Raw bytes are wrapped so they reach Document() as a file-like
            # object, like the other accepted inputs.
            if isinstance(doc_path, bytes):
                doc_path = io.BytesIO(doc_path)
            if not isinstance(doc_path, (str, io.BytesIO)):
                return DocumentCheckResult(
                    success=False,
                    issues=[{'error': 'Invalid document input type'}]
                )
            doc = Document(doc_path)

            if doc_type not in self._TITLE_FORMATTING_RULES:
                self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
                return DocumentCheckResult(success=True, issues=[])
            required_format = self._TITLE_FORMATTING_RULES[doc_type]

            # Group 1: "AC <number>" plus separators; group 2: the title, up
            # to the next period, comma or end of paragraph.
            ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')

            incorrect_titles = []
            for paragraph in doc.paragraphs:
                text = paragraph.text
                for match in ac_pattern.finditer(text):
                    title_text = match.group(2).strip()
                    title_start = match.start(2)
                    title_in_quotes = any(q in title_text for q in self._QUOTE_CHARS)

                    # The title counts as italic when the run containing its
                    # first character is italic.
                    title_is_italicized = False
                    run_start = 0
                    for run in paragraph.runs:
                        run_end = run_start + len(run.text)
                        if run_start <= title_start < run_end:
                            title_is_italicized = bool(run.italic)
                            break
                        run_start = run_end

                    issue_message = []
                    if required_format["italics"] and not title_is_italicized:
                        issue_message.append("should be italicized")
                    elif not required_format["italics"] and title_is_italicized:
                        issue_message.append("should not be italicized")
                    if required_format["quotes"] and not title_in_quotes:
                        issue_message.append("should be in quotes")
                    elif not required_format["quotes"] and title_in_quotes:
                        issue_message.append("should not be in quotes")

                    if issue_message:
                        incorrect_titles.append({
                            'text': title_text,
                            'issue': ', '.join(issue_message),
                            'sentence': text.strip()
                        })

            return DocumentCheckResult(
                success=not incorrect_titles,
                issues=incorrect_titles
            )
        except Exception as e:
            self.logger.error(f"Error in document_title_check: {e}")
            return DocumentCheckResult(
                success=False,
                issues=[{'error': str(e)}]
            )

    @profile_performance
    def double_period_check(self, doc: List[str]) -> DocumentCheckResult:
        """Check for sentences that end with two periods.

        Args:
            doc: List of document paragraphs.

        Returns:
            Result with one ``{'sentence'}`` issue per sentence ending in '..'.
        """
        if not self.validate_input(doc):
            return self._invalid_input_result()

        # This check splits on spaces only (not on any whitespace).
        sentence_split = re.compile(r'(?<=[.!?]) +')
        incorrect_sentences = [
            {'sentence': sentence.strip()}
            for paragraph in doc
            for sentence in sentence_split.split(paragraph)
            if sentence.endswith('..')
        ]
        return DocumentCheckResult(success=not incorrect_sentences, issues=incorrect_sentences)

    @profile_performance
    def spacing_check(self, doc: List[str]) -> DocumentCheckResult:
        """Check for correct spacing in the document.

        Args:
            doc: List of document paragraphs.

        Returns:
            Result with one issue per (sentence, rule) pair that matched.
        """
        if not self.validate_input(doc):
            return self._invalid_input_result()

        # The first three rules used to begin with (?<!\s), which prevented a
        # match whenever the token followed whitespace - the usual position -
        # so "see AC25-7" or "see §25.1" went unreported.
        patterns = [
            (re.compile(r'\b(?:AC|AD|CFR|FAA|N|SFAR)\d+[-]?\d*', re.IGNORECASE),
             "Missing space between document type and number"),
            (re.compile(r'§\d+\.\d+'),
             "Missing space after section symbol (§)"),
            (re.compile(r'\bPart\d+', re.IGNORECASE),
             "Missing space between 'Part' and number"),
            # NOTE(review): left unchanged - matches "(x" that is not preceded
            # by whitespace and not closed right after one character; confirm
            # this is the intended rule.
            (re.compile(r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))', re.IGNORECASE),
             "Missing space before paragraph indication"),
            (re.compile(r'\s{2,}'), "Double spaces between words")
        ]

        incorrect_spacing = []
        for paragraph in doc:
            for sentence in self._SENTENCE_SPLIT.split(paragraph):
                for pattern, issue in patterns:
                    if pattern.search(sentence):
                        incorrect_spacing.append({
                            'issue_description': issue,
                            'sentence': sentence.strip()
                        })

        return DocumentCheckResult(success=not incorrect_spacing, issues=incorrect_spacing)

    @profile_performance
    def check_abbreviation_usage(self, doc: List[str]) -> DocumentCheckResult:
        """Check for abbreviation consistency after first definition.

        Once a term has been defined as "Full Term (ACR)", later sentences
        that still contain the full term are reported.

        Args:
            doc: List of document paragraphs.

        Returns:
            Result with one ``{'full_term', 'acronym', 'sentence'}`` issue per
            repeated full term.
        """
        if not self.validate_input(doc):
            return self._invalid_input_result()

        # Definitions like "Federal Aviation Administration (FAA)"
        definition_pattern = re.compile(r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)')
        abbreviations = {}
        issues = []

        for paragraph in doc:
            for sentence in self._SENTENCE_SPLIT.split(paragraph):
                for full_term, acronym in definition_pattern.findall(sentence):
                    if acronym not in abbreviations:
                        abbreviations[acronym] = {"full_term": full_term.strip(), "defined": True}

                for acronym, data in abbreviations.items():
                    full_term = data["full_term"]
                    if full_term not in sentence:
                        continue
                    if data["defined"]:
                        # First sighting is the defining sentence itself.
                        data["defined"] = False
                    else:
                        issues.append({
                            'full_term': full_term,
                            'acronym': acronym,
                            'sentence': sentence.strip()
                        })

        return DocumentCheckResult(success=not issues, issues=issues)

    @profile_performance
    def check_date_formats(self, doc: List[str]) -> DocumentCheckResult:
        """Check for numeric date formats while ignoring aviation reference numbers.

        Dates written as MM/DD/YYYY, MM-DD-YYYY or YYYY-MM-DD are reported.
        Reference numbers such as "AD 2020-01-02" or "AMM 12-34-56" are masked
        first so they are not mistaken for dates.

        Args:
            doc: List of document paragraphs.

        Returns:
            Result with one ``{'date', 'issue', 'sentence'}`` issue per date found.
        """
        if not self.validate_input(doc):
            return self._invalid_input_result()

        # Patterns to ignore (aviation references)
        ignore_pattern = re.compile('|'.join([
            r'\bAD \d{4}-\d{2}-\d{2}\b',         # Airworthiness Directive references
            r'\bSWPM \d{2}-\d{2}-\d{2}\b',       # Standard Wiring Practices Manual references
            r'\bAMM \d{2}-\d{2}-\d{2}\b',        # Aircraft Maintenance Manual references
            r'\bSOPM \d{2}-\d{2}-\d{2}\b',       # Standard Operating Procedure references
            r'\b[A-Z]{2,4} \d{2}-\d{2}-\d{2}\b'  # Generic manual reference pattern
        ]))

        # Incorrect date patterns
        date_patterns = [
            (re.compile(r'(?<![\w/-])\d{1,2}/\d{1,2}/\d{2,4}(?![\w/-])'),
             "Use 'Month Day, Year' format instead of 'MM/DD/YYYY'"),
            (re.compile(r'(?<![\w/-])\d{1,2}-\d{1,2}-\d{2,4}(?![\w/-])'),
             "Use 'Month Day, Year' format instead of 'MM-DD-YYYY'"),
            (re.compile(r'(?<![\w/-])\d{4}-\d{1,2}-\d{1,2}(?![\w/-])'),
             "Use 'Month Day, Year' format instead of 'YYYY-MM-DD'")
        ]

        date_issues = []
        for paragraph in doc:
            for sentence in self._SENTENCE_SPLIT.split(paragraph):
                # Mask ignored references with same-length filler so match
                # positions still index into the original sentence.
                working_sentence = ignore_pattern.sub(lambda m: 'X' * len(m.group()), sentence)
                for pattern, issue in date_patterns:
                    for match in pattern.finditer(working_sentence):
                        date_issues.append({
                            'date': sentence[match.start():match.end()],
                            'issue': issue,
                            'sentence': sentence.strip()
                        })

        return DocumentCheckResult(success=not date_issues, issues=date_issues)

    @profile_performance
    def check_placeholders(self, doc: List[str]) -> DocumentCheckResult:
        """Check for placeholders that should be removed.

        Args:
            doc: List of document paragraphs.

        Returns:
            Result with one ``{'placeholder', 'sentence'}`` issue per
            placeholder phrase per sentence (matched case-insensitively).
        """
        if not self.validate_input(doc):
            return self._invalid_input_result()

        placeholder_patterns = [
            re.compile(phrase, re.IGNORECASE)
            for phrase in (r'\bTBD\b', r'\bTo be determined\b', r'\bTo be added\b')
        ]

        issues = []
        for paragraph in doc:
            for sentence in self._SENTENCE_SPLIT.split(paragraph):
                for pattern in placeholder_patterns:
                    match = pattern.search(sentence)
                    if match:
                        issues.append({
                            'placeholder': match.group().strip(),
                            'sentence': sentence.strip()
                        })

        return DocumentCheckResult(success=not issues, issues=issues)

    def run_all_checks(self, doc_path: str, doc_type: str, template_type: Optional[str] = None) -> Dict[str, DocumentCheckResult]:
        """Run all checks on the document.

        Args:
            doc_path: Path to the document.
            doc_type: Type of the document.
            template_type: Template type, if applicable. Accepted for
                interface compatibility; no check reads it.

        Returns:
            Dictionary of check names to results. The title check is reported
            as passing without being run when the document type's
            configuration sets ``skip_title_check``.
        """
        # Read the document
        doc = self.extract_paragraphs(doc_path)

        # Retrieve any specific flags
        checks_config = self.config_manager.config['document_types'].get(doc_type, {})
        skip_title_check = checks_config.get('skip_title_check', False)

        # Run checks
        results = {}
        results['heading_title_check'] = self.heading_title_check(doc, doc_type)
        results['heading_title_period_check'] = self.heading_title_period_check(doc, doc_type)
        results['acronym_check'] = self.acronym_check(doc)
        results['terminology_check'] = self.check_terminology(doc)
        results['section_symbol_usage_check'] = self.check_section_symbol_usage(doc)
        results['caption_check_table'] = self.caption_check(doc, doc_type, 'Table')
        results['caption_check_figure'] = self.caption_check(doc, doc_type, 'Figure')
        results['table_figure_reference_check'] = self.table_figure_reference_check(doc, doc_type)
        if not skip_title_check:
            results['document_title_check'] = self.document_title_check(doc_path, doc_type)
        else:
            results['document_title_check'] = DocumentCheckResult(success=True, issues=[])
        results['double_period_check'] = self.double_period_check(doc)
        results['spacing_check'] = self.spacing_check(doc)
        results['abbreviation_usage_check'] = self.check_abbreviation_usage(doc)
        results['date_formats_check'] = self.check_date_formats(doc)
        results['placeholders_check'] = self.check_placeholders(doc)
        return results
def process_document(file_obj, doc_type, template_type):
    """Process the uploaded document and run all checks.

    Args:
        file_obj: The uploaded .docx content: raw bytes (as delivered by the
            binary file input), a file-like object, or None when nothing was
            uploaded.
        doc_type: Document type selected in the UI.
        template_type: Template selection from the UI. Accepted because the UI
            passes it; no check reads it.

    Returns:
        Markdown text: the formatted check results, a prompt to upload a file
        when ``file_obj`` is None, or an error message if processing failed.
    """
    # Without this guard a missing upload reached Document(None) and then
    # failed on None.seek(), producing a confusing attribute error.
    if file_obj is None:
        return "Please upload a Word (.docx) document before running the check."

    try:
        # Convert raw bytes to a file-like object
        if isinstance(file_obj, (bytes, bytearray)):
            file_obj = io.BytesIO(file_obj)

        checker = FAADocumentChecker()
        doc = Document(file_obj)
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]

        # Rewind so the title check can read the document again from the start
        if hasattr(file_obj, 'seek'):
            file_obj.seek(0)

        # Run all checks; insertion order is the display order of the report
        results = {}
        results['heading_check'] = checker.heading_title_check(paragraphs, doc_type)
        results['heading_period_check'] = checker.heading_title_period_check(paragraphs, doc_type)
        results['acronym_check'] = checker.acronym_check(paragraphs)
        results['terminology_check'] = checker.check_terminology(paragraphs)
        results['section_symbol_check'] = checker.check_section_symbol_usage(paragraphs)
        results['table_caption_check'] = checker.caption_check(paragraphs, doc_type, 'Table')
        results['figure_caption_check'] = checker.caption_check(paragraphs, doc_type, 'Figure')
        results['references_check'] = checker.table_figure_reference_check(paragraphs, doc_type)
        results['title_check'] = checker.document_title_check(file_obj, doc_type)
        results['double_period_check'] = checker.double_period_check(paragraphs)
        results['spacing_check'] = checker.spacing_check(paragraphs)
        results['abbreviation_check'] = checker.check_abbreviation_usage(paragraphs)
        results['date_check'] = checker.check_date_formats(paragraphs)
        results['placeholder_check'] = checker.check_placeholders(paragraphs)
        return format_results_for_gradio(results, doc_type)
    except Exception as e:
        # Top-level UI boundary: report the failure to the user instead of crashing.
        print(f"Error in process_document: {str(e)}")
        traceback.print_exc()  # This will print the full traceback
        return f"An error occurred while processing the document: {str(e)}"
def format_results_for_gradio(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
    """Render check results as a Markdown report for the Gradio output pane.

    Each check becomes a second-level heading followed by a pass line or a
    bullet list of its issues, then any additional details. ``doc_type`` is
    accepted but does not influence the output.
    """
    # Display titles for known check names; unknown names are title-cased.
    display_names = {
        'heading_check': "Required Headings Check",
        'heading_period_check': "Heading Period Check",
        'acronym_check': "Acronym Check",
        'terminology_check': "Terminology Check",
        'section_symbol_check': "Section Symbol Usage",
        'table_caption_check': "Table Caption Format",
        'figure_caption_check': "Figure Caption Format",
        'references_check': "Table and Figure References",
        'title_check': "Document Title Style",
        'double_period_check': "Double Period Check",
        'spacing_check': "Spacing Check",
        'abbreviation_check': "Abbreviation Usage",
        'date_check': "Date Format Check",
        'placeholder_check': "Placeholder Check"
    }

    lines = ["# Document Check Results\n"]
    for name, outcome in results.items():
        if name in display_names:
            heading = display_names[name]
        else:
            heading = name.replace('_', ' ').title()
        lines.append(f"## {heading}")

        if not outcome.success:
            lines.append("❌ Issues found:")
            for entry in outcome.issues:
                if not isinstance(entry, dict):
                    lines.append(f"- {entry}")
                    continue
                for field_name, field_value in entry.items():
                    if isinstance(field_value, list):
                        # List values are flattened to one bullet per element.
                        lines.extend(f"- {element}" for element in field_value)
                    else:
                        lines.append(f"- {field_name}: {field_value}")
            lines.append("")
        else:
            lines.append("✅ All checks passed.\n")

        if outcome.details:
            lines.append("Additional Details:")
            for detail_name, detail_value in outcome.details.items():
                if isinstance(detail_value, list):
                    lines.append(f"- {detail_name}:")
                    lines.extend(f" - {element}" for element in detail_value)
                else:
                    lines.append(f"- {detail_name}: {detail_value}")
            lines.append("")

    return "\n".join(lines)
# Build the Gradio Blocks UI: upload controls on the left, results on the right
with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
    # Introductory text shown above the controls, in display order
    for intro_text in (
        "# Document Checker Tool",
        "Upload a Word (docx) document to check for compliance with U.S. federal documentation standards.",
        "*This tool is still in development and you might get false positives in your results*",
        "Contact Eric Putnam if you have questions and comments.",
        "\n1. Upload a clean (no track changes or comments) Word file.\n2. Choose **Check Document**.",
    ):
        gr.Markdown(intro_text)

    document_types = [
        "Advisory Circular", "Airworthiness Criteria", "Deviation Memo", "Exemption",
        "Federal Register Notice", "Order", "Policy Statement",
        "Rule", "Special Condition", "Technical Standard Order", "Other"
    ]
    template_types = ["Short AC template AC", "Long AC template AC"]

    with gr.Row():
        # Left column: inputs
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload Word Document (.docx)",
                file_types=[".docx"],
                type="binary"
            )
            doc_type = gr.Dropdown(
                choices=document_types,
                label="Document Type",
                value="Advisory Circular"
            )
            template_type = gr.Radio(
                choices=template_types,
                label="Template Type (Only for Advisory Circular)",
                visible=True,
                value="Short AC template AC"
            )
            submit_btn = gr.Button("Check Document", variant="primary")
        # Right column: report
        with gr.Column(scale=2):
            output = gr.Markdown(
                label="Check Results",
                value="Results will appear here after processing..."
            )

    def update_template_visibility(selected_type):
        """Show the template selector only for Advisory Circulars."""
        show_selector = selected_type == "Advisory Circular"
        return gr.update(visible=show_selector)

    # Toggle the template selector whenever the document type changes
    doc_type.change(
        fn=update_template_visibility,
        inputs=[doc_type],
        outputs=[template_type]
    )
    # Run every check on the uploaded file and display the report
    submit_btn.click(
        fn=process_document,
        inputs=[file_input, doc_type, template_type],
        outputs=[output]
    )

# Start the app when executed as a script
if __name__ == "__main__":
    demo.launch()