import gradio as gr
import logging
import re
import json
import time
from typing import Dict, List, Any, Tuple, Optional
from dataclasses import dataclass
from functools import wraps
from docx import Document
import io
import os
import traceback
@dataclass
class DocumentCheckResult:
    """Structured result for document checks."""
    success: bool
    issues: List[Dict[str, Any]]
    details: Optional[Dict[str, Any]] = None
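# Illustrative sketch (not part of the original app): how a check result is
# typically constructed and consumed by a caller. The field values are made up.
#
#     result = DocumentCheckResult(success=False,
#                                   issues=[{'error': 'Missing heading'}])
#     if not result.success:
#         for issue in result.issues:
#             print(issue)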
def profile_performance(func):
    """Decorator to profile function performance."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        # Get logger from the class instance (first argument), if one exists
        logger = args[0].logger if args and hasattr(args[0], 'logger') else logging.getLogger(__name__)
        logger.info(
            f"Performance: {func.__name__} took {end_time - start_time:.4f} seconds"
        )
        return result
    return wrapper
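# Illustrative sketch (assumed usage, not from the original source): the
# decorator wraps checker methods so each call's runtime is logged through the
# instance's logger. ExampleChecker and slow_check are hypothetical names.
#
#     class ExampleChecker:
#         def __init__(self):
#             self.logger = logging.getLogger(__name__)
#
#         @profile_performance
#         def slow_check(self, doc):
#             time.sleep(0.1)
#             return DocumentCheckResult(success=True, issues=[])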
class DocumentCheckerConfig:
    """Configuration management for document checks."""

    def __init__(self, config_path: Optional[str] = None):
        """Initialize configuration with optional config file."""
        self.config = self._load_config(config_path)
        self.logger = self._setup_logger()

    def _load_config(self, config_path: Optional[str] = None) -> Dict[str, Any]:
        """Load configuration from JSON file or use default settings."""
        default_config = {
            "logging": {
                "level": "INFO",
                "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            },
            "checks": {
                "acronyms": True,
                "terminology_check": True,
                "headings": True
            },
            "document_types": {
                "Advisory Circular": {
                    "required_headings": [
                        "Purpose.",
                        "Applicability.",
                        "Cancellation.",
                        "Related Material.",
                        "Definition of Key Terms."
                    ],
                    "skip_title_check": False
                },
                "Federal Register Notice": {
                    "required_headings": [
                        "Purpose of This Notice",
                        "Audience",
                        "Where can I Find This Notice"
                    ],
                    "skip_title_check": False
                },
                "Order": {
                    "required_headings": [
                        "Purpose of This Order.",
                        "Audience.",
                        "Where to Find This Order."
                    ],
                    "skip_title_check": False
                },
                "Policy Statement": {
                    "required_headings": [
                        "SUMMARY",
                        "CURRENT REGULATORY AND ADVISORY MATERIAL",
                        "RELEVANT PAST PRACTICE",
                        "POLICY",
                        "EFFECT OF POLICY",
                        "CONCLUSION"
                    ],
                    "skip_title_check": False
                },
                "Technical Standard Order": {
                    "required_headings": [
                        "PURPOSE.",
                        "APPLICABILITY.",
                        "REQUIREMENTS.",
                        "MARKING.",
                        "APPLICATION DATA REQUIREMENTS.",
                        "MANUFACTURER DATA REQUIREMENTS.",
                        "FURNISHED DATA REQUIREMENTS.",
                        "HOW TO GET REFERENCED DOCUMENTS."
                    ],
                    "skip_title_check": False
                },
                "Other": {
                    "required_headings": [],
                    "skip_title_check": True
                }
            }
        }
        if config_path and os.path.exists(config_path):
            try:
                with open(config_path, 'r') as f:
                    user_config = json.load(f)
                self._deep_merge(default_config, user_config)
            except (json.JSONDecodeError, IOError) as e:
                logging.warning(f"Error loading config: {e}. Using default config.")
        return default_config
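    # Illustrative sketch (hypothetical file, not shipped with the app): a JSON
    # config passed via config_path only needs the keys it overrides, because
    # _deep_merge folds it into the defaults above, for example:
    #
    #     {
    #       "logging": {"level": "DEBUG"}
    #     }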
    def _deep_merge(self, base: Dict[str, Any], update: Dict[str, Any]) -> Dict[str, Any]:
        """Recursively merge two dictionaries."""
        for key, value in update.items():
            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
                self._deep_merge(base[key], value)
            else:
                base[key] = value
        return base
    def _setup_logger(self) -> logging.Logger:
        """Set up and configure logging based on configuration."""
        logger = logging.getLogger(__name__)
        log_level = getattr(logging, self.config['logging']['level'].upper())
        formatter = logging.Formatter(self.config['logging']['format'])
        # Avoid attaching duplicate handlers when the checker is instantiated more than once
        if not logger.handlers:
            console_handler = logging.StreamHandler()
            console_handler.setFormatter(formatter)
            console_handler.setLevel(log_level)
            logger.addHandler(console_handler)
        logger.setLevel(log_level)
        return logger
class DocumentChecker:
    """Base class for document checking."""

    def __init__(self, config_path: Optional[str] = None):
        self.config_manager = DocumentCheckerConfig(config_path)
        self.logger = self.config_manager.logger

    @staticmethod
    def validate_input(doc: List[str]) -> bool:
        """Validate input document."""
        return doc is not None and isinstance(doc, list) and len(doc) > 0

    @classmethod
    def extract_paragraphs(cls, doc_path: str) -> List[str]:
        """Extract plain text paragraphs from a document."""
        try:
            doc = Document(doc_path)
            return [para.text for para in doc.paragraphs if para.text.strip()]
        except Exception as e:
            logging.error(f"Error extracting paragraphs: {e}")
            return []
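# Illustrative sketch (hypothetical path): extracting paragraphs from a .docx
# file before running the text-based checks.
#
#     checker = DocumentChecker()
#     paragraphs = DocumentChecker.extract_paragraphs("draft_ac.docx")
#     if checker.validate_input(paragraphs):
#         print(f"{len(paragraphs)} non-empty paragraphs extracted")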
class FAADocumentChecker(DocumentChecker):
    def __init__(self, config_path: Optional[str] = None):
        super().__init__(config_path)

    # Apply the profiling decorator directly to individual checks
    @profile_performance
    def heading_title_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
        """Check headings for a specific document type."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        # Use configuration for document-specific headings
        checks = self.config_manager.config['document_types'].get(
            doc_type, {}
        )
        required_headings = checks.get('required_headings', [])
        headings_found = []
        # Create a set for faster lookup
        required_headings_set = set(required_headings)
        for para in doc:
            para_strip = para.strip()
            # Check if the paragraph is in the required headings list
            if para_strip in required_headings_set:
                headings_found.append(para_strip)
        # Check if all required headings are found
        all_headings_present = set(headings_found) == required_headings_set
        issues = []
        if not all_headings_present:
            missing_headings = required_headings_set - set(headings_found)
            issues.append({'missing_headings': list(missing_headings)})
        return DocumentCheckResult(
            success=all_headings_present,
            issues=issues,
            details={
                'found_headings': headings_found,
                'required_headings': required_headings
            }
        )
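    # Illustrative sketch (made-up paragraphs, assuming checker = FAADocumentChecker()):
    # any configured Advisory Circular heading that never appears verbatim is
    # reported as missing.
    #
    #     doc = ["Purpose.", "Applicability.", "This AC describes one acceptable means."]
    #     result = checker.heading_title_check(doc, "Advisory Circular")
    #     # result.success is False; result.issues holds a single dict whose
    #     # 'missing_headings' list names the three headings not found.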
    def heading_title_period_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
        """
        Check if headings end with periods according to document type requirements.

        Args:
            doc (List[str]): List of document paragraphs
            doc_type (str): Type of document being checked

        Returns:
            DocumentCheckResult: Result of the heading period check
        """
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        # Define document types requiring periods in headings
        period_required = {
            "Advisory Circular": True,
            "Airworthiness Criteria": False,
            "Deviation Memo": False,
            "Exemption": False,
            "Federal Register Notice": False,
            "Order": True,
            "Policy Statement": False,
            "Rule": False,
            "Special Condition": False,
            "Technical Standard Order": True,
            "Other": False
        }
        # Get whether periods are required for this document type
        should_have_period = period_required.get(doc_type, False)
        # Get the headings configuration for this document type
        checks = self.config_manager.config['document_types'].get(doc_type, {})
        required_headings = checks.get('required_headings', [])
        required_headings_set = set(required_headings)
        issues = []
        checked_headings = []
        for para in doc:
            para_strip = para.strip()
            # Check only if paragraph is a heading
            if para_strip in required_headings_set:
                ends_with_period = para_strip.endswith('.')
                if should_have_period and not ends_with_period:
                    issues.append({
                        'heading': para_strip,
                        'issue': 'missing_period',
                        'message': f"Heading should end with a period: '{para_strip}'"
                    })
                    checked_headings.append({
                        'heading': para_strip,
                        'has_period': False,
                        'needs_period': True
                    })
                elif not should_have_period and ends_with_period:
                    issues.append({
                        'heading': para_strip,
                        'issue': 'unexpected_period',
                        'message': f"Heading should not end with a period: '{para_strip}'"
                    })
                    checked_headings.append({
                        'heading': para_strip,
                        'has_period': True,
                        'needs_period': False
                    })
                else:
                    checked_headings.append({
                        'heading': para_strip,
                        'has_period': ends_with_period,
                        'needs_period': should_have_period
                    })
        success = len(issues) == 0
        return DocumentCheckResult(
            success=success,
            issues=issues,
            details={
                'document_type': doc_type,
                'periods_required': should_have_period,
                'checked_headings': checked_headings
            }
        )
    def acronym_check(self, doc: List[str]) -> DocumentCheckResult:
        """Check if acronyms are defined at their first use, only flagging the first instance of undefined acronyms."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        defined_acronyms = set()
        first_occurrences = {}  # Track first occurrence of each acronym
        undefined_acronyms = []
        acronym_pattern = re.compile(r'\b[A-Z]{2,}\b')
        defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
        # Predefined acronyms
        defined_acronyms.add("14 CFR")
        for paragraph in doc:
            # Check for definitions first
            defined_matches = defined_pattern.findall(paragraph)
            for full_term, acronym in defined_matches:
                defined_acronyms.add(acronym)
                # If this was previously marked as undefined, remove it since we found its definition
                if acronym in first_occurrences:
                    del first_occurrences[acronym]
            # Check for acronyms in the paragraph
            usage_matches = acronym_pattern.findall(paragraph)
            for acronym in usage_matches:
                if acronym not in defined_acronyms:
                    # Only process if we haven't seen this acronym before
                    if acronym not in first_occurrences:
                        # Find the sentence containing the first undefined acronym
                        sentences = re.split(r'(?<=[.!?])\s+', paragraph)
                        for sentence in sentences:
                            if acronym in sentence:
                                first_occurrences[acronym] = {
                                    'acronym': acronym,
                                    'sentence': sentence.strip()
                                }
                                break
        # Convert first occurrences to list of issues
        undefined_acronyms = list(first_occurrences.values())
        success = len(undefined_acronyms) == 0
        issues = undefined_acronyms if not success else []
        return DocumentCheckResult(success=success, issues=issues)
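    # Illustrative sketch (made-up text, assuming checker = FAADocumentChecker()):
    # "Federal Aviation Administration (FAA)" defines FAA, so later bare uses of
    # FAA are not flagged, while an acronym such as TSO used without a definition
    # is reported once with the first sentence that contains it.
    #
    #     doc = ["The Federal Aviation Administration (FAA) issued guidance.",
    #            "The TSO applies to all FAA projects."]
    #     result = checker.acronym_check(doc)
    #     # result.issues -> [{'acronym': 'TSO',
    #     #                    'sentence': 'The TSO applies to all FAA projects.'}]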
    def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
        """
        Check document terminology for:
        1. Legal reference formatting and preferred terms
        2. Prohibited phrases and constructions
        """
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        # Dictionary of terms that should be replaced with preferred alternatives
        term_replacements = {
            r'\bUSC\b': 'U.S.C.',
            r'\bCFR Part\b': 'CFR part',
            r'\bC\.F\.R\.\b': 'CFR',
            r'\b14 CFR\s*§': '14 CFR',
            r'\bWe\b': 'The FAA',
            r'\bwe\b': 'the FAA',
            r'\bcancelled\b': 'canceled',
            r'\bshall\b': 'must',
            r'\b\&\b': 'and',
            r'\bflight crew\b': 'flightcrew'
        }
        # Prohibited phrases that should be flagged
        prohibited_phrases = [
            r'\babove\b',
            r'\bbelow\b',
            r'(?:^|(?<=[.!?]\s))There\s+(?:is|are)\b'  # Matches 'There is/are' at start of sentences
        ]
        issues = []
        for paragraph in doc:
            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
            for sentence in sentences:
                # Check for incorrect terms that need replacement
                for incorrect_pattern, correct_term in term_replacements.items():
                    matches = re.finditer(incorrect_pattern, sentence)
                    for match in matches:
                        incorrect_term = match.group()
                        issues.append({
                            'type': 'incorrect_term',
                            'incorrect_term': incorrect_term,
                            'correct_term': correct_term,
                            'sentence': sentence.strip()
                        })
                # Check for prohibited phrases
                for phrase_pattern in prohibited_phrases:
                    match = re.search(phrase_pattern, sentence, re.IGNORECASE)
                    if match:
                        issues.append({
                            'type': 'prohibited_phrase',
                            'phrase': match.group().strip(),
                            'sentence': sentence.strip()
                        })
        success = len(issues) == 0
        return DocumentCheckResult(success=success, issues=issues)
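    # Illustrative sketch (made-up sentence): "We shall comply with 14 CFR part 25."
    # would be flagged twice, once for "We" (preferred: "The FAA") and once for
    # "shall" (preferred: "must").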
    def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
        """Check for various section symbol (§) usage issues."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        issues = []
        # Patterns to identify issues
        sentences_starting_with_section_symbol = []
        incorrect_14_CFR_section_symbol_usage = []
        single_section_symbol_multiple_sections = []
        missing_section_symbol_in_multiple_sections = []
        # Pattern to find '14 CFR §25.25'
        pattern_14_CFR_section = re.compile(r'\b14 CFR §\s*\d+\.\d+\b')
        # Patterns for multiple sections with single '§'
        pattern_single_section_symbol_and = re.compile(r'§\s*\d+\.\d+\s+and\s+\d+\.\d+')
        pattern_single_section_symbol_or = re.compile(r'§\s*\d+\.\d+\s+or\s+\d+\.\d+')
        pattern_single_section_symbol_through = re.compile(r'§\s*\d+\.\d+\s+through\s+\d+\.\d+')
        # Pattern for missing '§' before subsequent sections with 'or'
        pattern_missing_section_symbol_or = re.compile(r'§\s*\d+\.\d+\s+or\s+§?\s*\d+\.\d+')
        for paragraph in doc:
            # Check for sentences starting with '§'
            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
            for sentence in sentences:
                if sentence.strip().startswith('§'):
                    sentences_starting_with_section_symbol.append(sentence.strip())
            # Check for '14 CFR §25.25' usage
            matches_14_CFR = pattern_14_CFR_section.findall(paragraph)
            for match in matches_14_CFR:
                incorrect_14_CFR_section_symbol_usage.append(match)
            # Check for single '§' with multiple sections using 'and'
            matches_and = pattern_single_section_symbol_and.findall(paragraph)
            for match in matches_and:
                single_section_symbol_multiple_sections.append(match)
            # Check for single '§' with multiple sections using 'or'
            matches_or = pattern_single_section_symbol_or.findall(paragraph)
            for match in matches_or:
                single_section_symbol_multiple_sections.append(match)
            # Check for single '§' with multiple sections using 'through'
            matches_through = pattern_single_section_symbol_through.findall(paragraph)
            for match in matches_through:
                single_section_symbol_multiple_sections.append(match)
            # Check for missing '§' before subsequent sections with 'or'
            matches_missing_or = pattern_missing_section_symbol_or.findall(paragraph)
            for match in matches_missing_or:
                missing_section_symbol_in_multiple_sections.append(match)
        if sentences_starting_with_section_symbol:
            issues.append({
                'issue': 'sentences_starting_with_section_symbol',
                'sentences': sentences_starting_with_section_symbol
            })
        if incorrect_14_CFR_section_symbol_usage:
            issues.append({
                'issue': 'incorrect_14_CFR_section_symbol_usage',
                'matches': incorrect_14_CFR_section_symbol_usage
            })
        if single_section_symbol_multiple_sections:
            issues.append({
                'issue': 'single_section_symbol_multiple_sections',
                'matches': single_section_symbol_multiple_sections
            })
        if missing_section_symbol_in_multiple_sections:
            issues.append({
                'issue': 'missing_section_symbol_in_multiple_sections',
                'matches': missing_section_symbol_in_multiple_sections
            })
        success = len(issues) == 0
        return DocumentCheckResult(success=success, issues=issues)
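    # Illustrative sketch (made-up references): a phrase like "14 CFR § 25.853"
    # is reported as incorrect 14 CFR section-symbol usage, and "§ 25.853 and
    # 25.855" is reported because a single section symbol precedes multiple
    # section numbers.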
    def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult:
        """Check for correctly formatted captions (Table or Figure)."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        # Determine the caption pattern based on document type
        if doc_type in ["Advisory Circular", "Order"]:
            caption_pattern = re.compile(rf'^{caption_type}\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]', re.IGNORECASE)
            correct_format = f"{caption_type} X-Y"
        else:
            caption_pattern = re.compile(rf'^{caption_type}\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE)
            correct_format = f"{caption_type} X"
        incorrect_captions = []
        in_toc = False
        for paragraph in doc:
            # Check for start or end of Table of Contents (TOC)
            if "Table of Contents" in paragraph or "Contents" in paragraph:
                in_toc = True
                continue
            elif in_toc and paragraph.strip() == "":
                in_toc = False  # Assume blank line marks the end of TOC
            # If within TOC, skip this paragraph
            if in_toc:
                continue
            # Only check paragraphs that start with "Table" or "Figure" for proper caption format
            paragraph_strip = paragraph.strip()
            if paragraph_strip.lower().startswith(caption_type.lower()):
                if not caption_pattern.match(paragraph_strip):
                    incorrect_captions.append({
                        'incorrect_caption': paragraph_strip,
                        'correct_format': correct_format
                    })
        success = len(incorrect_captions) == 0
        return DocumentCheckResult(success=success, issues=incorrect_captions)
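    # Illustrative sketch (made-up captions): in an Advisory Circular or Order,
    # "Table 3-1. Wire Sizes" matches the expected "Table X-Y" pattern, while a
    # caption written as "Table 3. Wire Sizes" would be reported with the
    # suggested "Table X-Y" format.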
    def table_figure_reference_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
        """
        Check for incorrect references to tables and figures in the document.
        References should be lowercase within sentences and capitalized at sentence start.
        """
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        incorrect_references = []
        # Define patterns based on document type
        if doc_type in ["Advisory Circular", "Order"]:
            # Matches both capitalized and lowercase variations
            table_pattern = r'\b[Tt]able\s+\d+-\d+\b'
            figure_pattern = r'\b[Ff]igure\s+\d+-\d+\b'
            correct_mid_table_format = "table X-Y"
            correct_start_table_format = "Table X-Y"
            correct_mid_figure_format = "figure X-Y"
            correct_start_figure_format = "Figure X-Y"
        else:
            table_pattern = r'\b[Tt]able\s+\d+\b'
            figure_pattern = r'\b[Ff]igure\s+\d+\b'
            correct_mid_table_format = "table X"
            correct_start_table_format = "Table X"
            correct_mid_figure_format = "figure X"
            correct_start_figure_format = "Figure X"
        table_ref_pattern = re.compile(table_pattern)
        figure_ref_pattern = re.compile(figure_pattern)
        for paragraph in doc:
            paragraph_strip = paragraph.strip()
            # Exclude captions
            starts_with_table_or_figure = paragraph_strip.lower().startswith('table') or paragraph_strip.lower().startswith('figure')
            if not starts_with_table_or_figure:
                # Split into sentences while preserving the original text
                sentences = re.split(r'(?<=[.!?])\s+', paragraph)
                for sentence in sentences:
                    sentence = sentence.strip()
                    # Check table references
                    matches = table_ref_pattern.finditer(sentence)
                    for match in matches:
                        ref = match.group()
                        # Get the text before the reference
                        text_before = sentence[:match.start()].strip()
                        # Determine if reference is at start of sentence
                        is_sentence_start = text_before == ""
                        # Check if capitalization is correct
                        if is_sentence_start and not ref.startswith('Table'):
                            incorrect_references.append({
                                'incorrect_ref': ref,
                                'correct_format': correct_start_table_format,
                                'sentence': sentence,
                                'issue': "Table reference at sentence start should be capitalized"
                            })
                        elif not is_sentence_start and not ref.startswith('table'):
                            incorrect_references.append({
                                'incorrect_ref': ref,
                                'correct_format': correct_mid_table_format,
                                'sentence': sentence,
                                'issue': "Table reference within sentence should be lowercase"
                            })
                    # Check figure references
                    matches = figure_ref_pattern.finditer(sentence)
                    for match in matches:
                        ref = match.group()
                        # Get the text before the reference
                        text_before = sentence[:match.start()].strip()
                        # Determine if reference is at start of sentence
                        is_sentence_start = text_before == ""
                        # Check if capitalization is correct
                        if is_sentence_start and not ref.startswith('Figure'):
                            incorrect_references.append({
                                'incorrect_ref': ref,
                                'correct_format': correct_start_figure_format,
                                'sentence': sentence,
                                'issue': "Figure reference at sentence start should be capitalized"
                            })
                        elif not is_sentence_start and not ref.startswith('figure'):
                            incorrect_references.append({
                                'incorrect_ref': ref,
                                'correct_format': correct_mid_figure_format,
                                'sentence': sentence,
                                'issue': "Figure reference within sentence should be lowercase"
                            })
        success = len(incorrect_references) == 0
        return DocumentCheckResult(success=success, issues=incorrect_references)
    def document_title_check(self, doc_path, doc_type: str) -> DocumentCheckResult:
        """Check for correct formatting of document titles."""
        try:
            # Handle both file paths and BytesIO objects
            if isinstance(doc_path, (str, bytes, io.BytesIO)):
                doc = Document(doc_path)
            else:
                return DocumentCheckResult(
                    success=False,
                    issues=[{'error': 'Invalid document input type'}]
                )
            # Rest of the method remains the same
            incorrect_titles = []
            # Define formatting rules for different document types
            formatting_rules = {
                "Advisory Circular": {"italics": True, "quotes": False},
                "Airworthiness Criteria": {"italics": False, "quotes": True},
                "Deviation Memo": {"italics": False, "quotes": True},
                "Exemption": {"italics": False, "quotes": True},
                "Federal Register Notice": {"italics": False, "quotes": True},
                "Order": {"italics": False, "quotes": True},
                "Policy Statement": {"italics": False, "quotes": False},
                "Rule": {"italics": False, "quotes": True},
                "Special Condition": {"italics": False, "quotes": True},
                "Technical Standard Order": {"italics": False, "quotes": True},
                "Other": {"italics": False, "quotes": False}
            }
            if doc_type not in formatting_rules:
                self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
                return DocumentCheckResult(success=True, issues=[])
            required_format = formatting_rules[doc_type]
            ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')
            for paragraph in doc.paragraphs:
                text = paragraph.text
                matches = ac_pattern.finditer(text)
                for match in matches:
                    full_match = match.group(0)
                    title_text = match.group(2).strip()
                    title_start = match.start(2)
                    title_end = match.end(2)
                    # Straight and curly quotation marks
                    title_in_quotes = any(q in title_text for q in ['"', "'", '\u201c', '\u201d', '\u2018', '\u2019'])
                    title_is_italicized = False
                    current_pos = 0
                    for run in paragraph.runs:
                        run_length = len(run.text)
                        run_start = current_pos
                        run_end = current_pos + run_length
                        if run_start <= title_start < run_end:
                            title_is_italicized = run.italic
                            break
                        current_pos += run_length
                    formatting_incorrect = False
                    issue_message = []
                    if required_format["italics"] and not title_is_italicized:
                        formatting_incorrect = True
                        issue_message.append("should be italicized")
                    elif not required_format["italics"] and title_is_italicized:
                        formatting_incorrect = True
                        issue_message.append("should not be italicized")
                    if required_format["quotes"] and not title_in_quotes:
                        formatting_incorrect = True
                        issue_message.append("should be in quotes")
                    elif not required_format["quotes"] and title_in_quotes:
                        formatting_incorrect = True
                        issue_message.append("should not be in quotes")
                    if formatting_incorrect:
                        incorrect_titles.append({
                            'text': title_text,
                            'issue': ', '.join(issue_message),
                            'sentence': text.strip()
                        })
            return DocumentCheckResult(
                success=len(incorrect_titles) == 0,
                issues=incorrect_titles
            )
        except Exception as e:
            self.logger.error(f"Error in document_title_check: {e}")
            return DocumentCheckResult(
                success=False,
                issues=[{'error': str(e)}]
            )
    def double_period_check(self, doc: List[str]) -> DocumentCheckResult:
        """Check for sentences that end with two periods."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        incorrect_sentences = []
        for paragraph in doc:
            # Split the paragraph into sentences based on common sentence-ending punctuation
            sentences = re.split(r'(?<=[.!?]) +', paragraph)
            for sentence in sentences:
                if sentence.endswith('..'):
                    incorrect_sentences.append({'sentence': sentence.strip()})
        success = len(incorrect_sentences) == 0
        return DocumentCheckResult(success=success, issues=incorrect_sentences)
    def spacing_check(self, doc: List[str]) -> DocumentCheckResult:
        """Check for correct spacing in the document."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        incorrect_spacing = []
        # Regex patterns to find incorrect spacing
        patterns = [
            (re.compile(r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)', re.IGNORECASE), "Missing space between document type and number"),
            (re.compile(r'(?<!\s)(§|§§)(\d+\.\d+)', re.IGNORECASE), "Missing space after section symbol (§)"),
            (re.compile(r'(?<!\s)Part(\d+)', re.IGNORECASE), "Missing space between 'Part' and number"),
            (re.compile(r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))', re.IGNORECASE), "Missing space before paragraph indication"),
            (re.compile(r'\s{2,}'), "Double spaces between words")
        ]
        for paragraph in doc:
            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
            for sentence in sentences:
                for pattern, issue in patterns:
                    if pattern.search(sentence):
                        incorrect_spacing.append({
                            'issue_description': issue,
                            'sentence': sentence.strip()
                        })
        success = len(incorrect_spacing) == 0
        return DocumentCheckResult(success=success, issues=incorrect_spacing)
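    # Illustrative sketch (made-up text): a sentence with doubled spaces, such
    # as "The rule applies  to transport airplanes.", is reported as
    # "Double spaces between words"; the other patterns flag run-together
    # references, for example a section symbol butted directly against its
    # section number.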
    def check_abbreviation_usage(self, doc: List[str]) -> DocumentCheckResult:
        """Check for abbreviation consistency after first definition."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        abbreviations = {}
        issues = []
        for paragraph in doc:
            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
            for sentence in sentences:
                # Find definitions like "Federal Aviation Administration (FAA)"
                defined_matches = re.findall(r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)', sentence)
                for full_term, acronym in defined_matches:
                    if acronym not in abbreviations:
                        abbreviations[acronym] = {"full_term": full_term.strip(), "defined": True}
                # Check for full term usage after definition
                for acronym, data in abbreviations.items():
                    full_term = data["full_term"]
                    if full_term in sentence:
                        # Ignore first usage where it's defined
                        if data["defined"]:
                            data["defined"] = False  # The defining sentence has been consumed
                        else:
                            # Only flag subsequent spelled-out uses of the term
                            issues.append({
                                'full_term': full_term,
                                'acronym': acronym,
                                'sentence': sentence.strip()
                            })
        success = len(issues) == 0
        return DocumentCheckResult(success=success, issues=issues)
    def check_date_formats(self, doc: List[str]) -> DocumentCheckResult:
        """Check for inconsistent date formats while ignoring aviation reference numbers."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        date_issues = []
        # Patterns to ignore (aviation references)
        ignore_patterns = [
            r'\bAD \d{4}-\d{2}-\d{2}\b',         # Airworthiness Directive references
            r'\bSWPM \d{2}-\d{2}-\d{2}\b',       # Standard Wiring Practices Manual references
            r'\bAMM \d{2}-\d{2}-\d{2}\b',        # Aircraft Maintenance Manual references
            r'\bSOPM \d{2}-\d{2}-\d{2}\b',       # Standard Operating Procedure references
            r'\b[A-Z]{2,4} \d{2}-\d{2}-\d{2}\b'  # Generic manual reference pattern
        ]
        # Combine ignore patterns into one
        ignore_regex = '|'.join(ignore_patterns)
        ignore_pattern = re.compile(ignore_regex)
        # Correct date pattern: 'Month Day, Year' e.g., 'January 1, 2020'
        correct_date_pattern = re.compile(r'\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b')
        # Incorrect date patterns
        date_patterns = [
            (re.compile(r'(?<![\w/-])\d{1,2}/\d{1,2}/\d{2,4}(?![\w/-])'), "Use 'Month Day, Year' format instead of 'MM/DD/YYYY'"),
            (re.compile(r'(?<![\w/-])\d{1,2}-\d{1,2}-\d{2,4}(?![\w/-])'), "Use 'Month Day, Year' format instead of 'MM-DD-YYYY'"),
            (re.compile(r'(?<![\w/-])\d{4}-\d{1,2}-\d{1,2}(?![\w/-])'), "Use 'Month Day, Year' format instead of 'YYYY-MM-DD'")
        ]
        for paragraph in doc:
            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
            for sentence in sentences:
                # First, identify and temporarily remove text that should be ignored
                ignored_matches = list(ignore_pattern.finditer(sentence))
                working_sentence = sentence
                # Replace ignored patterns with placeholders
                for match in reversed(ignored_matches):
                    start, end = match.span()
                    working_sentence = working_sentence[:start] + 'X' * (end - start) + working_sentence[end:]
                # Now check for date patterns in the modified sentence
                for pattern, issue in date_patterns:
                    matches = pattern.finditer(working_sentence)
                    for match in matches:
                        # Get the original text from the match position
                        original_date = sentence[match.start():match.end()]
                        date_issues.append({
                            'date': original_date,
                            'issue': issue,
                            'sentence': sentence.strip()
                        })
        success = len(date_issues) == 0
        return DocumentCheckResult(success=success, issues=date_issues)
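    # Illustrative sketch (made-up text): "The rule was issued on 3/15/2024."
    # is flagged with a suggestion to write the date as 'Month Day, Year', while
    # a reference such as "AD 2024-03-15" is masked by the ignore patterns and
    # left alone.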
    def check_placeholders(self, doc: List[str]) -> DocumentCheckResult:
        """Check for placeholders that should be removed."""
        if not self.validate_input(doc):
            return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
        placeholder_phrases = [
            r'\bTBD\b',
            r'\bTo be determined\b',
            r'\bTo be added\b'
        ]
        issues = []
        for paragraph in doc:
            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
            for sentence in sentences:
                for phrase in placeholder_phrases:
                    match = re.search(phrase, sentence, re.IGNORECASE)
                    if match:
                        issues.append({
                            'placeholder': match.group().strip(),
                            'sentence': sentence.strip()
                        })
        success = len(issues) == 0
        return DocumentCheckResult(success=success, issues=issues)
    def run_all_checks(self, doc_path: str, doc_type: str, template_type: Optional[str] = None) -> Dict[str, DocumentCheckResult]:
        """
        Run all checks on the document.

        Args:
            doc_path (str): Path to the document.
            doc_type (str): Type of the document.
            template_type (str, optional): Template type, if applicable.

        Returns:
            Dict[str, DocumentCheckResult]: Dictionary of check names to results.
        """
        # Read the document
        doc = self.extract_paragraphs(doc_path)
        # Retrieve any specific flags
        checks_config = self.config_manager.config['document_types'].get(doc_type, {})
        skip_title_check = checks_config.get('skip_title_check', False)
        # Run checks
        results = {}
        results['heading_title_check'] = self.heading_title_check(doc, doc_type)
        results['heading_title_period_check'] = self.heading_title_period_check(doc, doc_type)
        results['acronym_check'] = self.acronym_check(doc)
        results['terminology_check'] = self.check_terminology(doc)
        results['section_symbol_usage_check'] = self.check_section_symbol_usage(doc)
        results['caption_check_table'] = self.caption_check(doc, doc_type, 'Table')
        results['caption_check_figure'] = self.caption_check(doc, doc_type, 'Figure')
        results['table_figure_reference_check'] = self.table_figure_reference_check(doc, doc_type)
        if not skip_title_check:
            results['document_title_check'] = self.document_title_check(doc_path, doc_type)
        else:
            results['document_title_check'] = DocumentCheckResult(success=True, issues=[])
        results['double_period_check'] = self.double_period_check(doc)
        results['spacing_check'] = self.spacing_check(doc)
        results['abbreviation_usage_check'] = self.check_abbreviation_usage(doc)
        results['date_formats_check'] = self.check_date_formats(doc)
        results['placeholders_check'] = self.check_placeholders(doc)
        return results
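# Illustrative sketch (hypothetical file path): running every configured check
# on a saved .docx file outside of the Gradio UI.
#
#     checker = FAADocumentChecker()
#     results = checker.run_all_checks("draft_ac.docx", "Advisory Circular")
#     for name, result in results.items():
#         status = "OK" if result.success else f"{len(result.issues)} issue(s)"
#         print(f"{name}: {status}")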
def process_document(file_obj, doc_type, template_type):
    """Process the document and run all checks."""
    try:
        # Convert file object to BytesIO
        if isinstance(file_obj, bytes):
            file_obj = io.BytesIO(file_obj)
        checker = FAADocumentChecker()
        doc = Document(file_obj)
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        # Rewind the file object for additional processing
        file_obj.seek(0)
        # Run all checks
        results = {}
        results['heading_check'] = checker.heading_title_check(paragraphs, doc_type)
        results['heading_period_check'] = checker.heading_title_period_check(paragraphs, doc_type)
        results['acronym_check'] = checker.acronym_check(paragraphs)
        results['terminology_check'] = checker.check_terminology(paragraphs)
        results['section_symbol_check'] = checker.check_section_symbol_usage(paragraphs)
        results['table_caption_check'] = checker.caption_check(paragraphs, doc_type, 'Table')
        results['figure_caption_check'] = checker.caption_check(paragraphs, doc_type, 'Figure')
        results['references_check'] = checker.table_figure_reference_check(paragraphs, doc_type)
        results['title_check'] = checker.document_title_check(file_obj, doc_type)
        results['double_period_check'] = checker.double_period_check(paragraphs)
        results['spacing_check'] = checker.spacing_check(paragraphs)
        results['abbreviation_check'] = checker.check_abbreviation_usage(paragraphs)
        results['date_check'] = checker.check_date_formats(paragraphs)
        results['placeholder_check'] = checker.check_placeholders(paragraphs)
        return format_results_for_gradio(results, doc_type)
    except Exception as e:
        print(f"Error in process_document: {str(e)}")
        traceback.print_exc()  # This will print the full traceback
        return f"An error occurred while processing the document: {str(e)}"
def format_results_for_gradio(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
    """Format the results for display in Gradio."""
    output = ["# Document Check Results\n"]
    # Map check names to display titles
    check_titles = {
        'heading_check': "Required Headings Check",
        'heading_period_check': "Heading Period Check",
        'acronym_check': "Acronym Check",
        'terminology_check': "Terminology Check",
        'section_symbol_check': "Section Symbol Usage",
        'table_caption_check': "Table Caption Format",
        'figure_caption_check': "Figure Caption Format",
        'references_check': "Table and Figure References",
        'title_check': "Document Title Style",
        'double_period_check': "Double Period Check",
        'spacing_check': "Spacing Check",
        'abbreviation_check': "Abbreviation Usage",
        'date_check': "Date Format Check",
        'placeholder_check': "Placeholder Check"
    }
    for check_name, result in results.items():
        title = check_titles.get(check_name, check_name.replace('_', ' ').title())
        output.append(f"## {title}")
        if result.success:
            output.append("✅ All checks passed.\n")
        else:
            output.append("❌ Issues found:")
            for issue in result.issues:
                if isinstance(issue, dict):
                    for key, value in issue.items():
                        if isinstance(value, list):
                            for item in value:
                                output.append(f"- {item}")
                        else:
                            output.append(f"- {key}: {value}")
                else:
                    output.append(f"- {issue}")
            output.append("")
        if result.details:
            output.append("Additional Details:")
            for key, value in result.details.items():
                if isinstance(value, list):
                    output.append(f"- {key}:")
                    for item in value:
                        output.append(f"  - {item}")
                else:
                    output.append(f"- {key}: {value}")
            output.append("")
    return "\n".join(output)
# Create the Gradio interface
demo = gr.Blocks(theme='JohnSmith9982/small_and_pretty')
with demo:
    gr.Markdown("# Document Checker Tool")
    gr.Markdown("Upload a Word (.docx) document to check it for compliance with U.S. federal documentation standards.")
    gr.Markdown("*This tool is still in development, so results may include false positives.*")
    gr.Markdown("Contact Eric Putnam with questions and comments.")
    gr.Markdown("""
    1. Upload a clean Word file (no tracked changes or comments).
    2. Choose **Check Document**.""")
    document_types = [
        "Advisory Circular", "Airworthiness Criteria", "Deviation Memo", "Exemption",
        "Federal Register Notice", "Order", "Policy Statement",
        "Rule", "Special Condition", "Technical Standard Order", "Other"
    ]
    template_types = ["Short AC template AC", "Long AC template AC"]
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload Word Document (.docx)",
                file_types=[".docx"],
                type="binary"
            )
            doc_type = gr.Dropdown(
                choices=document_types,
                label="Document Type",
                value="Advisory Circular"
            )
            template_type = gr.Radio(
                choices=template_types,
                label="Template Type (Only for Advisory Circular)",
                visible=True,
                value="Short AC template AC"
            )
            submit_btn = gr.Button("Check Document", variant="primary")
        with gr.Column(scale=2):
            output = gr.Markdown(
                label="Check Results",
                value="Results will appear here after processing..."
            )

    def update_template_visibility(doc_type):
        return gr.update(visible=doc_type == "Advisory Circular")

    doc_type.change(
        fn=update_template_visibility,
        inputs=[doc_type],
        outputs=[template_type]
    )
    submit_btn.click(
        fn=process_document,
        inputs=[file_input, doc_type, template_type],
        outputs=[output]
    )

# Launch the demo
if __name__ == "__main__":
    demo.launch()