Spaces:
Sleeping
Sleeping
Added paragraph and sentence length checks
Browse files
app.py
CHANGED
@@ -662,7 +662,7 @@ class FAADocumentChecker(DocumentChecker):
|
|
662 |
'DISTRIBUTION', 'EXCEPTION', 'EXPLANATION', 'FIGURE', 'GENERAL', 'GROUPS',
|
663 |
'INFORMATION', 'INSERT', 'INTRODUCTION', 'MATERIAL', 'NOTE', 'PARTS', 'PAST',
|
664 |
'POLICY', 'PRACTICE', 'PROCEDURES', 'PURPOSE', 'RELEVANT', 'RELATED',
|
665 |
-
'REQUIREMENTS', 'SCOPE', 'SECTION', 'SUMMARY', 'TABLE', 'WARNING'
|
666 |
}
|
667 |
|
668 |
PREDEFINED_ACRONYMS = {
|
@@ -1457,6 +1457,7 @@ class FAADocumentChecker(DocumentChecker):
|
|
1457 |
|
1458 |
# Patterns to ignore (aviation references)
|
1459 |
ignore_patterns = [
|
|
|
1460 |
r'\bAD \d{4}-\d{2}-\d{2}\b', # Airworthiness Directive references
|
1461 |
r'\bSWPM \d{2}-\d{2}-\d{2}\b', # Standard Wiring Practices Manual references
|
1462 |
r'\bAMM \d{2}-\d{2}-\d{2}\b', # Aircraft Maintenance Manual references
|
@@ -1667,20 +1668,21 @@ class FAADocumentChecker(DocumentChecker):
|
|
1667 |
check_sequence = [
|
1668 |
('heading_title_check', lambda: self.heading_title_check(doc, doc_type)),
|
1669 |
('heading_title_period_check', lambda: self.heading_title_period_check(doc, doc_type)),
|
|
|
1670 |
('acronym_check', lambda: self.acronym_check(doc)),
|
1671 |
('acronym_usage_check', lambda: self.acronym_usage_check(doc)),
|
1672 |
-
('terminology_check', lambda: self.check_terminology(doc)),
|
1673 |
('section_symbol_usage_check', lambda: self.check_section_symbol_usage(doc)),
|
|
|
|
|
|
|
1674 |
('caption_check_table', lambda: self.caption_check(doc, doc_type, 'Table')),
|
1675 |
('caption_check_figure', lambda: self.caption_check(doc, doc_type, 'Figure')),
|
1676 |
('table_figure_reference_check', lambda: self.table_figure_reference_check(doc, doc_type)),
|
1677 |
-
('
|
1678 |
('double_period_check', lambda: self.double_period_check(doc)),
|
1679 |
('spacing_check', lambda: self.spacing_check(doc)),
|
1680 |
-
('
|
1681 |
-
('
|
1682 |
-
('placeholders_check', lambda: self.check_placeholders(doc)),
|
1683 |
-
('parentheses_check', lambda: self.check_parentheses(doc))
|
1684 |
]
|
1685 |
|
1686 |
# Run each check and store results
|
@@ -1871,6 +1873,369 @@ class FAADocumentChecker(DocumentChecker):
|
|
1871 |
|
1872 |
return formatted_issues
|
1873 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1874 |
class DocumentCheckResultsFormatter:
|
1875 |
|
1876 |
def __init__(self):
|
@@ -1890,7 +2255,7 @@ class DocumentCheckResultsFormatter:
|
|
1890 |
'heading_title_period_check': {
|
1891 |
'title': 'Heading Period Format',
|
1892 |
'description': 'Examines heading punctuation to ensure compliance with FAA document formatting standards. Some FAA documents (like Advisory Circulars and Orders) require periods at the end of headings, while others (like Federal Register Notices) don\'t.',
|
1893 |
-
'solution': 'Format heading periods according to document type requirements',
|
1894 |
'example_fix': {
|
1895 |
'before': 'Purpose',
|
1896 |
'after': 'Purpose.' # For ACs and Orders
|
@@ -1899,7 +2264,7 @@ class DocumentCheckResultsFormatter:
|
|
1899 |
'table_figure_reference_check': {
|
1900 |
'title': 'Table and Figure References',
|
1901 |
'description': 'Analyzes how tables and figures are referenced within your document text. Capitalize references at the beginning of sentences (e.g., "Table 2-1 shows...") and use lowercase references within sentences (e.g., "...as shown in table 2-1").',
|
1902 |
-
'solution': 'Capitalize references at start of sentences, use lowercase within sentences',
|
1903 |
'example_fix': {
|
1904 |
'before': 'The DTR values are specified in Table 3-1 and Figure 3-2.',
|
1905 |
'after': 'The DTR values are specified in table 3-1 and figure 3-2.'
|
@@ -1908,7 +2273,7 @@ class DocumentCheckResultsFormatter:
|
|
1908 |
'acronym_check': {
|
1909 |
'title': 'Acronym Definition Issues',
|
1910 |
'description': 'Ensures every acronym is properly introduced with its full term at first use. The check identifies undefined acronyms while recognizing common exceptions (like U.S.) that don\'t require definition.',
|
1911 |
-
'solution': 'Define each acronym at its first use, e.g., "Federal Aviation Administration (FAA)"',
|
1912 |
'example_fix': {
|
1913 |
'before': 'This order establishes general FAA organizational policies.',
|
1914 |
'after': 'This order establishes general Federal Aviation Administration (FAA) organizational policies.'
|
@@ -1926,7 +2291,7 @@ class DocumentCheckResultsFormatter:
|
|
1926 |
'terminology_check': {
|
1927 |
'title': 'Incorrect Terminology',
|
1928 |
'description': 'Evaluates document text against the various style manuals and orders to identify non-compliant terminology, ambiguous references, and outdated phrases. This includes checking for prohibited relative references (like "above" or "below"), proper legal terminology (like "must" instead of "shall"), and consistent formatting of regulatory citations.',
|
1929 |
-
'solution': 'Use explicit references to paragraphs, sections, tables, and figures',
|
1930 |
'example_fix': {
|
1931 |
'before': 'Operators shall comply with ADs to ensure aircraft safety and regulatory compliance',
|
1932 |
'after': 'Operators must comply with ADs to ensure aircraft safety and regulatory compliance.'
|
@@ -1935,7 +2300,7 @@ class DocumentCheckResultsFormatter:
|
|
1935 |
'section_symbol_usage_check': {
|
1936 |
'title': 'Section Symbol (Β§) Format Issues',
|
1937 |
'description': 'Examines the usage of section symbols (Β§) throughout your document. This includes verifying proper symbol placement in regulatory references, ensuring sections aren\'t started with the symbol, checking consistency in multiple-section citations, and validating proper CFR citations. For ACs, see FAA Order 1320.46.',
|
1938 |
-
'solution': 'Format section symbols correctly and never start sentences with them',
|
1939 |
'example_fix': {
|
1940 |
'before': 'Β§ 23.3 establishes design criteria.',
|
1941 |
'after': 'Section 23.3 establishes design criteria.'
|
@@ -1944,7 +2309,7 @@ class DocumentCheckResultsFormatter:
|
|
1944 |
'double_period_check': {
|
1945 |
'title': 'Multiple Period Issues',
|
1946 |
'description': 'Examines sentences for accidental double periods that often occur during document editing and revision. While double periods are sometimes found in ellipses (...) or web addresses, they should never appear at the end of standard sentences in FAA documentation.',
|
1947 |
-
'solution': 'Remove multiple periods that end sentences',
|
1948 |
'example_fix': {
|
1949 |
'before': 'The following ACs are related to the guidance in this document..',
|
1950 |
'after': 'The following ACs are related to the guidance in this document.'
|
@@ -1971,7 +2336,7 @@ class DocumentCheckResultsFormatter:
|
|
1971 |
'placeholders_check': {
|
1972 |
'title': 'Placeholder Content',
|
1973 |
'description': 'Identifies incomplete content and temporary placeholders that must be finalized before document publication. This includes common placeholder text (like "TBD" or "To be determined"), draft markers, and incomplete sections.',
|
1974 |
-
'solution': 'Replace all placeholder content with actual content',
|
1975 |
'example_fix': {
|
1976 |
'before': 'Pilots must submit the [Insert text] form to the FAA for approval.',
|
1977 |
'after': 'Pilots must submit the Report of Eye Evaluation form 8500-7 to the FAA for approval.'
|
@@ -1980,11 +2345,29 @@ class DocumentCheckResultsFormatter:
|
|
1980 |
'parentheses_check': {
|
1981 |
'title': 'Parentheses Balance Check',
|
1982 |
'description': 'Ensures that all parentheses in the document are properly paired with matching opening and closing characters.',
|
1983 |
-
'solution': 'Add missing opening or closing parentheses where indicated',
|
1984 |
'example_fix': {
|
1985 |
'before': 'The system (as defined in AC 25-11B performs...',
|
1986 |
'after': 'The system (as defined in AC 25-11B) performs...'
|
1987 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1988 |
}
|
1989 |
}
|
1990 |
|
@@ -2093,12 +2476,18 @@ class DocumentCheckResultsFormatter:
|
|
2093 |
if 'incorrect_term' in issue and 'correct_term' in issue:
|
2094 |
return f" β’ Replace '{issue['incorrect_term']}' with '{issue['correct_term']}'"
|
2095 |
|
|
|
|
|
|
|
2096 |
if 'sentence' in issue:
|
2097 |
return f" β’ {issue['sentence']}"
|
2098 |
|
2099 |
if 'description' in issue:
|
2100 |
return f" β’ {issue['description']}"
|
2101 |
|
|
|
|
|
|
|
2102 |
# Fallback for other issue formats
|
2103 |
return f" β’ {str(issue)}"
|
2104 |
|
@@ -2324,12 +2713,14 @@ class DocumentCheckResultsFormatter:
|
|
2324 |
output.extend(self._format_section_symbol_issues(result))
|
2325 |
elif check_name == 'parentheses_check':
|
2326 |
output.extend(self._format_parentheses_issues(result))
|
|
|
|
|
2327 |
else:
|
2328 |
-
formatted_issues = [self._format_standard_issue(issue) for issue in result.issues[:
|
2329 |
output.extend(formatted_issues)
|
2330 |
|
2331 |
if len(result.issues) > 10:
|
2332 |
-
output.append(f"\n ... and {len(result.issues) -
|
2333 |
|
2334 |
return '\n'.join(output)
|
2335 |
|
@@ -2401,18 +2792,21 @@ def format_markdown_results(results: Dict[str, DocumentCheckResult], doc_type: s
|
|
2401 |
check_categories = {
|
2402 |
'heading_title_check': {'title': 'π Required Headings', 'priority': 1},
|
2403 |
'heading_title_period_check': {'title': 'π Heading Period Usage', 'priority': 1},
|
2404 |
-
'
|
2405 |
-
'
|
|
|
2406 |
'section_symbol_usage_check': {'title': 'Β§ Section Symbol Usage', 'priority': 2},
|
|
|
|
|
|
|
2407 |
'caption_check_table': {'title': 'π Table Captions', 'priority': 3},
|
2408 |
'caption_check_figure': {'title': 'πΌοΈ Figure Captions', 'priority': 3},
|
2409 |
'table_figure_reference_check': {'title': 'π Table/Figure References', 'priority': 3},
|
2410 |
-
'
|
2411 |
'double_period_check': {'title': 'β‘ Double Periods', 'priority': 4},
|
2412 |
'spacing_check': {'title': 'β¨οΈ Spacing Issues', 'priority': 4},
|
2413 |
-
'
|
2414 |
-
'
|
2415 |
-
'placeholders_check': {'title': 'π© Placeholder Content', 'priority': 1}
|
2416 |
}
|
2417 |
|
2418 |
sorted_checks = sorted(
|
|
|
662 |
'DISTRIBUTION', 'EXCEPTION', 'EXPLANATION', 'FIGURE', 'GENERAL', 'GROUPS',
|
663 |
'INFORMATION', 'INSERT', 'INTRODUCTION', 'MATERIAL', 'NOTE', 'PARTS', 'PAST',
|
664 |
'POLICY', 'PRACTICE', 'PROCEDURES', 'PURPOSE', 'RELEVANT', 'RELATED',
|
665 |
+
'REQUIREMENTS', 'REPORT', 'SCOPE', 'SECTION', 'SUMMARY', 'TABLE', 'WARNING'
|
666 |
}
|
667 |
|
668 |
PREDEFINED_ACRONYMS = {
|
|
|
1457 |
|
1458 |
# Patterns to ignore (aviation references)
|
1459 |
ignore_patterns = [
|
1460 |
+
r'\bAC\s*\d+(?:[-.]\d+)*[A-Z]*\b', # AC reference pattern
|
1461 |
r'\bAD \d{4}-\d{2}-\d{2}\b', # Airworthiness Directive references
|
1462 |
r'\bSWPM \d{2}-\d{2}-\d{2}\b', # Standard Wiring Practices Manual references
|
1463 |
r'\bAMM \d{2}-\d{2}-\d{2}\b', # Aircraft Maintenance Manual references
|
|
|
1668 |
check_sequence = [
|
1669 |
('heading_title_check', lambda: self.heading_title_check(doc, doc_type)),
|
1670 |
('heading_title_period_check', lambda: self.heading_title_period_check(doc, doc_type)),
|
1671 |
+
('terminology_check', lambda: self.check_terminology(doc)),
|
1672 |
('acronym_check', lambda: self.acronym_check(doc)),
|
1673 |
('acronym_usage_check', lambda: self.acronym_usage_check(doc)),
|
|
|
1674 |
('section_symbol_usage_check', lambda: self.check_section_symbol_usage(doc)),
|
1675 |
+
('date_formats_check', lambda: self.check_date_formats(doc)),
|
1676 |
+
('placeholders_check', lambda: self.check_placeholders(doc)),
|
1677 |
+
('document_title_check', lambda: self.document_title_check(doc_path, doc_type) if not skip_title_check else DocumentCheckResult(success=True, issues=[])),
|
1678 |
('caption_check_table', lambda: self.caption_check(doc, doc_type, 'Table')),
|
1679 |
('caption_check_figure', lambda: self.caption_check(doc, doc_type, 'Figure')),
|
1680 |
('table_figure_reference_check', lambda: self.table_figure_reference_check(doc, doc_type)),
|
1681 |
+
('parentheses_check', lambda: self.check_parentheses(doc)),
|
1682 |
('double_period_check', lambda: self.double_period_check(doc)),
|
1683 |
('spacing_check', lambda: self.spacing_check(doc)),
|
1684 |
+
('paragraph_length_check', lambda: self.check_paragraph_length(doc)),
|
1685 |
+
('sentence_length_check', lambda: self.check_sentence_length(doc)),
|
|
|
|
|
1686 |
]
|
1687 |
|
1688 |
# Run each check and store results
|
|
|
1873 |
|
1874 |
return formatted_issues
|
1875 |
|
1876 |
+
@profile_performance
def check_abbreviation_usage(self, doc: List[str]) -> DocumentCheckResult:
    """Flag full terms that are spelled out again after their acronym was defined.

    A definition is recognised when a parenthesised all-caps token is
    immediately preceded by as many words as the token has letters and the
    initials of those words spell the token, e.g.
    ``Federal Aviation Administration (FAA)``.  Definitions whose initials do
    not line up (for example terms containing "of") are not registered, so
    they never produce an issue.

    Args:
        doc: List of document paragraphs.

    Returns:
        DocumentCheckResult whose ``issues`` hold one dict per sentence that
        repeats a full term after its acronym was defined; ``success`` is
        True when there are none.
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])

    definition_re = re.compile(r'\(([A-Z]{2,})\)')
    abbreviations = {}      # acronym -> {'full_term', 'first_occurrence'}
    inconsistent_uses = []  # full-term usages found after the definition

    def register_definitions(sentence: str) -> None:
        """Record every new 'Full Term (ACRONYM)' definition in the sentence."""
        for match in definition_re.finditer(sentence):
            acronym = match.group(1)
            if acronym in abbreviations:
                continue  # keep the first definition only
            candidate = sentence[:match.start()].split()[-len(acronym):]
            if len(candidate) != len(acronym):
                continue
            initials = ''.join(word[0] for word in candidate).upper()
            if initials != acronym:
                continue
            abbreviations[acronym] = {
                'full_term': ' '.join(candidate),
                'first_occurrence': sentence,
            }

    def process_sentence(sentence: str) -> None:
        """Report full terms reused in the sentence, then pick up new definitions."""
        for acronym, data in abbreviations.items():
            full_term = data["full_term"]
            if full_term not in sentence:
                continue

            # A verbatim repeat of the definition sentence is not reported.
            if sentence == data["first_occurrence"]:
                continue

            inconsistent_uses.append({
                'issue_type': 'full_term_after_acronym',
                'full_term': full_term,
                'acronym': acronym,
                'sentence': sentence,
                'definition_context': data["first_occurrence"]
            })

        # Register after checking so a definition never flags its own sentence.
        register_definitions(sentence)

    for paragraph in doc:
        for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
            sentence = sentence.strip()
            if sentence:
                process_sentence(sentence)

    return DocumentCheckResult(success=not inconsistent_uses, issues=inconsistent_uses)
|
1916 |
+
|
1917 |
+
@profile_performance
def check_date_formats(self, doc: List[str]) -> DocumentCheckResult:
    """Report date-like text matched by the registry's 'dates' patterns.

    Aviation reference numbers (AC, AD, SWPM, AMM, SOPM and generic
    ``XXXX nn-nn-nn`` manual references) are masked before matching so they
    are not reported as dates.

    Args:
        doc: List of document paragraphs.

    Returns:
        DocumentCheckResult with one ``{'incorrect', 'correct'}`` dict per
        match; ``success`` is True when nothing matched.
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])

    # Get patterns from registry
    date_patterns = self.config_manager.pattern_registry.get('dates', [])

    # Patterns to ignore (aviation references)
    ignore_patterns = [
        # Same AC pattern as the other aviation-reference ignore list in this file.
        r'\bAC\s*\d+(?:[-.]\d+)*[A-Z]*\b',  # AC reference pattern
        r'\bAD \d{4}-\d{2}-\d{2}\b',  # Airworthiness Directive references
        r'\bSWPM \d{2}-\d{2}-\d{2}\b',  # Standard Wiring Practices Manual references
        r'\bAMM \d{2}-\d{2}-\d{2}\b',  # Aircraft Maintenance Manual references
        r'\bSOPM \d{2}-\d{2}-\d{2}\b',  # Standard Operating Procedure references
        r'\b[A-Z]{2,4} \d{2}-\d{2}-\d{2}\b'  # Generic manual reference pattern
    ]
    ignore_pattern = re.compile('|'.join(f'(?:{pattern})' for pattern in ignore_patterns))

    issues = []

    # _process_sentences is unpacked as (sentence, paragraph) pairs; only the
    # sentence is needed here.
    for sentence, _paragraph in self._process_sentences(doc, skip_empty=True, skip_headings=True):
        # Mask ignored references with same-length filler so match offsets in
        # the masked text still index the original sentence.
        masked = ignore_pattern.sub(lambda m: 'X' * len(m.group()), sentence)

        for pattern_config in date_patterns:
            for match in re.finditer(pattern_config.pattern, masked):
                issues.append({
                    'incorrect': sentence[match.start():match.end()],
                    'correct': 'Month Day, Year'
                })

    return DocumentCheckResult(success=not issues, issues=issues)
|
1971 |
+
|
1972 |
+
@profile_performance
def check_placeholders(self, doc: List[str]) -> DocumentCheckResult:
    """Find TBD / 'To be determined' / 'To be added' placeholders in the document.

    The work is delegated to ``_process_patterns`` with the 'placeholders'
    category; the nested callback groups matches by placeholder type.
    """
    def process_placeholders(doc: List[str], patterns: List[PatternConfig]) -> DocumentCheckResult:
        # One bucket per recognised registry pattern; patterns not listed
        # here are compiled but otherwise ignored.
        tbd_hits, determined_hits, added_hits = [], [], []
        buckets = {
            r'\bTBD\b': tbd_hits,
            r'\bTo be determined\b': determined_hits,
            r'\bTo be added\b': added_hits,
        }

        for sentence, _paragraph in self._process_sentences(doc, skip_empty=True, skip_headings=True):
            for config in patterns:
                matcher = re.compile(config.pattern, re.IGNORECASE)
                bucket = buckets.get(config.pattern)
                if bucket is None:
                    continue
                for hit in matcher.finditer(sentence):
                    bucket.append({
                        'placeholder': hit.group().strip(),
                        'sentence': sentence.strip(),
                        'description': config.description
                    })

        # Report order: TBD, To be determined, To be added.
        report_rows = (
            ('tbd_placeholder', 'Remove TBD placeholder', tbd_hits),
            ('to_be_determined_placeholder', "Remove 'To be determined' placeholder", determined_hits),
            ('to_be_added_placeholder', "Remove 'To be added' placeholder", added_hits),
        )
        issues = [
            {'issue_type': issue_type, 'description': text, 'occurrences': hits}
            for issue_type, text, hits in report_rows
            if hits
        ]

        details = {
            'total_placeholders': len(tbd_hits) + len(determined_hits) + len(added_hits),
            'placeholder_types': {
                'TBD': len(tbd_hits),
                'To be determined': len(determined_hits),
                'To be added': len(added_hits)
            }
        }

        return DocumentCheckResult(success=not issues, issues=issues, details=details)

    return self._process_patterns(doc, 'placeholders', process_placeholders)
|
2038 |
+
|
2039 |
+
@profile_performance
def _process_patterns(
    self,
    doc: List[str],
    pattern_category: str,
    process_func: Optional[Callable] = None
) -> DocumentCheckResult:
    """
    Process document text against patterns from a specific category.

    Args:
        doc: List of document paragraphs
        pattern_category: Category of patterns to check against
        process_func: Optional custom processing function; when given it is
            called as ``process_func(doc, patterns)`` and its result returned

    Returns:
        DocumentCheckResult with processed issues. With the default
        processing, each distinct (matched text, description, replacement)
        combination is reported once, in sorted order.
    """
    if not self.validate_input(doc):
        self.logger.error("Invalid document input for pattern check")
        return DocumentCheckResult(
            success=False,
            issues=[{'error': 'Invalid document input'}]
        )

    # Get patterns from registry
    patterns = self.config_manager.pattern_registry.get(pattern_category, [])
    if not patterns:
        self.logger.warning(f"No patterns found for category: {pattern_category}")
        return DocumentCheckResult(
            success=True,
            issues=[],
            details={'message': f'No patterns defined for {pattern_category}'}
        )

    # Use custom processing function if provided
    if process_func:
        return process_func(doc, patterns)

    # Default processing with deduplication
    unique_issues = set()

    for paragraph in doc:
        for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
            sentence = sentence.strip()
            if not sentence:
                continue

            for pattern_config in patterns:
                for match in re.finditer(pattern_config.pattern, sentence):
                    unique_issues.add((
                        match.group(),  # The matched text
                        pattern_config.description,  # The issue description
                        getattr(pattern_config, 'replacement', None)
                    ))

    def sort_key(issue):
        # None is not orderable against str; map it to '' so sorting cannot
        # raise TypeError when two entries tie on the earlier fields.
        return tuple('' if part is None else part for part in issue)

    formatted_issues = [
        {
            'incorrect': matched_text,
            'description': description,
            'replacement': replacement
        }
        for matched_text, description, replacement in sorted(unique_issues, key=sort_key)
    ]

    return DocumentCheckResult(success=not formatted_issues, issues=formatted_issues)
|
2110 |
+
|
2111 |
+
@profile_performance
def check_paragraph_length(self, doc: List[str]) -> DocumentCheckResult:
    """
    Check for overly long paragraphs that may need to be split up.

    A paragraph is flagged when it has more than 6 sentences or would take
    more than 8 lines when its words are wrapped at about 80 characters.

    Args:
        doc (List[str]): List of document paragraphs

    Returns:
        DocumentCheckResult: Results of paragraph length check; each issue
        is a message string quoting the paragraph's first sentence
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])

    issues = []

    for paragraph in doc:
        if not paragraph.strip():  # Skip empty paragraphs
            continue

        # Count sentences (split on period, exclamation, question mark followed by space)
        sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip())

        # Count lines by greedily packing words into ~80-character lines
        line_count = 0
        current_line = ""

        for word in paragraph.split():
            if not current_line:
                # Start a line with the word even if it is over-long; never
                # flush an empty line, which would inflate the count.
                current_line = word
            elif len(current_line) + len(word) + 1 <= 80:
                current_line += " " + word
            else:
                line_count += 1
                current_line = word
        if current_line:
            line_count += 1

        # Check if paragraph exceeds either threshold
        if len(sentences) > 6 or line_count > 8:
            # Get first sentence for context
            first_sentence = sentences[0].strip()
            issues.append(f"Review the paragraph that starts with: \"{first_sentence}\"")

    return DocumentCheckResult(success=not issues, issues=issues)
|
2153 |
+
|
2154 |
+
@profile_performance
def check_sentence_length(self, doc: List[str]) -> DocumentCheckResult:
    """
    Flag sentences longer than 35 words and collect sentence-length statistics.

    Paragraphs that match any of the skip patterns (notes and warnings, list
    items, captions, Latin abbreviations, technical or regulatory
    references) are left out of both the issues and the statistics.

    Args:
        doc (List[str]): List of document paragraphs

    Returns:
        DocumentCheckResult:
            - success: True when no sentence exceeds 35 words
            - issues: dicts with 'sentence' and 'word_count' for long sentences
            - details: total_sentences, long_sentences, max_length, avg_length
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])

    # Content that is expected to run long and is therefore not measured
    skip_regex = re.compile(
        '|'.join([
            r'^(?:Note:|Warning:|Caution:)',  # Notes and warnings
            r'^\d+\.',  # Numbered lists
            r'^\([a-z]\)',  # Letter lists
            r'^\([0-9]\)',  # Number lists in parentheses
            r'^Table \d',  # Table captions
            r'^Figure \d',  # Figure captions
            r'(?:e\.g\.|i\.e\.|viz\.,)',  # Latin abbreviations often used in complex sentences
            r'\b(?:AC|AD|TSO|SFAR)\s+\d',  # Technical references
            r'\d+\s*(?:CFR|U\.S\.C\.)',  # Regulatory references
        ]),
        re.IGNORECASE
    )

    long_sentence_issues = []
    stats = {
        'total_sentences': 0,
        'long_sentences': 0,
        'max_length': 0,
        'avg_length': 0
    }
    word_total = 0

    for paragraph in doc:
        # Empty paragraphs and paragraphs matching a skip pattern are ignored
        if not paragraph.strip() or skip_regex.search(paragraph):
            continue

        for piece in re.split(r'(?<=[.!?])\s+', paragraph.strip()):
            text = piece.strip()
            if not text:
                continue

            # Words are whitespace-separated tokens
            length = len(text.split())

            stats['total_sentences'] += 1
            word_total += length
            stats['max_length'] = max(stats['max_length'], length)

            if length > 35:
                stats['long_sentences'] += 1
                long_sentence_issues.append({
                    'sentence': text,
                    'word_count': length
                })

    # Average stays at 0 when no sentence was measured
    if stats['total_sentences'] > 0:
        stats['avg_length'] = round(word_total / stats['total_sentences'], 1)

    return DocumentCheckResult(
        success=len(long_sentence_issues) == 0,
        issues=long_sentence_issues,
        details=stats
    )
|
2238 |
+
|
2239 |
class DocumentCheckResultsFormatter:
|
2240 |
|
2241 |
def __init__(self):
|
|
|
2255 |
'heading_title_period_check': {
|
2256 |
'title': 'Heading Period Format',
|
2257 |
'description': 'Examines heading punctuation to ensure compliance with FAA document formatting standards. Some FAA documents (like Advisory Circulars and Orders) require periods at the end of headings, while others (like Federal Register Notices) don\'t.',
|
2258 |
+
'solution': 'Format heading periods according to document type requirements.',
|
2259 |
'example_fix': {
|
2260 |
'before': 'Purpose',
|
2261 |
'after': 'Purpose.' # For ACs and Orders
|
|
|
2264 |
'table_figure_reference_check': {
|
2265 |
'title': 'Table and Figure References',
|
2266 |
'description': 'Analyzes how tables and figures are referenced within your document text. Capitalize references at the beginning of sentences (e.g., "Table 2-1 shows...") and use lowercase references within sentences (e.g., "...as shown in table 2-1").',
|
2267 |
+
'solution': 'Capitalize references at start of sentences, use lowercase within sentences.',
|
2268 |
'example_fix': {
|
2269 |
'before': 'The DTR values are specified in Table 3-1 and Figure 3-2.',
|
2270 |
'after': 'The DTR values are specified in table 3-1 and figure 3-2.'
|
|
|
2273 |
'acronym_check': {
|
2274 |
'title': 'Acronym Definition Issues',
|
2275 |
'description': 'Ensures every acronym is properly introduced with its full term at first use. The check identifies undefined acronyms while recognizing common exceptions (like U.S.) that don\'t require definition.',
|
2276 |
+
'solution': 'Define each acronym at its first use, e.g., "Federal Aviation Administration (FAA)".',
|
2277 |
'example_fix': {
|
2278 |
'before': 'This order establishes general FAA organizational policies.',
|
2279 |
'after': 'This order establishes general Federal Aviation Administration (FAA) organizational policies.'
|
|
|
2291 |
'terminology_check': {
|
2292 |
'title': 'Incorrect Terminology',
|
2293 |
'description': 'Evaluates document text against the various style manuals and orders to identify non-compliant terminology, ambiguous references, and outdated phrases. This includes checking for prohibited relative references (like "above" or "below"), proper legal terminology (like "must" instead of "shall"), and consistent formatting of regulatory citations.',
|
2294 |
+
'solution': 'Use explicit references to paragraphs, sections, tables, and figures.',
|
2295 |
'example_fix': {
|
2296 |
'before': 'Operators shall comply with ADs to ensure aircraft safety and regulatory compliance',
|
2297 |
'after': 'Operators must comply with ADs to ensure aircraft safety and regulatory compliance.'
|
|
|
2300 |
'section_symbol_usage_check': {
|
2301 |
'title': 'Section Symbol (§) Format Issues',
|
2302 |
'description': 'Examines the usage of section symbols (§) throughout your document. This includes verifying proper symbol placement in regulatory references, ensuring sections aren\'t started with the symbol, checking consistency in multiple-section citations, and validating proper CFR citations. For ACs, see FAA Order 1320.46.',
|
2303 |
+
'solution': 'Format section symbols correctly and never start sentences with them.',
|
2304 |
'example_fix': {
|
2305 |
'before': '§ 23.3 establishes design criteria.',
|
2306 |
'after': 'Section 23.3 establishes design criteria.'
|
|
|
2309 |
'double_period_check': {
|
2310 |
'title': 'Multiple Period Issues',
|
2311 |
'description': 'Examines sentences for accidental double periods that often occur during document editing and revision. While double periods are sometimes found in ellipses (...) or web addresses, they should never appear at the end of standard sentences in FAA documentation.',
|
2312 |
+
'solution': 'Remove multiple periods that end sentences.',
|
2313 |
'example_fix': {
|
2314 |
'before': 'The following ACs are related to the guidance in this document..',
|
2315 |
'after': 'The following ACs are related to the guidance in this document.'
|
|
|
2336 |
'placeholders_check': {
|
2337 |
'title': 'Placeholder Content',
|
2338 |
'description': 'Identifies incomplete content and temporary placeholders that must be finalized before document publication. This includes common placeholder text (like "TBD" or "To be determined"), draft markers, and incomplete sections.',
|
2339 |
+
'solution': 'Replace all placeholder content with actual content.',
|
2340 |
'example_fix': {
|
2341 |
'before': 'Pilots must submit the [Insert text] form to the FAA for approval.',
|
2342 |
'after': 'Pilots must submit the Report of Eye Evaluation form 8500-7 to the FAA for approval.'
|
|
|
2345 |
'parentheses_check': {
|
2346 |
'title': 'Parentheses Balance Check',
|
2347 |
'description': 'Ensures that all parentheses in the document are properly paired with matching opening and closing characters.',
|
2348 |
+
'solution': 'Add missing opening or closing parentheses where indicated.',
|
2349 |
'example_fix': {
|
2350 |
'before': 'The system (as defined in AC 25-11B performs...',
|
2351 |
'after': 'The system (as defined in AC 25-11B) performs...'
|
2352 |
}
|
2353 |
+
},
|
2354 |
+
'paragraph_length_check': {
|
2355 |
+
'title': 'Paragraph Length Issues',
|
2356 |
+
'description': 'Flags paragraphs exceeding 6 sentences or 8 lines to enhance readability and clarity. While concise paragraphs are encouraged, with each focusing on a single idea or related points, exceeding these limits doesn\'t necessarily indicate a problem. Some content may appropriately extend beyond 8 lines, especially if it includes necessary details. Boilerplate language or template text exceeding these limits is not subject to modification or division.',
|
2357 |
+
'solution': 'Where possible, split long paragraphs into smaller sections, ensuring each focuses on one primary idea. If restructuring is not feasible or the content is boilerplate text, no changes are needed.',
|
2358 |
+
'example_fix': {
|
2359 |
+
'before': 'A very long paragraph covering multiple topics and spanning many lines...',
|
2360 |
+
'after': 'Multiple shorter paragraphs or restructured paragraphs, each focused on a single topic or related points.'
|
2361 |
+
}
|
2362 |
+
},
|
2363 |
+
'sentence_length_check': {
|
2364 |
+
'title': 'Sentence Length Issues',
|
2365 |
+
'description': 'Analyzes sentence length to ensure readability. While the ideal length varies with content complexity, sentences over 35 words often become difficult to follow. Technical content, regulatory references, notes, warnings, and list items are excluded from this check.',
|
2366 |
+
'solution': 'Break long sentences into smaller ones where possible, focusing on one main point per sentence. Consider using lists for complex items.',
|
2367 |
+
'example_fix': {
|
2368 |
+
'before': 'The operator must ensure that all required maintenance procedures are performed in accordance with the manufacturer\'s specifications and that proper documentation is maintained throughout the entire process to demonstrate compliance with regulatory requirements.',
|
2369 |
+
'after': 'The operator must ensure all required maintenance procedures are performed according to manufacturer specifications. Additionally, proper documentation must be maintained to demonstrate regulatory compliance.'
|
2370 |
+
}
|
2371 |
}
|
2372 |
}
|
2373 |
|
|
|
2476 |
if 'incorrect_term' in issue and 'correct_term' in issue:
|
2477 |
return f" • Replace '{issue['incorrect_term']}' with '{issue['correct_term']}'"
|
2478 |
|
2479 |
+
if 'sentence' in issue and 'word_count' in issue: # For sentence length check
|
2480 |
+
return f" • Review this sentence: \"{issue['sentence']}\""
|
2481 |
+
|
2482 |
if 'sentence' in issue:
|
2483 |
return f" • {issue['sentence']}"
|
2484 |
|
2485 |
if 'description' in issue:
|
2486 |
return f" • {issue['description']}"
|
2487 |
|
2488 |
+
if 'type' in issue and issue['type'] == 'long_paragraph':
|
2489 |
+
return f" • {issue['message']}"
|
2490 |
+
|
2491 |
# Fallback for other issue formats
|
2492 |
return f" • {str(issue)}"
|
2493 |
|
|
|
2713 |
output.extend(self._format_section_symbol_issues(result))
|
2714 |
elif check_name == 'parentheses_check':
|
2715 |
output.extend(self._format_parentheses_issues(result))
|
2716 |
+
elif check_name == 'paragraph_length_check':
|
2717 |
+
output.extend(self._format_paragraph_length_issues(result))
|
2718 |
else:
|
2719 |
+
formatted_issues = [self._format_standard_issue(issue) for issue in result.issues[:15]]
|
2720 |
output.extend(formatted_issues)
|
2721 |
|
2722 |
if len(result.issues) > 10:
|
2723 |
+
output.append(f"\n ... and {len(result.issues) - 15} more similar issues.")
|
2724 |
|
2725 |
return '\n'.join(output)
|
2726 |
|
|
|
2792 |
check_categories = {
|
2793 |
'heading_title_check': {'title': 'π Required Headings', 'priority': 1},
|
2794 |
'heading_title_period_check': {'title': 'π Heading Period Usage', 'priority': 1},
|
2795 |
+
'terminology_check': {'title': 'π Terminology Usage', 'priority': 1},
|
2796 |
+
'acronym_check': {'title': 'π Acronym Definitions', 'priority': 1},
|
2797 |
+
'acronym_usage_check': {'title': 'π Acronym Usage', 'priority': 1},
|
2798 |
'section_symbol_usage_check': {'title': '§ Section Symbol Usage', 'priority': 2},
|
2799 |
+
'date_formats_check': {'title': '📅 Date Formats', 'priority': 2},
|
2800 |
+
'placeholders_check': {'title': '🚩 Placeholder Content', 'priority': 2},
|
2801 |
+
'document_title_check': {'title': 'π Document Title Format', 'priority': 2},
|
2802 |
'caption_check_table': {'title': 'π Table Captions', 'priority': 3},
|
2803 |
'caption_check_figure': {'title': '🖼️ Figure Captions', 'priority': 3},
|
2804 |
'table_figure_reference_check': {'title': 'π Table/Figure References', 'priority': 3},
|
2805 |
+
'parentheses_check': {'title': 'π Parentheses Usage', 'priority': 4},
|
2806 |
'double_period_check': {'title': '⚡ Double Periods', 'priority': 4},
|
2807 |
'spacing_check': {'title': '⌨️ Spacing Issues', 'priority': 4},
|
2808 |
+
'paragraph_length_check': {'title': 'π Paragraph Length', 'priority': 5},
|
2809 |
+
'sentence_length_check': {'title': 'π Sentence Length', 'priority': 5}
|
|
|
2810 |
}
|
2811 |
|
2812 |
sorted_checks = sorted(
|