Spaces:
Sleeping
Sleeping
Cleaned up code
Browse files
app.py
CHANGED
@@ -42,20 +42,19 @@ DOCUMENT_TYPES = [
|
|
42 |
|
43 |
TEMPLATE_TYPES = ["Short AC template AC", "Long AC template AC"]
|
44 |
|
45 |
-
# Heading Word Constants
|
46 |
HEADING_WORDS = {
|
47 |
-
'
|
48 |
-
'
|
49 |
-
'
|
50 |
-
'
|
51 |
-
'
|
52 |
-
'
|
53 |
-
'INSERT'
|
54 |
}
|
55 |
|
56 |
PREDEFINED_ACRONYMS = {
|
57 |
-
'
|
58 |
-
'
|
|
|
59 |
}
|
60 |
|
61 |
# Configuration Constants
|
@@ -463,19 +462,19 @@ class DocumentCheckerConfig:
|
|
463 |
),
|
464 |
PatternConfig(
|
465 |
pattern=r'\bUSC\b',
|
466 |
-
description="USC should be U.S.C.",
|
467 |
is_error=True,
|
468 |
replacement="U.S.C."
|
469 |
),
|
470 |
PatternConfig(
|
471 |
pattern=r'\bCFR Part\b',
|
472 |
-
description="CFR Part should be CFR part",
|
473 |
is_error=True,
|
474 |
replacement="CFR part"
|
475 |
),
|
476 |
PatternConfig(
|
477 |
pattern=r'\bC\.F\.R\.\b',
|
478 |
-
description="C.F.R. should be CFR",
|
479 |
is_error=True,
|
480 |
replacement="CFR"
|
481 |
),
|
@@ -493,79 +492,79 @@ class DocumentCheckerConfig:
|
|
493 |
),
|
494 |
PatternConfig(
|
495 |
pattern=r'\bcancelled\b',
|
496 |
-
description="'cancelled' should be 'canceled'",
|
497 |
is_error=True,
|
498 |
replacement="canceled"
|
499 |
),
|
500 |
PatternConfig(
|
501 |
pattern=r'\bshall\b',
|
502 |
-
description="'shall' should be 'must'",
|
503 |
is_error=True,
|
504 |
replacement="must"
|
505 |
),
|
506 |
PatternConfig(
|
507 |
pattern=r'\b\&\b',
|
508 |
-
description="'&' should be 'and'",
|
509 |
is_error=True,
|
510 |
replacement="and"
|
511 |
),
|
512 |
PatternConfig(
|
513 |
pattern=r'\bflight crew\b',
|
514 |
-
description="'flight crew' should be 'flightcrew'",
|
515 |
is_error=True,
|
516 |
replacement="flightcrew"
|
517 |
),
|
518 |
PatternConfig(
|
519 |
pattern=r'\bchairman\b',
|
520 |
-
description="'chairman' should be 'chair'",
|
521 |
is_error=True,
|
522 |
replacement="chair"
|
523 |
),
|
524 |
PatternConfig(
|
525 |
pattern=r'\bflagman\b',
|
526 |
-
description="'flagman' should be 'flagger' or 'flagperson'",
|
527 |
is_error=True,
|
528 |
replacement="flagperson"
|
529 |
),
|
530 |
PatternConfig(
|
531 |
pattern=r'\bman\b',
|
532 |
-
description="'man' should be 'individual' or 'person'",
|
533 |
is_error=True,
|
534 |
replacement="person"
|
535 |
),
|
536 |
PatternConfig(
|
537 |
pattern=r'\bmanmade\b',
|
538 |
-
description="'manmade' should be 'personmade'",
|
539 |
is_error=True,
|
540 |
replacement="personmade"
|
541 |
),
|
542 |
PatternConfig(
|
543 |
pattern=r'\bmanpower\b',
|
544 |
-
description="'manpower' should be 'labor force'",
|
545 |
is_error=True,
|
546 |
replacement="labor force"
|
547 |
),
|
548 |
PatternConfig(
|
549 |
pattern=r'\bnotice to airman\b',
|
550 |
-
description="'notice to airman' should be 'notice to air missions'",
|
551 |
is_error=True,
|
552 |
replacement="notice to air missions"
|
553 |
),
|
554 |
PatternConfig(
|
555 |
pattern=r'\bnotice to airmen\b',
|
556 |
-
description="'notice to airmen' should be 'notice to air missions'",
|
557 |
is_error=True,
|
558 |
replacement="notice to air missions"
|
559 |
),
|
560 |
PatternConfig(
|
561 |
pattern=r'\bcockpit\b',
|
562 |
-
description="'cockpit' should be 'flight deck'",
|
563 |
is_error=True,
|
564 |
replacement="flight deck"
|
565 |
),
|
566 |
PatternConfig(
|
567 |
pattern=r'\bA321 neo\b',
|
568 |
-
description="'A321 neo' should be 'A321neo'",
|
569 |
is_error=True,
|
570 |
replacement="A321neo"
|
571 |
)
|
@@ -573,7 +572,7 @@ class DocumentCheckerConfig:
|
|
573 |
'section_symbol': [
|
574 |
PatternConfig(
|
575 |
pattern=r'^§',
|
576 |
-
description="
|
577 |
is_error=True
|
578 |
),
|
579 |
PatternConfig(
|
@@ -599,28 +598,23 @@ class DocumentCheckerConfig:
|
|
599 |
],
|
600 |
'spacing': [
|
601 |
PatternConfig(
|
602 |
-
pattern=r'(
|
603 |
-
description="
|
604 |
is_error=True
|
605 |
),
|
606 |
PatternConfig(
|
607 |
-
pattern=r'(?<!\s)(
|
608 |
-
description="
|
609 |
is_error=True
|
610 |
),
|
611 |
PatternConfig(
|
612 |
-
pattern=r'(
|
613 |
-
description="
|
614 |
is_error=True
|
615 |
),
|
616 |
PatternConfig(
|
617 |
-
pattern=r'(?<!\s)(
|
618 |
-
description="
|
619 |
-
is_error=True
|
620 |
-
),
|
621 |
-
PatternConfig(
|
622 |
-
pattern=r'\s{2,}',
|
623 |
-
description="Double spaces between words",
|
624 |
is_error=True
|
625 |
)
|
626 |
],
|
@@ -1078,16 +1072,14 @@ class FAADocumentChecker(DocumentChecker):
|
|
1078 |
|
1079 |
@profile_performance
|
1080 |
def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
|
1081 |
-
"""
|
1082 |
-
Check document terminology and output only unique sentences needing correction.
|
1083 |
-
"""
|
1084 |
if not self.validate_input(doc):
|
1085 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
1086 |
|
1087 |
terminology_patterns = self.config_manager.pattern_registry.get('terminology', [])
|
1088 |
prohibited_patterns = self.config_manager.pattern_registry.get('reference_terms', [])
|
1089 |
|
1090 |
-
|
1091 |
|
1092 |
# Process each sentence
|
1093 |
for paragraph in doc:
|
@@ -1097,46 +1089,27 @@ class FAADocumentChecker(DocumentChecker):
|
|
1097 |
if not sentence:
|
1098 |
continue
|
1099 |
|
1100 |
-
|
1101 |
-
|
1102 |
for pattern_config in terminology_patterns:
|
1103 |
matches = list(re.finditer(pattern_config.pattern, sentence))
|
1104 |
for match in matches:
|
1105 |
-
|
1106 |
-
|
1107 |
-
'correct_term': pattern_config.replacement,
|
1108 |
-
'description': pattern_config.description,
|
1109 |
-
'sentence': sentence
|
1110 |
-
})
|
1111 |
|
|
|
1112 |
for pattern_config in prohibited_patterns:
|
1113 |
if re.search(pattern_config.pattern, sentence, re.IGNORECASE):
|
1114 |
-
|
1115 |
-
|
1116 |
-
|
1117 |
-
})
|
1118 |
-
|
1119 |
-
if current_sentence_issues:
|
1120 |
-
if sentence not in sentence_issues:
|
1121 |
-
sentence_issues[sentence] = current_sentence_issues
|
1122 |
-
else:
|
1123 |
-
sentence_issues[sentence].extend(current_sentence_issues)
|
1124 |
|
1125 |
-
#
|
1126 |
-
|
1127 |
-
|
1128 |
-
|
1129 |
-
|
1130 |
-
if 'incorrect_term' in issue and issue.get('correct_term'):
|
1131 |
-
replacements.append(f"'{issue['incorrect_term']}' with '{issue['correct_term']}'")
|
1132 |
-
|
1133 |
-
replacement_text = "; ".join(replacements)
|
1134 |
-
formatted_issue = {
|
1135 |
-
'sentence': f"{sentence} (Replace {replacement_text})" if replacements else sentence
|
1136 |
-
}
|
1137 |
-
unique_issues.append(formatted_issue)
|
1138 |
|
1139 |
-
return DocumentCheckResult(success=not
|
1140 |
|
1141 |
@profile_performance
|
1142 |
def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
|
@@ -1168,7 +1141,8 @@ class FAADocumentChecker(DocumentChecker):
|
|
1168 |
matches = compiled_pattern.finditer(sentence)
|
1169 |
for match in matches:
|
1170 |
incorrect = match.group()
|
1171 |
-
|
|
|
1172 |
issues.append({
|
1173 |
'incorrect': incorrect,
|
1174 |
'correct': correct
|
@@ -1402,57 +1376,65 @@ class FAADocumentChecker(DocumentChecker):
|
|
1402 |
if not self.validate_input(doc):
|
1403 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
1404 |
|
1405 |
-
|
1406 |
-
spacing_patterns = self.config_manager.pattern_registry.get('spacing', [])
|
1407 |
-
|
1408 |
-
# Initialize issue groups
|
1409 |
-
issue_groups = {
|
1410 |
-
'document_type_spacing': [], # AC25.25, FAA123, etc.
|
1411 |
-
'section_symbol_spacing': [], # §25.25
|
1412 |
-
'part_number_spacing': [], # Part25
|
1413 |
-
'paragraph_spacing': [], # text(a) or text(1)
|
1414 |
-
'double_space': [] # Multiple spaces between words
|
1415 |
-
}
|
1416 |
-
|
1417 |
-
# Define descriptions for each issue type
|
1418 |
-
category_descriptions = {
|
1419 |
-
'document_type_spacing': 'Missing space between document type and number',
|
1420 |
-
'section_symbol_spacing': 'Missing space after section symbol',
|
1421 |
-
'part_number_spacing': 'Missing space between Part and number',
|
1422 |
-
'paragraph_spacing': 'Missing space before paragraph indication',
|
1423 |
-
'double_space': 'Multiple spaces between words'
|
1424 |
-
}
|
1425 |
|
1426 |
-
|
1427 |
-
|
1428 |
-
|
1429 |
-
|
1430 |
-
|
1431 |
-
|
1432 |
-
|
1433 |
-
|
1434 |
-
|
1435 |
-
|
1436 |
-
|
1437 |
-
|
1438 |
-
|
1439 |
|
1440 |
-
#
|
1441 |
-
|
1442 |
-
|
1443 |
-
|
1444 |
-
|
1445 |
-
|
1446 |
-
|
1447 |
-
|
1448 |
-
|
1449 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1450 |
|
1451 |
-
# Use the helper to compile issues
|
1452 |
-
issues = self._compile_issues(issue_groups, category_descriptions)
|
1453 |
-
|
1454 |
return DocumentCheckResult(success=len(issues) == 0, issues=issues)
|
1455 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1456 |
@profile_performance
|
1457 |
def check_abbreviation_usage(self, doc: List[str]) -> DocumentCheckResult:
|
1458 |
"""Check for abbreviation consistency after first definition."""
|
@@ -1651,12 +1633,13 @@ class FAADocumentChecker(DocumentChecker):
|
|
1651 |
details={'message': f'No patterns defined for {pattern_category}'}
|
1652 |
)
|
1653 |
|
1654 |
-
# Use custom processing function if provided
|
1655 |
if process_func:
|
1656 |
return process_func(doc, patterns)
|
1657 |
|
1658 |
-
# Default processing
|
1659 |
-
|
|
|
1660 |
for paragraph in doc:
|
1661 |
sentences = re.split(r'(?<=[.!?])\s+', paragraph)
|
1662 |
for sentence in sentences:
|
@@ -1667,14 +1650,25 @@ class FAADocumentChecker(DocumentChecker):
|
|
1667 |
for pattern_config in patterns:
|
1668 |
matches = list(re.finditer(pattern_config.pattern, sentence))
|
1669 |
if matches:
|
1670 |
-
|
1671 |
-
|
1672 |
-
|
1673 |
-
|
1674 |
-
|
1675 |
-
|
|
|
1676 |
|
1677 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1678 |
|
1679 |
def run_all_checks(self, doc_path: str, doc_type: str, template_type: Optional[str] = None) -> Dict[str, DocumentCheckResult]:
|
1680 |
"""
|
@@ -1803,55 +1797,14 @@ class FAADocumentChecker(DocumentChecker):
|
|
1803 |
|
1804 |
return sentences
|
1805 |
|
1806 |
-
@profile_performance
|
1807 |
-
def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
|
1808 |
-
"""Check document terminology and output only unique term replacements needed."""
|
1809 |
-
if not self.validate_input(doc):
|
1810 |
-
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
1811 |
-
|
1812 |
-
terminology_patterns = self.config_manager.pattern_registry.get('terminology', [])
|
1813 |
-
prohibited_patterns = self.config_manager.pattern_registry.get('reference_terms', [])
|
1814 |
-
|
1815 |
-
unique_issues = set() # Using a set to avoid duplicate replacements
|
1816 |
-
|
1817 |
-
# Process each sentence
|
1818 |
-
for paragraph in doc:
|
1819 |
-
sentences = re.split(r'(?<=[.!?])\s+', paragraph)
|
1820 |
-
for sentence in sentences:
|
1821 |
-
sentence = sentence.strip()
|
1822 |
-
if not sentence:
|
1823 |
-
continue
|
1824 |
-
|
1825 |
-
# Check terminology patterns
|
1826 |
-
for pattern_config in terminology_patterns:
|
1827 |
-
matches = list(re.finditer(pattern_config.pattern, sentence))
|
1828 |
-
for match in matches:
|
1829 |
-
if pattern_config.replacement: # Only if there's a replacement term
|
1830 |
-
unique_issues.add((match.group(), pattern_config.replacement))
|
1831 |
-
|
1832 |
-
# Check prohibited patterns
|
1833 |
-
for pattern_config in prohibited_patterns:
|
1834 |
-
if re.search(pattern_config.pattern, sentence, re.IGNORECASE):
|
1835 |
-
if pattern_config.replacement: # Only if there's a replacement term
|
1836 |
-
match_text = re.search(pattern_config.pattern, sentence, re.IGNORECASE).group()
|
1837 |
-
unique_issues.add((match_text, pattern_config.replacement))
|
1838 |
-
|
1839 |
-
# Format issues as simple replacement instructions
|
1840 |
-
formatted_issues = [
|
1841 |
-
{'incorrect_term': incorrect, 'correct_term': correct}
|
1842 |
-
for incorrect, correct in sorted(unique_issues) # Sort for consistent output
|
1843 |
-
]
|
1844 |
-
|
1845 |
-
return DocumentCheckResult(success=not formatted_issues, issues=formatted_issues)
|
1846 |
-
|
1847 |
@profile_performance
|
1848 |
def check_parentheses(self, doc: List[str]) -> DocumentCheckResult:
|
1849 |
"""
|
1850 |
Check for matching parentheses in the document.
|
1851 |
-
|
1852 |
Args:
|
1853 |
doc (List[str]): List of document paragraphs
|
1854 |
-
|
1855 |
Returns:
|
1856 |
DocumentCheckResult: Result containing any mismatched parentheses issues
|
1857 |
"""
|
@@ -1861,35 +1814,37 @@ class FAADocumentChecker(DocumentChecker):
|
|
1861 |
issues = []
|
1862 |
|
1863 |
for i, paragraph in enumerate(doc, 1):
|
1864 |
-
# Skip empty paragraphs
|
1865 |
-
if not paragraph.strip():
|
1866 |
continue
|
1867 |
-
|
1868 |
-
stack = []
|
1869 |
-
for j, char in enumerate(paragraph):
|
1870 |
-
if char == '(':
|
1871 |
-
stack.append((i, j)) # Store paragraph and character position
|
1872 |
-
elif char == ')':
|
1873 |
-
if not stack: # No matching opening parenthesis
|
1874 |
-
issues.append({
|
1875 |
-
'type': 'missing_opening',
|
1876 |
-
'paragraph': i,
|
1877 |
-
'position': j,
|
1878 |
-
'text': paragraph,
|
1879 |
-
'message': f"Add an opening parenthesis before '{paragraph[max(0, j-20):min(len(paragraph), j+20)]}'"
|
1880 |
-
})
|
1881 |
-
else:
|
1882 |
-
stack.pop() # Remove matching pair
|
1883 |
|
1884 |
-
#
|
1885 |
-
|
1886 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1887 |
issues.append({
|
1888 |
'type': 'missing_closing',
|
1889 |
-
'paragraph':
|
1890 |
'position': pos,
|
1891 |
-
'text':
|
1892 |
-
'message': f"Add a closing parenthesis
|
1893 |
})
|
1894 |
|
1895 |
return DocumentCheckResult(success=len(issues) == 0, issues=issues)
|
@@ -1900,57 +1855,52 @@ class FAADocumentChecker(DocumentChecker):
|
|
1900 |
if not self.validate_input(doc):
|
1901 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
1902 |
|
1903 |
-
# Get patterns from registry
|
1904 |
spacing_patterns = self.config_manager.pattern_registry.get('spacing', [])
|
|
|
1905 |
|
1906 |
-
|
1907 |
-
|
1908 |
-
|
1909 |
-
|
1910 |
-
'part_number_spacing': [], # Part25
|
1911 |
-
'paragraph_spacing': [], # text(a) or text(1)
|
1912 |
-
'double_space': [] # Multiple spaces between words
|
1913 |
-
}
|
1914 |
-
|
1915 |
-
# Define descriptions for each issue type
|
1916 |
-
category_descriptions = {
|
1917 |
-
'document_type_spacing': 'Missing space between document type and number',
|
1918 |
-
'section_symbol_spacing': 'Missing space after section symbol',
|
1919 |
-
'part_number_spacing': 'Missing space between Part and number',
|
1920 |
-
'paragraph_spacing': 'Missing space before paragraph indication',
|
1921 |
-
'double_space': 'Multiple spaces between words'
|
1922 |
-
}
|
1923 |
-
|
1924 |
-
# Pattern mapping for categorization
|
1925 |
-
pattern_categories = {
|
1926 |
-
r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)': ('document_type_spacing', issue_groups['document_type_spacing']),
|
1927 |
-
r'(?<!\s)(§|§§)(\d+\.\d+)': ('section_symbol_spacing', issue_groups['section_symbol_spacing']),
|
1928 |
-
r'(?<!\s)Part(\d+)': ('part_number_spacing', issue_groups['part_number_spacing']),
|
1929 |
-
r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))': ('paragraph_spacing', issue_groups['paragraph_spacing']),
|
1930 |
-
r'\s{2,}': ('double_space', issue_groups['double_space'])
|
1931 |
-
}
|
1932 |
|
1933 |
-
|
1934 |
-
|
1935 |
-
|
1936 |
-
|
1937 |
-
|
1938 |
-
|
1939 |
-
|
1940 |
-
|
1941 |
-
|
1942 |
-
|
1943 |
-
|
1944 |
-
|
1945 |
-
|
1946 |
-
|
1947 |
-
|
|
|
|
|
|
|
|
|
|
|
1948 |
|
1949 |
-
# Use the helper to compile issues
|
1950 |
-
issues = self._compile_issues(issue_groups, category_descriptions)
|
1951 |
-
|
1952 |
return DocumentCheckResult(success=len(issues) == 0, issues=issues)
|
1953 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1954 |
@profile_performance
|
1955 |
def check_abbreviation_usage(self, doc: List[str]) -> DocumentCheckResult:
|
1956 |
"""Check for abbreviation consistency after first definition."""
|
@@ -1992,303 +1942,6 @@ class FAADocumentChecker(DocumentChecker):
|
|
1992 |
success = len(inconsistent_uses) == 0
|
1993 |
return DocumentCheckResult(success=success, issues=inconsistent_uses)
|
1994 |
|
1995 |
-
@profile_performance
|
1996 |
-
def check_placeholders(self, doc: List[str]) -> DocumentCheckResult:
|
1997 |
-
"""Check for placeholders that should be removed."""
|
1998 |
-
def process_placeholders(doc: List[str], patterns: List[PatternConfig]) -> DocumentCheckResult:
|
1999 |
-
tbd_placeholders = []
|
2000 |
-
to_be_determined_placeholders = []
|
2001 |
-
to_be_added_placeholders = []
|
2002 |
-
|
2003 |
-
pattern_categories = {
|
2004 |
-
r'\bTBD\b': ('tbd', tbd_placeholders),
|
2005 |
-
r'\bTo be determined\b': ('to_be_determined', to_be_determined_placeholders),
|
2006 |
-
r'\bTo be added\b': ('to_be_added', to_be_added_placeholders)
|
2007 |
-
}
|
2008 |
-
|
2009 |
-
# Use _process_sentences helper
|
2010 |
-
for sentence, paragraph in self._process_sentences(doc, skip_empty=True, skip_headings=True):
|
2011 |
-
for pattern_config in patterns:
|
2012 |
-
compiled_pattern = re.compile(pattern_config.pattern, re.IGNORECASE)
|
2013 |
-
|
2014 |
-
for pattern_key, (category_name, category_list) in pattern_categories.items():
|
2015 |
-
if pattern_config.pattern == pattern_key:
|
2016 |
-
matches = compiled_pattern.finditer(sentence)
|
2017 |
-
for match in matches:
|
2018 |
-
category_list.append({
|
2019 |
-
'placeholder': match.group().strip(),
|
2020 |
-
'sentence': sentence.strip(),
|
2021 |
-
'description': pattern_config.description
|
2022 |
-
})
|
2023 |
-
|
2024 |
-
# Compile issues
|
2025 |
-
issues = []
|
2026 |
-
if tbd_placeholders:
|
2027 |
-
issues.append({
|
2028 |
-
'issue_type': 'tbd_placeholder',
|
2029 |
-
'description': 'Remove TBD placeholder',
|
2030 |
-
'occurrences': tbd_placeholders
|
2031 |
-
})
|
2032 |
-
|
2033 |
-
if to_be_determined_placeholders:
|
2034 |
-
issues.append({
|
2035 |
-
'issue_type': 'to_be_determined_placeholder',
|
2036 |
-
'description': "Remove 'To be determined' placeholder",
|
2037 |
-
'occurrences': to_be_determined_placeholders
|
2038 |
-
})
|
2039 |
-
|
2040 |
-
if to_be_added_placeholders:
|
2041 |
-
issues.append({
|
2042 |
-
'issue_type': 'to_be_added_placeholder',
|
2043 |
-
'description': "Remove 'To be added' placeholder",
|
2044 |
-
'occurrences': to_be_added_placeholders
|
2045 |
-
})
|
2046 |
-
|
2047 |
-
details = {
|
2048 |
-
'total_placeholders': len(tbd_placeholders) +
|
2049 |
-
len(to_be_determined_placeholders) +
|
2050 |
-
len(to_be_added_placeholders),
|
2051 |
-
'placeholder_types': {
|
2052 |
-
'TBD': len(tbd_placeholders),
|
2053 |
-
'To be determined': len(to_be_determined_placeholders),
|
2054 |
-
'To be added': len(to_be_added_placeholders)
|
2055 |
-
}
|
2056 |
-
}
|
2057 |
-
|
2058 |
-
return DocumentCheckResult(success=len(issues) == 0, issues=issues, details=details)
|
2059 |
-
|
2060 |
-
return self._process_patterns(doc, 'placeholders', process_placeholders)
|
2061 |
-
|
2062 |
-
@profile_performance
|
2063 |
-
def _process_patterns(
|
2064 |
-
self,
|
2065 |
-
doc: List[str],
|
2066 |
-
pattern_category: str,
|
2067 |
-
process_func: Optional[Callable] = None
|
2068 |
-
) -> DocumentCheckResult:
|
2069 |
-
"""
|
2070 |
-
Process document text against patterns from a specific category.
|
2071 |
-
|
2072 |
-
Args:
|
2073 |
-
doc: List of document paragraphs
|
2074 |
-
pattern_category: Category of patterns to check against
|
2075 |
-
process_func: Optional custom processing function
|
2076 |
-
|
2077 |
-
Returns:
|
2078 |
-
DocumentCheckResult with processed issues
|
2079 |
-
"""
|
2080 |
-
if not self.validate_input(doc):
|
2081 |
-
self.logger.error("Invalid document input for pattern check")
|
2082 |
-
return DocumentCheckResult(
|
2083 |
-
success=False,
|
2084 |
-
issues=[{'error': 'Invalid document input'}]
|
2085 |
-
)
|
2086 |
-
|
2087 |
-
# Get patterns from registry
|
2088 |
-
patterns = self.config_manager.pattern_registry.get(pattern_category, [])
|
2089 |
-
if not patterns:
|
2090 |
-
self.logger.warning(f"No patterns found for category: {pattern_category}")
|
2091 |
-
return DocumentCheckResult(
|
2092 |
-
success=True,
|
2093 |
-
issues=[],
|
2094 |
-
details={'message': f'No patterns defined for {pattern_category}'}
|
2095 |
-
)
|
2096 |
-
|
2097 |
-
# Use custom processing function if provided, otherwise use default
|
2098 |
-
if process_func:
|
2099 |
-
return process_func(doc, patterns)
|
2100 |
-
|
2101 |
-
# Default processing
|
2102 |
-
issues = []
|
2103 |
-
for paragraph in doc:
|
2104 |
-
sentences = re.split(r'(?<=[.!?])\s+', paragraph)
|
2105 |
-
for sentence in sentences:
|
2106 |
-
sentence = sentence.strip()
|
2107 |
-
if not sentence:
|
2108 |
-
continue
|
2109 |
-
|
2110 |
-
for pattern_config in patterns:
|
2111 |
-
matches = list(re.finditer(pattern_config.pattern, sentence))
|
2112 |
-
if matches:
|
2113 |
-
issues.append({
|
2114 |
-
'pattern': pattern_config.pattern,
|
2115 |
-
'description': pattern_config.description,
|
2116 |
-
'sentence': sentence,
|
2117 |
-
'matches': [m.group() for m in matches]
|
2118 |
-
})
|
2119 |
-
|
2120 |
-
return DocumentCheckResult(success=len(issues) == 0, issues=issues)
|
2121 |
-
|
2122 |
-
def _format_colored_text(self, text: str, color: str) -> str:
|
2123 |
-
"""Helper method to format colored text with reset.
|
2124 |
-
|
2125 |
-
Args:
|
2126 |
-
text: The text to be colored
|
2127 |
-
color: The color to apply (from colorama.Fore)
|
2128 |
-
|
2129 |
-
Returns:
|
2130 |
-
str: The colored text with reset styling
|
2131 |
-
"""
|
2132 |
-
return f"{color}{text}{Style.RESET_ALL}"
|
2133 |
-
|
2134 |
-
def _format_example(self, example_fix: Dict[str, str]) -> List[str]:
|
2135 |
-
"""Format example fixes consistently.
|
2136 |
-
|
2137 |
-
Args:
|
2138 |
-
example_fix: Dictionary containing 'before' and 'after' examples
|
2139 |
-
|
2140 |
-
Returns:
|
2141 |
-
List[str]: Formatted example lines
|
2142 |
-
"""
|
2143 |
-
return [
|
2144 |
-
f" ❌ Incorrect: {example_fix['before']}",
|
2145 |
-
f" ✓ Correct: {example_fix['after']}"
|
2146 |
-
]
|
2147 |
-
|
2148 |
-
def _format_heading_issues(self, result: DocumentCheckResult, doc_type: str) -> List[str]:
|
2149 |
-
"""Format heading check issues consistently."""
|
2150 |
-
output = []
|
2151 |
-
|
2152 |
-
for issue in result.issues:
|
2153 |
-
if issue.get('type') == 'missing_headings':
|
2154 |
-
missing = sorted(issue['missing'])
|
2155 |
-
output.append(f"\n Missing Required Headings for {doc_type}:")
|
2156 |
-
for heading in missing:
|
2157 |
-
output.append(f" • {heading}")
|
2158 |
-
elif issue.get('type') == 'unexpected_headings':
|
2159 |
-
unexpected = sorted(issue['unexpected'])
|
2160 |
-
output.append(f"\n Unexpected Headings Found:")
|
2161 |
-
for heading in unexpected:
|
2162 |
-
output.append(f" • {heading}")
|
2163 |
-
|
2164 |
-
return output
|
2165 |
-
|
2166 |
-
def _format_period_issues(self, result: DocumentCheckResult) -> List[str]:
|
2167 |
-
"""Format period check issues consistently."""
|
2168 |
-
output = []
|
2169 |
-
|
2170 |
-
if result.issues:
|
2171 |
-
output.append(f"\n Heading Period Format Issues:")
|
2172 |
-
for issue in result.issues:
|
2173 |
-
if 'message' in issue:
|
2174 |
-
output.append(f" • {issue['message']}")
|
2175 |
-
|
2176 |
-
return output
|
2177 |
-
|
2178 |
-
def _format_caption_issues(self, issues: List[Dict], doc_type: str) -> List[str]:
|
2179 |
-
"""Format caption check issues with clear replacement instructions."""
|
2180 |
-
formatted_issues = []
|
2181 |
-
for issue in issues:
|
2182 |
-
if 'incorrect_caption' in issue:
|
2183 |
-
caption_parts = issue['incorrect_caption'].split()
|
2184 |
-
if len(caption_parts) >= 2:
|
2185 |
-
caption_type = caption_parts[0] # "Table" or "Figure"
|
2186 |
-
number = caption_parts[1]
|
2187 |
-
|
2188 |
-
# Determine correct format based on document type
|
2189 |
-
if doc_type in ["Advisory Circular", "Order"]:
|
2190 |
-
if '-' not in number:
|
2191 |
-
correct_format = f"{caption_type} {number}-1"
|
2192 |
-
else:
|
2193 |
-
if '-' in number:
|
2194 |
-
correct_format = f"{caption_type} {number.split('-')[0]}"
|
2195 |
-
else:
|
2196 |
-
correct_format = issue['incorrect_caption']
|
2197 |
-
|
2198 |
-
formatted_issues.append(
|
2199 |
-
f" • Replace '{issue['incorrect_caption']}' with '{correct_format}'"
|
2200 |
-
)
|
2201 |
-
|
2202 |
-
return formatted_issues
|
2203 |
-
|
2204 |
-
def _format_reference_issues(self, result: DocumentCheckResult) -> List[str]:
|
2205 |
-
"""Format reference-related issues with clear replacement instructions."""
|
2206 |
-
output = []
|
2207 |
-
|
2208 |
-
if result.issues:
|
2209 |
-
for issue in result.issues:
|
2210 |
-
if 'reference' in issue and 'correct_form' in issue:
|
2211 |
-
output.append(f" • Replace '{issue['reference']}' with '{issue['correct_form']}'")
|
2212 |
-
|
2213 |
-
return output
|
2214 |
-
|
2215 |
-
def _format_standard_issue(self, issue: Dict[str, Any]) -> str:
|
2216 |
-
"""Format standard issues consistently."""
|
2217 |
-
if isinstance(issue, str):
|
2218 |
-
return f" • {issue}"
|
2219 |
-
|
2220 |
-
if 'incorrect' in issue and 'correct' in issue:
|
2221 |
-
return f" • Replace '{issue['incorrect']}' with '{issue['correct']}'"
|
2222 |
-
|
2223 |
-
if 'incorrect_term' in issue and 'correct_term' in issue:
|
2224 |
-
return f" • Replace '{issue['incorrect_term']}' with '{issue['correct_term']}'"
|
2225 |
-
|
2226 |
-
if 'sentence' in issue:
|
2227 |
-
return f" • {issue['sentence']}"
|
2228 |
-
|
2229 |
-
if 'description' in issue:
|
2230 |
-
return f" • {issue['description']}"
|
2231 |
-
|
2232 |
-
# Fallback for other issue formats
|
2233 |
-
return f" • {str(issue)}"
|
2234 |
-
|
2235 |
-
def _format_unused_acronym_issues(self, result: DocumentCheckResult) -> List[str]:
|
2236 |
-
"""Format unused acronym issues with a simple, clear message.
|
2237 |
-
|
2238 |
-
Args:
|
2239 |
-
result: DocumentCheckResult containing acronym issues
|
2240 |
-
|
2241 |
-
Returns:
|
2242 |
-
List[str]: Formatted list of unused acronym issues
|
2243 |
-
"""
|
2244 |
-
formatted_issues = []
|
2245 |
-
|
2246 |
-
if result.issues:
|
2247 |
-
for issue in result.issues:
|
2248 |
-
if isinstance(issue, dict) and 'acronym' in issue:
|
2249 |
-
formatted_issues.append(f" • Acronym '{issue['acronym']}' was defined but never used.")
|
2250 |
-
elif isinstance(issue, str):
|
2251 |
-
# Handle case where issue might be just the acronym
|
2252 |
-
formatted_issues.append(f" • Acronym '{issue}' was defined but never used.")
|
2253 |
-
|
2254 |
-
return formatted_issues
|
2255 |
-
|
2256 |
-
def _format_parentheses_issues(self, result: DocumentCheckResult) -> List[str]:
|
2257 |
-
"""Format parentheses issues with clear instructions for fixing."""
|
2258 |
-
formatted_issues = []
|
2259 |
-
|
2260 |
-
if result.issues:
|
2261 |
-
for issue in result.issues:
|
2262 |
-
if issue['type'] == 'missing_opening':
|
2263 |
-
formatted_issues.append(
|
2264 |
-
f" • Paragraph {issue['paragraph']}: {issue['message']}"
|
2265 |
-
)
|
2266 |
-
elif issue['type'] == 'missing_closing':
|
2267 |
-
formatted_issues.append(
|
2268 |
-
f" • Paragraph {issue['paragraph']}: {issue['message']}"
|
2269 |
-
)
|
2270 |
-
|
2271 |
-
return formatted_issues
|
2272 |
-
|
2273 |
-
def _format_section_symbol_issues(self, result: DocumentCheckResult) -> List[str]:
|
2274 |
-
"""Format section symbol issues with clear replacement instructions."""
|
2275 |
-
formatted_issues = []
|
2276 |
-
|
2277 |
-
if result.issues:
|
2278 |
-
for issue in result.issues:
|
2279 |
-
if 'incorrect' in issue and 'correct' in issue:
|
2280 |
-
if issue.get('is_sentence_start'):
|
2281 |
-
formatted_issues.append(
|
2282 |
-
f" • Do not begin sentences with the section symbol. "
|
2283 |
-
f"Replace '{issue['incorrect']}' with '{issue['correct']}' at the start of the sentence"
|
2284 |
-
)
|
2285 |
-
else:
|
2286 |
-
formatted_issues.append(
|
2287 |
-
f" • Replace '{issue['incorrect']}' with '{issue['correct']}'"
|
2288 |
-
)
|
2289 |
-
|
2290 |
-
return formatted_issues
|
2291 |
-
|
2292 |
class DocumentCheckResultsFormatter:
|
2293 |
"""Formats document check results in a user-friendly way with detailed examples and fixes."""
|
2294 |
|
@@ -2375,7 +2028,7 @@ class DocumentCheckResultsFormatter:
|
|
2375 |
'description': 'Analyzes document spacing patterns to ensure compliance with FAA formatting standards. This includes checking for proper spacing around regulatory references (like "AC 25-1" not "AC25-1"), section symbols (§ 25.1), paragraph references, and multiple spaces between words.',
|
2376 |
'solution': 'Fix spacing issues: remove any missing spaces, double spaces, or inadvertent tabs.',
|
2377 |
'example_fix': {
|
2378 |
-
'before': 'AC25.25 states that SFAR88 and §25.981 require...
|
2379 |
'after': 'AC 25.25 states that SFAR 88 and § 25.981 require...'
|
2380 |
}
|
2381 |
},
|
@@ -2384,8 +2037,8 @@ class DocumentCheckResultsFormatter:
|
|
2384 |
'description': 'Examines all date references in your document. The check automatically excludes technical reference numbers that may look like dates to ensure accurate validation of true date references. Note, though, there might be instances in the heading of the document where the date is formatted as "MM/DD/YYYY", which is acceptable. This applies mostly to date formats within the document body.',
|
2385 |
'solution': 'Use the format "Month Day, Year" where appropriate.',
|
2386 |
'example_fix': {
|
2387 |
-
'before': '
|
2388 |
-
'after': '
|
2389 |
}
|
2390 |
},
|
2391 |
'placeholders_check': {
|
@@ -2393,8 +2046,8 @@ class DocumentCheckResultsFormatter:
|
|
2393 |
'description': 'Identifies incomplete content and temporary placeholders that must be finalized before document publication. This includes common placeholder text (like "TBD" or "To be determined"), draft markers, and incomplete sections.',
|
2394 |
'solution': 'Replace all placeholder content with actual content',
|
2395 |
'example_fix': {
|
2396 |
-
'before': '
|
2397 |
-
'after': '
|
2398 |
}
|
2399 |
},
|
2400 |
'parentheses_check': {
|
@@ -2451,7 +2104,7 @@ class DocumentCheckResultsFormatter:
|
|
2451 |
for heading in unexpected:
|
2452 |
output.append(f" • {heading}")
|
2453 |
|
2454 |
-
return output
|
2455 |
|
2456 |
def _format_period_issues(self, result: DocumentCheckResult) -> List[str]:
|
2457 |
"""Format period check issues consistently."""
|
@@ -2507,6 +2160,9 @@ class DocumentCheckResultsFormatter:
|
|
2507 |
if isinstance(issue, str):
|
2508 |
return f" • {issue}"
|
2509 |
|
|
|
|
|
|
|
2510 |
if 'incorrect_term' in issue and 'correct_term' in issue:
|
2511 |
return f" • Replace '{issue['incorrect_term']}' with '{issue['correct_term']}'"
|
2512 |
|
@@ -2546,14 +2202,7 @@ class DocumentCheckResultsFormatter:
|
|
2546 |
|
2547 |
if result.issues:
|
2548 |
for issue in result.issues:
|
2549 |
-
|
2550 |
-
formatted_issues.append(
|
2551 |
-
f" • Paragraph {issue['paragraph']}: {issue['message']}"
|
2552 |
-
)
|
2553 |
-
elif issue['type'] == 'missing_closing':
|
2554 |
-
formatted_issues.append(
|
2555 |
-
f" • Paragraph {issue['paragraph']}: {issue['message']}"
|
2556 |
-
)
|
2557 |
|
2558 |
return formatted_issues
|
2559 |
|
|
|
42 |
|
43 |
# Template variants offered for a document.
TEMPLATE_TYPES = ["Short AC template AC", "Long AC template AC"]

# Upper-case words recognised as heading words.
# NOTE(review): presumably consumed by the heading checks - confirm against callers.
HEADING_WORDS = {
    'APPLICABILITY', 'APPENDIX', 'AUTHORITY', 'BACKGROUND', 'CANCELLATION', 'CAUTION',
    'CHAPTER', 'CONCLUSION', 'DEPARTMENT', 'DEFINITION', 'DEFINITIONS', 'DISCUSSION',
    'DISTRIBUTION', 'EXCEPTION', 'EXPLANATION', 'FIGURE', 'GENERAL', 'GROUPS',
    'INFORMATION', 'INSERT', 'INTRODUCTION', 'MATERIAL', 'NOTE', 'PARTS', 'PAST',
    'POLICY', 'PRACTICE', 'PROCEDURES', 'PURPOSE', 'RELEVANT', 'RELATED',
    'REQUIREMENTS', 'SCOPE', 'SECTION', 'SUMMARY', 'TABLE', 'WARNING'
}

# Abbreviations and acronyms treated as already known.
# NOTE(review): presumably exempts these from the acronym-definition checks - confirm.
PREDEFINED_ACRONYMS = {
    'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
    'MD', 'MIL', 'MO', 'No.', 'PDF', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
    'WA', 'ZIP'
}
|
59 |
|
60 |
# Configuration Constants
|
|
|
462 |
),
|
463 |
PatternConfig(
|
464 |
pattern=r'\bUSC\b',
|
465 |
+
description="USC should be U.S.C.", # Per GPO Style Manual
|
466 |
is_error=True,
|
467 |
replacement="U.S.C."
|
468 |
),
|
469 |
PatternConfig(
|
470 |
pattern=r'\bCFR Part\b',
|
471 |
+
description="CFR Part should be CFR part (lowercase)", # Per FAA Order 1320.46
|
472 |
is_error=True,
|
473 |
replacement="CFR part"
|
474 |
),
|
475 |
PatternConfig(
|
476 |
pattern=r'\bC\.F\.R\.\b',
|
477 |
+
description="C.F.R. should be CFR", # GPO Style Manual
|
478 |
is_error=True,
|
479 |
replacement="CFR"
|
480 |
),
|
|
|
492 |
),
|
493 |
PatternConfig(
|
494 |
pattern=r'\bcancelled\b',
|
495 |
+
description="'cancelled' should be 'canceled'", # Per GPO Style Manual
|
496 |
is_error=True,
|
497 |
replacement="canceled"
|
498 |
),
|
499 |
PatternConfig(
|
500 |
pattern=r'\bshall\b',
|
501 |
+
description="'shall' should be 'must'", # Per FAA Order 1320.46
|
502 |
is_error=True,
|
503 |
replacement="must"
|
504 |
),
|
505 |
PatternConfig(
|
506 |
pattern=r'\b\&\b',
|
507 |
+
description="'&' should be 'and'", # Per April 17, 2024 Use ampersand instead or 'and' email from Judith Watson
|
508 |
is_error=True,
|
509 |
replacement="and"
|
510 |
),
|
511 |
PatternConfig(
|
512 |
pattern=r'\bflight crew\b',
|
513 |
+
description="'flight crew' should be 'flightcrew'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
|
514 |
is_error=True,
|
515 |
replacement="flightcrew"
|
516 |
),
|
517 |
PatternConfig(
|
518 |
pattern=r'\bchairman\b',
|
519 |
+
description="'chairman' should be 'chair'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
|
520 |
is_error=True,
|
521 |
replacement="chair"
|
522 |
),
|
523 |
PatternConfig(
|
524 |
pattern=r'\bflagman\b',
|
525 |
+
description="'flagman' should be 'flagger' or 'flagperson'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
|
526 |
is_error=True,
|
527 |
replacement="flagperson"
|
528 |
),
|
529 |
PatternConfig(
|
530 |
pattern=r'\bman\b',
|
531 |
+
description="'man' should be 'individual' or 'person'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
|
532 |
is_error=True,
|
533 |
replacement="person"
|
534 |
),
|
535 |
PatternConfig(
|
536 |
pattern=r'\bmanmade\b',
|
537 |
+
description="'manmade' should be 'personmade'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
|
538 |
is_error=True,
|
539 |
replacement="personmade"
|
540 |
),
|
541 |
PatternConfig(
|
542 |
pattern=r'\bmanpower\b',
|
543 |
+
description="'manpower' should be 'labor force'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
|
544 |
is_error=True,
|
545 |
replacement="labor force"
|
546 |
),
|
547 |
PatternConfig(
|
548 |
pattern=r'\bnotice to airman\b',
|
549 |
+
description="'notice to airman' should be 'notice to air missions'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
|
550 |
is_error=True,
|
551 |
replacement="notice to air missions"
|
552 |
),
|
553 |
PatternConfig(
|
554 |
pattern=r'\bnotice to airmen\b',
|
555 |
+
description="'notice to airmen' should be 'notice to air missions'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
|
556 |
is_error=True,
|
557 |
replacement="notice to air missions"
|
558 |
),
|
559 |
PatternConfig(
|
560 |
pattern=r'\bcockpit\b',
|
561 |
+
description="'cockpit' should be 'flight deck'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
|
562 |
is_error=True,
|
563 |
replacement="flight deck"
|
564 |
),
|
565 |
PatternConfig(
|
566 |
pattern=r'\bA321 neo\b',
|
567 |
+
description="'A321 neo' should be 'A321neo'", # Per TCDS
|
568 |
is_error=True,
|
569 |
replacement="A321neo"
|
570 |
)
|
|
|
572 |
'section_symbol': [
|
573 |
PatternConfig(
|
574 |
pattern=r'^§',
|
575 |
+
description="Don't start a sentence with the section symbol. Write out 'Section'",
|
576 |
is_error=True
|
577 |
),
|
578 |
PatternConfig(
|
|
|
598 |
],
|
599 |
'spacing': [
|
600 |
PatternConfig(
|
601 |
+
pattern=r'([^\s]+)[ ]{2,}([^\s]+)', # Capture words before and after double space
|
602 |
+
description="Remove double spacing between '{0}' and '{1}'",
|
603 |
is_error=True
|
604 |
),
|
605 |
PatternConfig(
|
606 |
+
pattern=r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*[A-Z]?)', # Capture doc type and number
|
607 |
+
description="Add space between '{0}' and '{1}'",
|
608 |
is_error=True
|
609 |
),
|
610 |
PatternConfig(
|
611 |
+
pattern=r'(§|§§)(\d+\.\d+)', # Removed (?<!\s) to catch all section symbols
|
612 |
+
description="Add space after '{0}' before '{1}'",
|
613 |
is_error=True
|
614 |
),
|
615 |
PatternConfig(
|
616 |
+
pattern=r'(?<!\s)(Part)(\d+)', # Capture 'Part' and number
|
617 |
+
description="Add space between '{0}' and '{1}'",
|
|
|
|
|
|
|
|
|
|
|
618 |
is_error=True
|
619 |
)
|
620 |
],
|
|
|
1072 |
|
1073 |
@profile_performance
|
1074 |
def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
|
1075 |
+
"""Check document terminology and output only unique term replacements needed."""
|
|
|
|
|
1076 |
if not self.validate_input(doc):
|
1077 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
1078 |
|
1079 |
terminology_patterns = self.config_manager.pattern_registry.get('terminology', [])
|
1080 |
prohibited_patterns = self.config_manager.pattern_registry.get('reference_terms', [])
|
1081 |
|
1082 |
+
unique_issues = set() # Using a set to avoid duplicate replacements
|
1083 |
|
1084 |
# Process each sentence
|
1085 |
for paragraph in doc:
|
|
|
1089 |
if not sentence:
|
1090 |
continue
|
1091 |
|
1092 |
+
# Check terminology patterns
|
|
|
1093 |
for pattern_config in terminology_patterns:
|
1094 |
matches = list(re.finditer(pattern_config.pattern, sentence))
|
1095 |
for match in matches:
|
1096 |
+
if pattern_config.replacement: # Only if there's a replacement term
|
1097 |
+
unique_issues.add((match.group(), pattern_config.replacement))
|
|
|
|
|
|
|
|
|
1098 |
|
1099 |
+
# Check prohibited patterns
|
1100 |
for pattern_config in prohibited_patterns:
|
1101 |
if re.search(pattern_config.pattern, sentence, re.IGNORECASE):
|
1102 |
+
if pattern_config.replacement: # Only if there's a replacement term
|
1103 |
+
match_text = re.search(pattern_config.pattern, sentence, re.IGNORECASE).group()
|
1104 |
+
unique_issues.add((match_text, pattern_config.replacement))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1105 |
|
1106 |
+
# Format issues as simple replacement instructions
|
1107 |
+
formatted_issues = [
|
1108 |
+
{'incorrect_term': incorrect, 'correct_term': correct}
|
1109 |
+
for incorrect, correct in sorted(unique_issues) # Sort for consistent output
|
1110 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1111 |
|
1112 |
+
return DocumentCheckResult(success=not formatted_issues, issues=formatted_issues)
|
1113 |
|
1114 |
@profile_performance
|
1115 |
def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
|
|
|
1141 |
matches = compiled_pattern.finditer(sentence)
|
1142 |
for match in matches:
|
1143 |
incorrect = match.group()
|
1144 |
+
# Remove § symbol without adding 'Section'
|
1145 |
+
correct = incorrect.replace('§ ', '')
|
1146 |
issues.append({
|
1147 |
'incorrect': incorrect,
|
1148 |
'correct': correct
|
|
|
1376 |
if not self.validate_input(doc):
|
1377 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
1378 |
|
1379 |
+
issues = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1380 |
|
1381 |
+
try:
|
1382 |
+
for paragraph in doc:
|
1383 |
+
# Skip empty paragraphs
|
1384 |
+
if not paragraph.strip():
|
1385 |
+
continue
|
1386 |
+
|
1387 |
+
# Skip paragraphs with tabs
|
1388 |
+
if '\t' in paragraph:
|
1389 |
+
continue
|
1390 |
+
|
1391 |
+
# Check for multiple spaces between words, but ignore spaces around parentheses
|
1392 |
+
# First, temporarily replace valid parenthetical patterns to protect them
|
1393 |
+
working_text = paragraph
|
1394 |
|
1395 |
+
# Protect common regulatory reference patterns
|
1396 |
+
patterns_to_ignore = [
|
1397 |
+
r'\d+\(\d+\)\([a-z]\)', # matches patterns like 25(1)(a)
|
1398 |
+
r'\d+\([a-z]\)', # matches patterns like 25(a)
|
1399 |
+
r'\([a-z]\)\(\d+\)', # matches patterns like (a)(1)
|
1400 |
+
r'\(\d+\)\([a-z]\)', # matches patterns like (1)(a)
|
1401 |
+
r'\([a-z]\)', # matches single letter references like (a)
|
1402 |
+
r'\(\d+\)', # matches number references like (1)
|
1403 |
+
]
|
1404 |
+
|
1405 |
+
for pattern in patterns_to_ignore:
|
1406 |
+
working_text = re.sub(pattern, lambda m: 'PROTECTED' + str(hash(m.group())), working_text)
|
1407 |
+
|
1408 |
+
# Now check for multiple spaces
|
1409 |
+
matches = re.finditer(r'[ ]{2,}', working_text)
|
1410 |
+
for match in matches:
|
1411 |
+
issues.append({
|
1412 |
+
'incorrect': match.group(),
|
1413 |
+
'context': paragraph.strip(),
|
1414 |
+
'description': "Remove extra spaces"
|
1415 |
+
})
|
1416 |
+
|
1417 |
+
except Exception as e:
|
1418 |
+
self.logger.error(f"Error in spacing check: {str(e)}")
|
1419 |
+
return DocumentCheckResult(success=False, issues=[{'error': f'Spacing check failed: {str(e)}'}])
|
1420 |
|
|
|
|
|
|
|
1421 |
return DocumentCheckResult(success=len(issues) == 0, issues=issues)
|
1422 |
|
1423 |
+
def _format_spacing_issues(self, result: DocumentCheckResult) -> List[str]:
    """Render spacing-check issues as bullet lines.

    Args:
        result: Check result whose ``issues`` attribute holds the entries to
            render. Each entry may be a plain string, a dict with an
            ``'error'`` key, or a dict with ``'description'`` and
            ``'context'`` keys.

    Returns:
        One bullet string per issue, in input order. Empty when
        ``result.issues`` is empty or ``None``.
    """
    formatted_issues = []

    for issue in result.issues or []:
        if isinstance(issue, str):
            # On a str, `'error' in issue` would be a substring test and the
            # subscript lookups below would raise; pass the text through.
            formatted_issues.append(f" • {issue}")
        elif 'error' in issue:
            formatted_issues.append(f" • {issue['error']}")
        else:
            # Tolerate partially populated issue dicts instead of raising KeyError.
            description = issue.get('description', 'Spacing issue')
            context = issue.get('context')
            if context is None:
                formatted_issues.append(f" • {description}")
            else:
                formatted_issues.append(f" • {description} in: \"{context}\"")

    return formatted_issues
|
1437 |
+
|
1438 |
@profile_performance
|
1439 |
def check_abbreviation_usage(self, doc: List[str]) -> DocumentCheckResult:
|
1440 |
"""Check for abbreviation consistency after first definition."""
|
|
|
1633 |
details={'message': f'No patterns defined for {pattern_category}'}
|
1634 |
)
|
1635 |
|
1636 |
+
# Use custom processing function if provided
|
1637 |
if process_func:
|
1638 |
return process_func(doc, patterns)
|
1639 |
|
1640 |
+
# Default processing with deduplication
|
1641 |
+
unique_issues = set() # Using a set to track unique issues
|
1642 |
+
|
1643 |
for paragraph in doc:
|
1644 |
sentences = re.split(r'(?<=[.!?])\s+', paragraph)
|
1645 |
for sentence in sentences:
|
|
|
1650 |
for pattern_config in patterns:
|
1651 |
matches = list(re.finditer(pattern_config.pattern, sentence))
|
1652 |
if matches:
|
1653 |
+
# Add each match as a tuple to ensure uniqueness
|
1654 |
+
for match in matches:
|
1655 |
+
unique_issues.add((
|
1656 |
+
match.group(), # The matched text
|
1657 |
+
pattern_config.description, # The issue description
|
1658 |
+
pattern_config.replacement if hasattr(pattern_config, 'replacement') else None
|
1659 |
+
))
|
1660 |
|
1661 |
+
# Convert unique issues back to the expected format
|
1662 |
+
formatted_issues = [
|
1663 |
+
{
|
1664 |
+
'incorrect': issue[0],
|
1665 |
+
'description': issue[1],
|
1666 |
+
'replacement': issue[2]
|
1667 |
+
}
|
1668 |
+
for issue in sorted(unique_issues) # Sort for consistent output
|
1669 |
+
]
|
1670 |
+
|
1671 |
+
return DocumentCheckResult(success=len(formatted_issues) == 0, issues=formatted_issues)
|
1672 |
|
1673 |
def run_all_checks(self, doc_path: str, doc_type: str, template_type: Optional[str] = None) -> Dict[str, DocumentCheckResult]:
|
1674 |
"""
|
|
|
1797 |
|
1798 |
return sentences
|
1799 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1800 |
@profile_performance
|
1801 |
def check_parentheses(self, doc: List[str]) -> DocumentCheckResult:
|
1802 |
"""
|
1803 |
Check for matching parentheses in the document.
|
1804 |
+
|
1805 |
Args:
|
1806 |
doc (List[str]): List of document paragraphs
|
1807 |
+
|
1808 |
Returns:
|
1809 |
DocumentCheckResult: Result containing any mismatched parentheses issues
|
1810 |
"""
|
|
|
1814 |
issues = []
|
1815 |
|
1816 |
for i, paragraph in enumerate(doc, 1):
|
1817 |
+
if not paragraph.strip(): # Skip empty paragraphs
|
|
|
1818 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1819 |
|
1820 |
+
stack = [] # Track unmatched opening parentheses
|
1821 |
+
sentences = re.split(r'(?<=[.!?])\s+', paragraph) # Split paragraph into sentences
|
1822 |
+
for sentence in sentences:
|
1823 |
+
for j, char in enumerate(sentence):
|
1824 |
+
if char == '(':
|
1825 |
+
stack.append((sentence, j)) # Store sentence and character position
|
1826 |
+
elif char == ')':
|
1827 |
+
if stack:
|
1828 |
+
stack.pop() # Remove matching opening parenthesis
|
1829 |
+
else:
|
1830 |
+
# No matching opening parenthesis
|
1831 |
+
issues.append({
|
1832 |
+
'type': 'missing_opening',
|
1833 |
+
'paragraph': i, # Still tracked but not included in the message
|
1834 |
+
'position': j,
|
1835 |
+
'text': sentence,
|
1836 |
+
'message': f"Add an opening parenthesis to the sentence: \"{sentence.strip()}\""
|
1837 |
+
})
|
1838 |
+
|
1839 |
+
# Check for any unmatched opening parentheses left in the stack
|
1840 |
+
for unmatched in stack:
|
1841 |
+
sentence, pos = unmatched
|
1842 |
issues.append({
|
1843 |
'type': 'missing_closing',
|
1844 |
+
'paragraph': i, # Still tracked but not included in the message
|
1845 |
'position': pos,
|
1846 |
+
'text': sentence,
|
1847 |
+
'message': f"Add a closing parenthesis to the sentence: \"{sentence.strip()}\""
|
1848 |
})
|
1849 |
|
1850 |
return DocumentCheckResult(success=len(issues) == 0, issues=issues)
|
|
|
1855 |
if not self.validate_input(doc):
|
1856 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
1857 |
|
|
|
1858 |
spacing_patterns = self.config_manager.pattern_registry.get('spacing', [])
|
1859 |
+
issues = []
|
1860 |
|
1861 |
+
try:
|
1862 |
+
for paragraph in doc:
|
1863 |
+
if not paragraph.strip() or '\t' in paragraph:
|
1864 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1865 |
|
1866 |
+
for pattern_config in spacing_patterns:
|
1867 |
+
matches = re.finditer(pattern_config.pattern, paragraph)
|
1868 |
+
for match in matches:
|
1869 |
+
groups = match.groups()
|
1870 |
+
description = pattern_config.description.replace('{0}', groups[0]).replace('{1}', groups[1])
|
1871 |
+
|
1872 |
+
context_start = max(0, match.start() - 20)
|
1873 |
+
context_end = min(len(paragraph), match.end() + 20)
|
1874 |
+
context = paragraph[context_start:context_end].strip()
|
1875 |
+
|
1876 |
+
issues.append({
|
1877 |
+
'type': 'spacing',
|
1878 |
+
'incorrect': match.group(),
|
1879 |
+
'context': context,
|
1880 |
+
'description': description
|
1881 |
+
})
|
1882 |
+
|
1883 |
+
except Exception as e:
|
1884 |
+
self.logger.error(f"Error in spacing check: {str(e)}")
|
1885 |
+
return DocumentCheckResult(success=False, issues=[{'error': f'Spacing check failed: {str(e)}'}])
|
1886 |
|
|
|
|
|
|
|
1887 |
return DocumentCheckResult(success=len(issues) == 0, issues=issues)
|
1888 |
|
1889 |
+
def _format_spacing_issues(self, result: DocumentCheckResult) -> List[str]:
    """Turn the issues of a spacing check into bullet-point strings.

    Args:
        result: Check result exposing an ``issues`` sequence. Entries may be
            plain strings, dicts carrying an ``'error'`` message, or dicts
            carrying ``'description'`` and ``'context'`` values.

    Returns:
        A list with one formatted bullet per issue, preserving order; an
        empty list when there are no issues.
    """
    formatted_issues = []

    for issue in result.issues or []:
        if isinstance(issue, str):
            # A bare string cannot be subscripted by key; emit it verbatim.
            formatted_issues.append(f" • {issue}")
            continue

        if 'error' in issue:
            formatted_issues.append(f" • {issue['error']}")
            continue

        # Missing keys fall back to a generic bullet rather than a KeyError.
        description = issue.get('description', 'Spacing issue')
        context = issue.get('context')
        bullet = f" • {description}"
        if context is not None:
            bullet = f"{bullet} in: \"{context}\""
        formatted_issues.append(bullet)

    return formatted_issues
|
1903 |
+
|
1904 |
@profile_performance
|
1905 |
def check_abbreviation_usage(self, doc: List[str]) -> DocumentCheckResult:
|
1906 |
"""Check for abbreviation consistency after first definition."""
|
|
|
1942 |
success = len(inconsistent_uses) == 0
|
1943 |
return DocumentCheckResult(success=success, issues=inconsistent_uses)
|
1944 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1945 |
class DocumentCheckResultsFormatter:
|
1946 |
"""Formats document check results in a user-friendly way with detailed examples and fixes."""
|
1947 |
|
|
|
2028 |
'description': 'Analyzes document spacing patterns to ensure compliance with FAA formatting standards. This includes checking for proper spacing around regulatory references (like "AC 25-1" not "AC25-1"), section symbols (§ 25.1), paragraph references, and multiple spaces between words.',
|
2029 |
'solution': 'Fix spacing issues: remove any missing spaces, double spaces, or inadvertent tabs.',
|
2030 |
'example_fix': {
|
2031 |
+
'before': 'AC25.25 states that SFAR88 and §25.981 require...',
|
2032 |
'after': 'AC 25.25 states that SFAR 88 and § 25.981 require...'
|
2033 |
}
|
2034 |
},
|
|
|
2037 |
'description': 'Examines all date references in your document. The check automatically excludes technical reference numbers that may look like dates to ensure accurate validation of true date references. Note, though, there might be instances in the heading of the document where the date is formatted as "MM/DD/YYYY", which is acceptable. This applies mostly to date formats within the document body.',
|
2038 |
'solution': 'Use the format "Month Day, Year" where appropriate.',
|
2039 |
'example_fix': {
|
2040 |
+
'before': 'This policy statement cancels Policy Statement PS-AIR100-2006-MMPDS, dated 7/25/2006.',
|
2041 |
+
'after': 'This policy statement cancels Policy Statement PS-AIR100-2006-MMPDS, dated July 25, 2006.'
|
2042 |
}
|
2043 |
},
|
2044 |
'placeholders_check': {
|
|
|
2046 |
'description': 'Identifies incomplete content and temporary placeholders that must be finalized before document publication. This includes common placeholder text (like "TBD" or "To be determined"), draft markers, and incomplete sections.',
|
2047 |
'solution': 'Replace all placeholder content with actual content',
|
2048 |
'example_fix': {
|
2049 |
+
'before': 'Pilots must submit the [Insert text] form to the FAA for approval.',
|
2050 |
+
'after': 'Pilots must submit the Report of Eye Evaluation form 8500-7 to the FAA for approval.'
|
2051 |
}
|
2052 |
},
|
2053 |
'parentheses_check': {
|
|
|
2104 |
for heading in unexpected:
|
2105 |
output.append(f" • {heading}")
|
2106 |
|
2107 |
+
return output
|
2108 |
|
2109 |
def _format_period_issues(self, result: DocumentCheckResult) -> List[str]:
|
2110 |
"""Format period check issues consistently."""
|
|
|
2160 |
if isinstance(issue, str):
|
2161 |
return f" • {issue}"
|
2162 |
|
2163 |
+
if 'incorrect' in issue and 'correct' in issue:
|
2164 |
+
return f" • Replace '{issue['incorrect']}' with '{issue['correct']}'"
|
2165 |
+
|
2166 |
if 'incorrect_term' in issue and 'correct_term' in issue:
|
2167 |
return f" • Replace '{issue['incorrect_term']}' with '{issue['correct_term']}'"
|
2168 |
|
|
|
2202 |
|
2203 |
if result.issues:
|
2204 |
for issue in result.issues:
|
2205 |
+
formatted_issues.append(f" • {issue['message']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2206 |
|
2207 |
return formatted_issues
|
2208 |
|