Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -55,7 +55,7 @@ HEADING_WORDS = {
|
|
55 |
# Predefined Acronyms
|
56 |
PREDEFINED_ACRONYMS = {
|
57 |
'CFR', 'U.S.', 'USA', 'US', 'U.S.C', 'e.g.', 'i.e.', 'FAQ', 'No.', 'ZIP', 'PDF', 'SSN',
|
58 |
-
'DC', 'MA', 'WA', 'TX', 'MO'
|
59 |
}
|
60 |
|
61 |
# Configuration Constants
|
@@ -451,6 +451,16 @@ class DocumentCheckerConfig:
|
|
451 |
"""
|
452 |
return {
|
453 |
'terminology': [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
454 |
PatternConfig(
|
455 |
pattern=r'\bUSC\b',
|
456 |
description="USC should be U.S.C.",
|
@@ -924,10 +934,10 @@ class FAADocumentChecker(DocumentChecker):
|
|
924 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
925 |
|
926 |
# Common words that might appear in uppercase but aren't acronyms
|
927 |
-
heading_words = self.config_manager.config.get('heading_words', HEADING_WORDS)
|
928 |
|
929 |
# Standard acronyms that don't need to be defined
|
930 |
-
predefined_acronyms = self.config_manager.config.get('predefined_acronyms', PREDEFINED_ACRONYMS)
|
931 |
|
932 |
# Tracking structures
|
933 |
defined_acronyms = {} # Stores definition info
|
@@ -936,6 +946,7 @@ class FAADocumentChecker(DocumentChecker):
|
|
936 |
|
937 |
# Patterns
|
938 |
defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
|
|
|
939 |
acronym_pattern = re.compile(r'(?<!\()\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
|
940 |
|
941 |
for paragraph in doc:
|
@@ -975,37 +986,69 @@ class FAADocumentChecker(DocumentChecker):
|
|
975 |
|
976 |
if acronym not in defined_acronyms:
|
977 |
# Undefined acronym used
|
978 |
-
issues.append(
|
979 |
-
'type': 'undefined_acronym',
|
980 |
-
'acronym': acronym,
|
981 |
-
'sentence': paragraph.strip()
|
982 |
-
})
|
983 |
else:
|
984 |
# Mark as used
|
985 |
defined_acronyms[acronym]['used'] = True
|
986 |
used_acronyms.add(acronym)
|
987 |
|
988 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
989 |
unused_acronyms = [
|
990 |
{
|
991 |
-
'type': 'unused_acronym',
|
992 |
'acronym': acronym,
|
993 |
'full_term': data['full_term'],
|
994 |
'defined_at': data['defined_at']
|
995 |
}
|
996 |
for acronym, data in defined_acronyms.items()
|
997 |
-
if not
|
998 |
]
|
999 |
|
1000 |
-
#
|
1001 |
-
|
1002 |
-
issues.extend(unused_acronyms)
|
1003 |
-
|
1004 |
-
success = len(issues) == 0
|
1005 |
-
|
1006 |
-
return DocumentCheckResult(success=success, issues=issues)
|
1007 |
-
|
1008 |
|
|
|
|
|
1009 |
@profile_performance
|
1010 |
def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
|
1011 |
"""
|
@@ -1706,6 +1749,7 @@ class FAADocumentChecker(DocumentChecker):
|
|
1706 |
('heading_title_check', lambda: self.heading_title_check(doc, doc_type)),
|
1707 |
('heading_title_period_check', lambda: self.heading_title_period_check(doc, doc_type)),
|
1708 |
('acronym_check', lambda: self.acronym_check(doc)),
|
|
|
1709 |
('terminology_check', lambda: self.check_terminology(doc)),
|
1710 |
('section_symbol_usage_check', lambda: self.check_section_symbol_usage(doc)),
|
1711 |
('caption_check_table', lambda: self.caption_check(doc, doc_type, 'Table')),
|
@@ -1777,6 +1821,15 @@ class DocumentCheckResultsFormatter:
|
|
1777 |
'after': 'This order establishes general Federal Aviation Administration (FAA) organizational policies.'
|
1778 |
}
|
1779 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1780 |
'terminology_check': {
|
1781 |
'title': 'Incorrect Terminology',
|
1782 |
'description': 'Evaluates document text against the various style manuals and orders to identify non-compliant terminology, ambiguous references, and outdated phrases. This includes checking for prohibited relative references (like "above" or "below"), proper legal terminology (like "must" instead of "shall"), and consistent formatting of regulatory citations. The check ensures precise, unambiguous communication that meets current FAA documentation requirements.',
|
@@ -1909,6 +1962,23 @@ class DocumentCheckResultsFormatter:
|
|
1909 |
|
1910 |
return output
|
1911 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1912 |
def _format_caption_issues(self, result: DocumentCheckResult) -> List[str]:
|
1913 |
"""Format caption issues consistently."""
|
1914 |
output = []
|
@@ -2087,7 +2157,7 @@ class DocumentCheckResultsFormatter:
|
|
2087 |
"italics": True,
|
2088 |
"quotes": False,
|
2089 |
"description": "For Advisory Circulars, referenced document titles should be italicized but not quoted",
|
2090 |
-
"example": "See AC 25.1309-1B,
|
2091 |
},
|
2092 |
"quotes_only": {
|
2093 |
"types": [
|
@@ -2137,6 +2207,18 @@ class DocumentCheckResultsFormatter:
|
|
2137 |
|
2138 |
output = []
|
2139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2140 |
# Header
|
2141 |
output.append(f"\n{Fore.CYAN}{'='*80}")
|
2142 |
output.append(f"Document Check Results Summary")
|
@@ -2179,6 +2261,8 @@ class DocumentCheckResultsFormatter:
|
|
2179 |
output.extend(self._format_reference_issues(result))
|
2180 |
elif check_name in ['caption_check_table', 'caption_check_figure']:
|
2181 |
output.extend(self._format_caption_issues(result))
|
|
|
|
|
2182 |
else:
|
2183 |
# Standard issue formatting
|
2184 |
formatted_issues = [self._format_standard_issue(issue) for issue in result.issues[:7]]
|
|
|
55 |
# Predefined Acronyms
|
56 |
# Acronyms and abbreviations accepted without a definition in the document.
PREDEFINED_ACRONYMS = {
    'CFR',
    'U.S.',
    'USA',
    'US',
    'U.S.C',
    'e.g.',
    'i.e.',
    'FAQ',
    'No.',
    'ZIP',
    'PDF',
    'SSN',
    'DC',
    'MD',
    'MA',
    'WA',
    'TX',
    'MO',
    'FAA IR-M',
    'DOT',
}
|
60 |
|
61 |
# Configuration Constants
|
|
|
451 |
"""
|
452 |
return {
|
453 |
'terminology': [
|
454 |
+
PatternConfig(
|
455 |
+
pattern=r'\btitle 14 of the Code of Federal Regulations \(14 CFR\)\b',
|
456 |
+
description="Ignore 'title 14 of the Code of Federal Regulations (14 CFR)'",
|
457 |
+
is_error=False # Set to False to ignore this phrase
|
458 |
+
),
|
459 |
+
PatternConfig(
|
460 |
+
pattern=r'\btitle 14, Code of Federal Regulations \(14 CFR\)\b',
|
461 |
+
description="Ignore 'title 14, Code of Federal Regulations (14 CFR)'",
|
462 |
+
is_error=False
|
463 |
+
),
|
464 |
PatternConfig(
|
465 |
pattern=r'\bUSC\b',
|
466 |
description="USC should be U.S.C.",
|
|
|
934 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
935 |
|
936 |
# Common words that might appear in uppercase but aren't acronyms
|
937 |
+
heading_words = self.config_manager.config.get('heading_words', self.HEADING_WORDS)
|
938 |
|
939 |
# Standard acronyms that don't need to be defined
|
940 |
+
predefined_acronyms = self.config_manager.config.get('predefined_acronyms', self.PREDEFINED_ACRONYMS)
|
941 |
|
942 |
# Tracking structures
|
943 |
defined_acronyms = {} # Stores definition info
|
|
|
946 |
|
947 |
# Patterns
|
948 |
defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
|
949 |
+
# Modified acronym pattern
|
950 |
acronym_pattern = re.compile(r'(?<!\()\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
|
951 |
|
952 |
for paragraph in doc:
|
|
|
986 |
|
987 |
if acronym not in defined_acronyms:
|
988 |
# Undefined acronym used
|
989 |
+
issues.append(acronym) # Add only the acronym, not the sentence
|
|
|
|
|
|
|
|
|
990 |
else:
|
991 |
# Mark as used
|
992 |
defined_acronyms[acronym]['used'] = True
|
993 |
used_acronyms.add(acronym)
|
994 |
|
995 |
+
# Define success based on whether there are any undefined acronyms
|
996 |
+
success = len(issues) == 0
|
997 |
+
|
998 |
+
# Return the result with only undefined acronyms
|
999 |
+
return DocumentCheckResult(success=success, issues=list(set(issues)))
|
1000 |
+
|
1001 |
+
@profile_performance
def acronym_usage_check(self, doc: List[str]) -> DocumentCheckResult:
    """
    Report acronyms that the document defines but never uses.

    A definition is text of the form ``Full Term (ABC)``. A use is any
    standalone run of two or more capital letters that equals a defined
    acronym and is not the parenthesised marker of a definition. Uses may
    appear before or after the definition.

    Args:
        doc: The document as a list of paragraph strings.

    Returns:
        DocumentCheckResult: ``success`` is True when every defined acronym
        is used at least once. ``issues`` holds one dict per unused acronym
        with the keys ``acronym``, ``full_term`` and ``defined_at``. When
        ``validate_input`` rejects ``doc``, a failed result with a single
        ``{'error': ...}`` issue is returned instead.
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])

    # Definition, e.g. "Environmental Protection Agency (EPA)".
    # Group 1 is lazy but starts at the leftmost word boundary, so it can
    # also contain the words that precede the actual term in the sentence.
    defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')

    # Candidate usage, e.g. "FAA", "EPA".
    acronym_pattern = re.compile(r'\b[A-Z]{2,}\b')

    defined_acronyms = {}
    candidate_tokens = set()

    for paragraph in doc:
        # Only the first definition of each acronym is recorded.
        for full_term, acronym in defined_pattern.findall(paragraph):
            if acronym not in defined_acronyms:
                defined_acronyms[acronym] = {
                    'full_term': full_term.strip(),
                    'defined_at': paragraph.strip(),
                }

        # Remove only the "(ABC)" marker of each definition and keep the text
        # in front of it. Removing the whole match would also erase genuine
        # uses of other acronyms earlier in the same sentence, because
        # group 1 swallows that text.
        text_without_markers = defined_pattern.sub(
            lambda match: match.group(1) + ' ', paragraph
        )
        candidate_tokens.update(acronym_pattern.findall(text_without_markers))

    # Defined acronyms that never occur outside a definition marker,
    # in order of first definition.
    unused_acronyms = [
        {
            'acronym': acronym,
            'full_term': data['full_term'],
            'defined_at': data['defined_at'],
        }
        for acronym, data in defined_acronyms.items()
        if acronym not in candidate_tokens
    ]

    # The check passes only when nothing is left unused.
    return DocumentCheckResult(success=not unused_acronyms, issues=unused_acronyms)
|
1051 |
+
|
1052 |
@profile_performance
|
1053 |
def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
|
1054 |
"""
|
|
|
1749 |
('heading_title_check', lambda: self.heading_title_check(doc, doc_type)),
|
1750 |
('heading_title_period_check', lambda: self.heading_title_period_check(doc, doc_type)),
|
1751 |
('acronym_check', lambda: self.acronym_check(doc)),
|
1752 |
+
('acronym_usage_check', lambda: self.acronym_usage_check(doc)),
|
1753 |
('terminology_check', lambda: self.check_terminology(doc)),
|
1754 |
('section_symbol_usage_check', lambda: self.check_section_symbol_usage(doc)),
|
1755 |
('caption_check_table', lambda: self.caption_check(doc, doc_type, 'Table')),
|
|
|
1821 |
'after': 'This order establishes general Federal Aviation Administration (FAA) organizational policies.'
|
1822 |
}
|
1823 |
},
|
1824 |
+
'acronym_usage_check': {
|
1825 |
+
'title': 'Unused Acronym Definitions',
|
1826 |
+
'description': 'Ensures all acronyms defined in the document are subsequently used. If a term is defined but not used, it should not be defined.',
|
1827 |
+
'solution': 'Remove definitions for acronyms that are not used later in the document.',
|
1828 |
+
'example_fix': {
|
1829 |
+
'before': 'Airworthiness Directive (AD) requirements are critical.',
|
1830 |
+
'after': 'Remove "Airworthiness Directive (AD)" if "AD" is not used elsewhere.'
|
1831 |
+
}
|
1832 |
+
},
|
1833 |
'terminology_check': {
|
1834 |
'title': 'Incorrect Terminology',
|
1835 |
'description': 'Evaluates document text against the various style manuals and orders to identify non-compliant terminology, ambiguous references, and outdated phrases. This includes checking for prohibited relative references (like "above" or "below"), proper legal terminology (like "must" instead of "shall"), and consistent formatting of regulatory citations. The check ensures precise, unambiguous communication that meets current FAA documentation requirements.',
|
|
|
1962 |
|
1963 |
return output
|
1964 |
|
1965 |
+
def _format_unused_acronym_issues(self, result: DocumentCheckResult) -> List[str]:
    """
    Format the issues of the unused-acronym check as bullet lines.

    Args:
        result: The DocumentCheckResult object containing issues.

    Returns:
        List[str]: One line per dict issue. An issue that carries only an
        'error' entry is shown as that error text. Any other dict is shown
        as an unused-acronym message built from its 'acronym' entry.
        Issues that are not dicts are skipped.
    """
    output = []
    for issue in result.issues:
        if not isinstance(issue, dict):
            continue

        # The check reports invalid input as {'error': ...}; show that
        # message instead of presenting it as an unknown unused acronym.
        if 'error' in issue and 'acronym' not in issue:
            output.append(f" • {issue['error']}")
            continue

        acronym = issue.get('acronym', 'Unknown Acronym')
        output.append(f" • Acronym '{acronym}' was defined but never used.")
    return output
|
1981 |
+
|
1982 |
def _format_caption_issues(self, result: DocumentCheckResult) -> List[str]:
|
1983 |
"""Format caption issues consistently."""
|
1984 |
output = []
|
|
|
2157 |
"italics": True,
|
2158 |
"quotes": False,
|
2159 |
"description": "For Advisory Circulars, referenced document titles should be italicized but not quoted",
|
2160 |
+
"example": "See AC 25.1309-1B, <i>System Design and Analysis</i>, for information on X."
|
2161 |
},
|
2162 |
"quotes_only": {
|
2163 |
"types": [
|
|
|
2207 |
|
2208 |
output = []
|
2209 |
|
2210 |
+
self.issue_categories['acronym_usage_check'] = {
|
2211 |
+
'title': 'Unused Acronym Definitions',
|
2212 |
+
'description': 'Ensures all acronyms defined in the document are subsequently used.',
|
2213 |
+
'solution': 'Remove definitions for acronyms that are not used later in the document.',
|
2214 |
+
'example_fix': {
|
2215 |
+
'before': 'Airworthiness Directive (AD) requirements are critical.',
|
2216 |
+
'after': 'Remove "Airworthiness Directive (AD)" if "AD" is not used elsewhere.'
|
2217 |
+
}
|
2218 |
+
}
|
2219 |
+
|
2220 |
+
output = []
|
2221 |
+
|
2222 |
# Header
|
2223 |
output.append(f"\n{Fore.CYAN}{'='*80}")
|
2224 |
output.append(f"Document Check Results Summary")
|
|
|
2261 |
output.extend(self._format_reference_issues(result))
|
2262 |
elif check_name in ['caption_check_table', 'caption_check_figure']:
|
2263 |
output.extend(self._format_caption_issues(result))
|
2264 |
+
elif check_name == 'acronym_usage_check':
|
2265 |
+
output.extend(self._format_unused_acronym_issues(result))
|
2266 |
else:
|
2267 |
# Standard issue formatting
|
2268 |
formatted_issues = [self._format_standard_issue(issue) for issue in result.issues[:7]]
|