Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -354,6 +354,16 @@ class DocumentCheckerConfig:
|
|
354 |
description="Ignore 'title 14, Code of Federal Regulations (14 CFR)'",
|
355 |
is_error=False
|
356 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
357 |
PatternConfig(
|
358 |
pattern=r'\bAD Compliance Team \(AD CRT\)\b',
|
359 |
description="Ignore 'AD Compliance Team (AD CRT)'",
|
@@ -658,7 +668,7 @@ class FAADocumentChecker(DocumentChecker):
|
|
658 |
PREDEFINED_ACRONYMS = {
|
659 |
'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
|
660 |
'MD', 'MIL', 'MO', 'No.', 'PDF', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
|
661 |
-
'WA', 'ZIP'
|
662 |
}
|
663 |
|
664 |
# Constructor
|
@@ -1078,48 +1088,47 @@ class FAADocumentChecker(DocumentChecker):
|
|
1078 |
if not self.validate_input(doc):
|
1079 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
1080 |
|
1081 |
-
section_patterns = self.config_manager.pattern_registry.get('section_symbol', [])
|
1082 |
issues = []
|
1083 |
-
|
1084 |
for paragraph in doc:
|
1085 |
sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip())
|
1086 |
|
1087 |
for sentence in sentences:
|
1088 |
sentence = sentence.strip()
|
1089 |
-
|
1090 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1091 |
|
1092 |
-
|
1093 |
-
|
1094 |
-
|
1095 |
-
issues.append({
|
1096 |
-
'incorrect': section_ref,
|
1097 |
-
'correct': f"Section {section_ref.lstrip('§')}",
|
1098 |
-
'is_sentence_start': True # Flag to indicate sentence start issue
|
1099 |
-
})
|
1100 |
|
1101 |
-
|
1102 |
-
|
1103 |
-
|
1104 |
-
incorrect = match.group()
|
1105 |
-
# Remove § symbol without adding 'Section'
|
1106 |
-
correct = incorrect.replace('§ ', '')
|
1107 |
-
issues.append({
|
1108 |
-
'incorrect': incorrect,
|
1109 |
-
'correct': correct
|
1110 |
-
})
|
1111 |
|
1112 |
-
|
1113 |
-
|
1114 |
-
|
1115 |
-
|
1116 |
-
|
1117 |
-
|
1118 |
-
|
1119 |
-
|
1120 |
-
}
|
|
|
1121 |
|
1122 |
-
return DocumentCheckResult(success=
|
1123 |
|
1124 |
@profile_performance
|
1125 |
def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult:
|
|
|
354 |
description="Ignore 'title 14, Code of Federal Regulations (14 CFR)'",
|
355 |
is_error=False
|
356 |
),
|
357 |
+
PatternConfig(
|
358 |
+
pattern=r'\btitle 49 of the United States Code \(49 U.S.C.\)\b',
|
359 |
+
description="Ignore 'title 49 of the United States Code (49 U.S.C.)'",
|
360 |
+
is_error=False
|
361 |
+
),
|
362 |
+
PatternConfig(
|
363 |
+
pattern=r'\btitle 49, United States Code \(49 U.S.C.\)\b',
|
364 |
+
description="Ignore 'title 49, United States Code (49 U.S.C.)'",
|
365 |
+
is_error=False
|
366 |
+
),
|
367 |
PatternConfig(
|
368 |
pattern=r'\bAD Compliance Team \(AD CRT\)\b',
|
369 |
description="Ignore 'AD Compliance Team (AD CRT)'",
|
|
|
668 |
PREDEFINED_ACRONYMS = {
|
669 |
'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
|
670 |
'MD', 'MIL', 'MO', 'No.', 'PDF', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
|
671 |
+
'WA', 'XX', 'ZIP'
|
672 |
}
|
673 |
|
674 |
# Constructor
|
|
|
1088 |
if not self.validate_input(doc):
|
1089 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
1090 |
|
|
|
1091 |
issues = []
|
1092 |
+
|
1093 |
for paragraph in doc:
|
1094 |
sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip())
|
1095 |
|
1096 |
for sentence in sentences:
|
1097 |
sentence = sentence.strip()
|
1098 |
+
|
1099 |
+
# Check 14 CFR citations only
|
1100 |
+
cfr_matches = re.finditer(r'\b14 CFR §\s*(\d+\.\d+)\b', sentence)
|
1101 |
+
for match in cfr_matches:
|
1102 |
+
# Skip if this is part of a U.S.C. citation
|
1103 |
+
if not re.search(r'U\.S\.C\.\s*§', sentence):
|
1104 |
+
full_match = match.group(0)
|
1105 |
+
section_num = match.group(1)
|
1106 |
+
issues.append({
|
1107 |
+
'incorrect': full_match,
|
1108 |
+
'correct': f'14 CFR {section_num}',
|
1109 |
+
'description': f"Replace '{full_match}' with '14 CFR {section_num}'"
|
1110 |
+
})
|
1111 |
|
1112 |
+
# Skip any checks for sections that are part of U.S.C. citations
|
1113 |
+
if re.search(r'U\.S\.C\.\s*(?:§|§§)', sentence):
|
1114 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
1115 |
|
1116 |
+
# Skip any checks for sections that are part of 14 CFR citations
|
1117 |
+
if re.search(r'14 CFR\s*§', sentence):
|
1118 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1119 |
|
1120 |
+
# Check section symbol at start of sentence
|
1121 |
+
if sentence.startswith('§'):
|
1122 |
+
match = re.match(r'^§\s*(\d+(?:\.\d+)?)', sentence)
|
1123 |
+
if match:
|
1124 |
+
section_num = match.group(1)
|
1125 |
+
issues.append({
|
1126 |
+
'incorrect': f'§ {section_num}',
|
1127 |
+
'correct': f'Section {section_num}',
|
1128 |
+
'description': f"Replace '§ {section_num}' with 'Section {section_num}'"
|
1129 |
+
})
|
1130 |
|
1131 |
+
return DocumentCheckResult(success=len(issues) == 0, issues=issues)
|
1132 |
|
1133 |
@profile_performance
|
1134 |
def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult:
|