Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -420,7 +420,7 @@ class FAADocumentChecker(DocumentChecker):
|
|
420 |
PREDEFINED_ACRONYMS = {
|
421 |
'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
|
422 |
'MD', 'MIL', 'MO', 'No.', 'PDF', 'SAE', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
|
423 |
-
'WA', 'XX', 'ZIP'
|
424 |
}
|
425 |
|
426 |
# Constructor
|
@@ -820,17 +820,18 @@ class FAADocumentChecker(DocumentChecker):
|
|
820 |
|
821 |
# Check terminology patterns
|
822 |
for pattern_config in terminology_patterns:
|
823 |
-
|
|
|
824 |
for match in matches:
|
825 |
if pattern_config.replacement: # Only if there's a replacement term
|
826 |
unique_issues.add((match.group(), pattern_config.replacement))
|
827 |
|
828 |
# Check prohibited patterns
|
829 |
for pattern_config in prohibited_patterns:
|
830 |
-
|
831 |
-
|
832 |
-
|
833 |
-
|
834 |
|
835 |
# Format issues as simple replacement instructions
|
836 |
formatted_issues = [
|
@@ -1095,10 +1096,28 @@ class FAADocumentChecker(DocumentChecker):
|
|
1095 |
|
1096 |
incorrect_sentences = []
|
1097 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1098 |
for paragraph in doc:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1099 |
# Split the paragraph into sentences based on common sentence-ending punctuation
|
1100 |
-
sentences = re.split(r'(?<=[.!?]) +',
|
1101 |
for sentence in sentences:
|
|
|
|
|
1102 |
if sentence.endswith('..'):
|
1103 |
incorrect_sentences.append({'sentence': sentence.strip()})
|
1104 |
|
@@ -1516,6 +1535,24 @@ class FAADocumentChecker(DocumentChecker):
|
|
1516 |
List of tuples containing (sentence, parent_paragraph)
|
1517 |
"""
|
1518 |
sentences = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1519 |
for paragraph in doc:
|
1520 |
paragraph = paragraph.strip()
|
1521 |
|
@@ -1527,17 +1564,34 @@ class FAADocumentChecker(DocumentChecker):
|
|
1527 |
):
|
1528 |
continue
|
1529 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1530 |
# Split paragraph into sentences
|
1531 |
-
para_sentences = re.split(r'(?<=[.!?])\s+',
|
1532 |
|
1533 |
# Process each sentence
|
1534 |
for sentence in para_sentences:
|
|
|
|
|
|
|
1535 |
sentence = sentence.strip()
|
1536 |
if skip_empty and not sentence:
|
1537 |
continue
|
1538 |
sentences.append((sentence, paragraph))
|
1539 |
|
1540 |
-
return sentences
|
1541 |
|
1542 |
@profile_performance
|
1543 |
def check_parentheses(self, doc: List[str]) -> DocumentCheckResult:
|
|
|
420 |
PREDEFINED_ACRONYMS = {
|
421 |
'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
|
422 |
'MD', 'MIL', 'MO', 'No.', 'PDF', 'SAE', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
|
423 |
+
'WA', 'XX', 'ZIP', 'ACO' # Added ACO to ignore in acronym checks
|
424 |
}
|
425 |
|
426 |
# Constructor
|
|
|
820 |
|
821 |
# Check terminology patterns
|
822 |
for pattern_config in terminology_patterns:
|
823 |
+
compiled_pattern = pattern_config.compile()
|
824 |
+
matches = list(compiled_pattern.finditer(sentence))
|
825 |
for match in matches:
|
826 |
if pattern_config.replacement: # Only if there's a replacement term
|
827 |
unique_issues.add((match.group(), pattern_config.replacement))
|
828 |
|
829 |
# Check prohibited patterns
|
830 |
for pattern_config in prohibited_patterns:
|
831 |
+
compiled_pattern = pattern_config.compile()
|
832 |
+
match = compiled_pattern.search(sentence)
|
833 |
+
if match and pattern_config.replacement: # Only if there's a replacement term
|
834 |
+
unique_issues.add((match.group(), pattern_config.replacement))
|
835 |
|
836 |
# Format issues as simple replacement instructions
|
837 |
formatted_issues = [
|
|
|
1096 |
|
1097 |
incorrect_sentences = []
|
1098 |
|
1099 |
+
# Common abbreviations (most ending in a period) that must not be treated as sentence endings
|
1100 |
+
abbreviations = {
|
1101 |
+
'U.S.C.', 'U.S.', 'CFR', 'e.g.', 'i.e.', 'etc.', 'vs.', 'Dr.', 'Mr.',
|
1102 |
+
'Mrs.', 'Ms.', 'Prof.', 'Ph.D.', 'M.D.', 'B.A.', 'M.A.', 'Ph.D.'
|
1103 |
+
}
|
1104 |
+
|
1105 |
+
# Create a regex pattern that matches these abbreviations
|
1106 |
+
abbr_pattern = '|'.join(re.escape(abbr) for abbr in abbreviations)
|
1107 |
+
|
1108 |
for paragraph in doc:
|
1109 |
+
# First, protect abbreviations from being checked
|
1110 |
+
protected_paragraph = re.sub(
|
1111 |
+
f'({abbr_pattern})',
|
1112 |
+
lambda m: m.group(1).replace('.', 'ABBR_DOT'),
|
1113 |
+
paragraph
|
1114 |
+
)
|
1115 |
+
|
1116 |
# Split the paragraph into sentences based on common sentence-ending punctuation
|
1117 |
+
sentences = re.split(r'(?<=[.!?]) +', protected_paragraph)
|
1118 |
for sentence in sentences:
|
1119 |
+
# Restore the periods in abbreviations
|
1120 |
+
sentence = sentence.replace('ABBR_DOT', '.')
|
1121 |
if sentence.endswith('..'):
|
1122 |
incorrect_sentences.append({'sentence': sentence.strip()})
|
1123 |
|
|
|
1535 |
List of tuples containing (sentence, parent_paragraph)
|
1536 |
"""
|
1537 |
sentences = []
|
1538 |
+
|
1539 |
+
# Common abbreviations (most ending in a period) that must not be treated as sentence endings
|
1540 |
+
abbreviations = {
|
1541 |
+
'U.S.C.', 'U.S.', 'CFR', 'e.g.', 'i.e.', 'etc.', 'vs.', 'Dr.', 'Mr.',
|
1542 |
+
'Mrs.', 'Ms.', 'Prof.', 'Ph.D.', 'M.D.', 'B.A.', 'M.A.', 'Ph.D.'
|
1543 |
+
}
|
1544 |
+
|
1545 |
+
# Legal citation patterns that shouldn't be split
|
1546 |
+
legal_citations = [
|
1547 |
+
r'\d+ U\.S\.C\. § \d+\([a-zA-Z0-9]*\)(?:\([a-zA-Z0-9]*\))?', # e.g., 5 U.S.C. § 533(a)(1)
|
1548 |
+
r'\d+ CFR § \d+\.\d+', # e.g., 14 CFR § 1.1
|
1549 |
+
r'\d+ CFR part \d+' # e.g., 14 CFR part 1
|
1550 |
+
]
|
1551 |
+
|
1552 |
+
# Create regex alternation patterns: one for the abbreviations, one for the legal citations
|
1553 |
+
abbr_pattern = '|'.join(re.escape(abbr) for abbr in abbreviations)
|
1554 |
+
legal_pattern = '|'.join(legal_citations)
|
1555 |
+
|
1556 |
for paragraph in doc:
|
1557 |
paragraph = paragraph.strip()
|
1558 |
|
|
|
1564 |
):
|
1565 |
continue
|
1566 |
|
1567 |
+
# First, protect legal citations from being split
|
1568 |
+
protected_paragraph = re.sub(
|
1569 |
+
f'({legal_pattern})',
|
1570 |
+
lambda m: m.group(1).replace('.', 'LEGAL_DOT'),
|
1571 |
+
paragraph
|
1572 |
+
)
|
1573 |
+
|
1574 |
+
# Then protect abbreviations from being split
|
1575 |
+
protected_paragraph = re.sub(
|
1576 |
+
f'({abbr_pattern})',
|
1577 |
+
lambda m: m.group(1).replace('.', 'ABBR_DOT'),
|
1578 |
+
protected_paragraph
|
1579 |
+
)
|
1580 |
+
|
1581 |
# Split paragraph into sentences
|
1582 |
+
para_sentences = re.split(r'(?<=[.!?])\s+', protected_paragraph)
|
1583 |
|
1584 |
# Process each sentence
|
1585 |
for sentence in para_sentences:
|
1586 |
+
# Restore the periods in legal citations and abbreviations
|
1587 |
+
sentence = sentence.replace('LEGAL_DOT', '.')
|
1588 |
+
sentence = sentence.replace('ABBR_DOT', '.')
|
1589 |
sentence = sentence.strip()
|
1590 |
if skip_empty and not sentence:
|
1591 |
continue
|
1592 |
sentences.append((sentence, paragraph))
|
1593 |
|
1594 |
+
return sentences
|
1595 |
|
1596 |
@profile_performance
|
1597 |
def check_parentheses(self, doc: List[str]) -> DocumentCheckResult:
|