Hoctar77 committed on
Commit
876f61d
·
verified ·
1 Parent(s): fc12fb3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -9
app.py CHANGED
@@ -420,7 +420,7 @@ class FAADocumentChecker(DocumentChecker):
420
  PREDEFINED_ACRONYMS = {
421
  'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
422
  'MD', 'MIL', 'MO', 'No.', 'PDF', 'SAE', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
423
- 'WA', 'XX', 'ZIP'
424
  }
425
 
426
  # Constructor
@@ -820,17 +820,18 @@ class FAADocumentChecker(DocumentChecker):
820
 
821
  # Check terminology patterns
822
  for pattern_config in terminology_patterns:
823
- matches = list(re.finditer(pattern_config.pattern, sentence))
 
824
  for match in matches:
825
  if pattern_config.replacement: # Only if there's a replacement term
826
  unique_issues.add((match.group(), pattern_config.replacement))
827
 
828
  # Check prohibited patterns
829
  for pattern_config in prohibited_patterns:
830
- if re.search(pattern_config.pattern, sentence, re.IGNORECASE):
831
- if pattern_config.replacement: # Only if there's a replacement term
832
- match_text = re.search(pattern_config.pattern, sentence, re.IGNORECASE).group()
833
- unique_issues.add((match_text, pattern_config.replacement))
834
 
835
  # Format issues as simple replacement instructions
836
  formatted_issues = [
@@ -1095,10 +1096,28 @@ class FAADocumentChecker(DocumentChecker):
1095
 
1096
  incorrect_sentences = []
1097
 
 
 
 
 
 
 
 
 
 
1098
  for paragraph in doc:
 
 
 
 
 
 
 
1099
  # Split the paragraph into sentences based on common sentence-ending punctuation
1100
- sentences = re.split(r'(?<=[.!?]) +', paragraph)
1101
  for sentence in sentences:
 
 
1102
  if sentence.endswith('..'):
1103
  incorrect_sentences.append({'sentence': sentence.strip()})
1104
 
@@ -1516,6 +1535,24 @@ class FAADocumentChecker(DocumentChecker):
1516
  List of tuples containing (sentence, parent_paragraph)
1517
  """
1518
  sentences = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1519
  for paragraph in doc:
1520
  paragraph = paragraph.strip()
1521
 
@@ -1527,17 +1564,34 @@ class FAADocumentChecker(DocumentChecker):
1527
  ):
1528
  continue
1529
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1530
  # Split paragraph into sentences
1531
- para_sentences = re.split(r'(?<=[.!?])\s+', paragraph)
1532
 
1533
  # Process each sentence
1534
  for sentence in para_sentences:
 
 
 
1535
  sentence = sentence.strip()
1536
  if skip_empty and not sentence:
1537
  continue
1538
  sentences.append((sentence, paragraph))
1539
 
1540
- return sentences
1541
 
1542
  @profile_performance
1543
  def check_parentheses(self, doc: List[str]) -> DocumentCheckResult:
 
420
  PREDEFINED_ACRONYMS = {
421
  'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
422
  'MD', 'MIL', 'MO', 'No.', 'PDF', 'SAE', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
423
+ 'WA', 'XX', 'ZIP', 'ACO' # Added ACO to ignore in acronym checks
424
  }
425
 
426
  # Constructor
 
820
 
821
  # Check terminology patterns
822
  for pattern_config in terminology_patterns:
823
+ compiled_pattern = pattern_config.compile()
824
+ matches = list(compiled_pattern.finditer(sentence))
825
  for match in matches:
826
  if pattern_config.replacement: # Only if there's a replacement term
827
  unique_issues.add((match.group(), pattern_config.replacement))
828
 
829
  # Check prohibited patterns
830
  for pattern_config in prohibited_patterns:
831
+ compiled_pattern = pattern_config.compile()
832
+ match = compiled_pattern.search(sentence)
833
+ if match and pattern_config.replacement: # Only if there's a replacement term
834
+ unique_issues.add((match.group(), pattern_config.replacement))
835
 
836
  # Format issues as simple replacement instructions
837
  formatted_issues = [
 
1096
 
1097
  incorrect_sentences = []
1098
 
1099
+ # Common abbreviations that end with a period but don't end sentences
1100
+ abbreviations = {
1101
+ 'U.S.C.', 'U.S.', 'CFR', 'e.g.', 'i.e.', 'etc.', 'vs.', 'Dr.', 'Mr.',
1102
+ 'Mrs.', 'Ms.', 'Prof.', 'Ph.D.', 'M.D.', 'B.A.', 'M.A.', 'Ph.D.'
1103
+ }
1104
+
1105
+ # Create a regex pattern that matches these abbreviations
1106
+ abbr_pattern = '|'.join(re.escape(abbr) for abbr in abbreviations)
1107
+
1108
  for paragraph in doc:
1109
+ # First, protect abbreviations so their periods are not treated as sentence boundaries
1110
+ protected_paragraph = re.sub(
1111
+ f'({abbr_pattern})',
1112
+ lambda m: m.group(1).replace('.', 'ABBR_DOT'),
1113
+ paragraph
1114
+ )
1115
+
1116
  # Split the paragraph into sentences based on common sentence-ending punctuation
1117
+ sentences = re.split(r'(?<=[.!?]) +', protected_paragraph)
1118
  for sentence in sentences:
1119
+ # Restore the periods in abbreviations
1120
+ sentence = sentence.replace('ABBR_DOT', '.')
1121
  if sentence.endswith('..'):
1122
  incorrect_sentences.append({'sentence': sentence.strip()})
1123
 
 
1535
  List of tuples containing (sentence, parent_paragraph)
1536
  """
1537
  sentences = []
1538
+
1539
+ # Common abbreviations that end with a period but don't end sentences
1540
+ abbreviations = {
1541
+ 'U.S.C.', 'U.S.', 'CFR', 'e.g.', 'i.e.', 'etc.', 'vs.', 'Dr.', 'Mr.',
1542
+ 'Mrs.', 'Ms.', 'Prof.', 'Ph.D.', 'M.D.', 'B.A.', 'M.A.', 'Ph.D.'
1543
+ }
1544
+
1545
+ # Legal citation patterns that shouldn't be split
1546
+ legal_citations = [
1547
+ r'\d+ U\.S\.C\. § \d+\([a-zA-Z0-9]*\)(?:\([a-zA-Z0-9]*\))?', # e.g., 5 U.S.C. § 533(a)(1)
1548
+ r'\d+ CFR § \d+\.\d+', # e.g., 14 CFR § 1.1
1549
+ r'\d+ CFR part \d+' # e.g., 14 CFR part 1
1550
+ ]
1551
+
1552
+ # Create a regex pattern that matches these abbreviations
1553
+ abbr_pattern = '|'.join(re.escape(abbr) for abbr in abbreviations)
1554
+ legal_pattern = '|'.join(legal_citations)
1555
+
1556
  for paragraph in doc:
1557
  paragraph = paragraph.strip()
1558
 
 
1564
  ):
1565
  continue
1566
 
1567
+ # First, protect legal citations from being split
1568
+ protected_paragraph = re.sub(
1569
+ f'({legal_pattern})',
1570
+ lambda m: m.group(1).replace('.', 'LEGAL_DOT'),
1571
+ paragraph
1572
+ )
1573
+
1574
+ # Then protect abbreviations from being split
1575
+ protected_paragraph = re.sub(
1576
+ f'({abbr_pattern})',
1577
+ lambda m: m.group(1).replace('.', 'ABBR_DOT'),
1578
+ protected_paragraph
1579
+ )
1580
+
1581
  # Split paragraph into sentences
1582
+ para_sentences = re.split(r'(?<=[.!?])\s+', protected_paragraph)
1583
 
1584
  # Process each sentence
1585
  for sentence in para_sentences:
1586
+ # Restore the periods in legal citations and abbreviations
1587
+ sentence = sentence.replace('LEGAL_DOT', '.')
1588
+ sentence = sentence.replace('ABBR_DOT', '.')
1589
  sentence = sentence.strip()
1590
  if skip_empty and not sentence:
1591
  continue
1592
  sentences.append((sentence, paragraph))
1593
 
1594
+ return sentences
1595
 
1596
  @profile_performance
1597
  def check_parentheses(self, doc: List[str]) -> DocumentCheckResult: