Spaces:

Hoctar77
/

DocumentCheckerTool

Sleeping

App Files Files Community

Hoctar77 committed on Apr 20

Commit

876f61d

verified ·

1 Parent(s): fc12fb3

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -9

app.py CHANGED Viewed

@@ -420,7 +420,7 @@ class FAADocumentChecker(DocumentChecker):
     PREDEFINED_ACRONYMS = {
         'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
         'MD', 'MIL', 'MO', 'No.', 'PDF', 'SAE', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
-        'WA', 'XX', 'ZIP'
     }
     # Constructor
@@ -820,17 +820,18 @@ class FAADocumentChecker(DocumentChecker):
                 # Check terminology patterns
                 for pattern_config in terminology_patterns:
-                    matches = list(re.finditer(pattern_config.pattern, sentence))
                     for match in matches:
                         if pattern_config.replacement:  # Only if there's a replacement term
                             unique_issues.add((match.group(), pattern_config.replacement))
                 # Check prohibited patterns
                 for pattern_config in prohibited_patterns:
-                    if re.search(pattern_config.pattern, sentence, re.IGNORECASE):
-                        if pattern_config.replacement:  # Only if there's a replacement term
-                            match_text = re.search(pattern_config.pattern, sentence, re.IGNORECASE).group()
-                            unique_issues.add((match_text, pattern_config.replacement))
         # Format issues as simple replacement instructions
         formatted_issues = [
@@ -1095,10 +1096,28 @@ class FAADocumentChecker(DocumentChecker):
         incorrect_sentences = []
         for paragraph in doc:
             # Split the paragraph into sentences based on common sentence-ending punctuation
-            sentences = re.split(r'(?<=[.!?]) +', paragraph)
             for sentence in sentences:
                 if sentence.endswith('..'):
                     incorrect_sentences.append({'sentence': sentence.strip()})
@@ -1516,6 +1535,24 @@ class FAADocumentChecker(DocumentChecker):
             List of tuples containing (sentence, parent_paragraph)
         """
         sentences = []
         for paragraph in doc:
             paragraph = paragraph.strip()
@@ -1527,17 +1564,34 @@ class FAADocumentChecker(DocumentChecker):
                 ):
                     continue
             # Split paragraph into sentences
-            para_sentences = re.split(r'(?<=[.!?])\s+', paragraph)
             # Process each sentence
             for sentence in para_sentences:
                 sentence = sentence.strip()
                 if skip_empty and not sentence:
                     continue
                 sentences.append((sentence, paragraph))
-        return sentences
     @profile_performance
     def check_parentheses(self, doc: List[str]) -> DocumentCheckResult:

     PREDEFINED_ACRONYMS = {
         'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
         'MD', 'MIL', 'MO', 'No.', 'PDF', 'SAE', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
+        'WA', 'XX', 'ZIP', 'ACO'  # Added ACO to ignore in acronym checks
     }
     # Constructor
                 # Check terminology patterns
                 for pattern_config in terminology_patterns:
+                    compiled_pattern = pattern_config.compile()
+                    matches = list(compiled_pattern.finditer(sentence))
                     for match in matches:
                         if pattern_config.replacement:  # Only if there's a replacement term
                             unique_issues.add((match.group(), pattern_config.replacement))
                 # Check prohibited patterns
                 for pattern_config in prohibited_patterns:
+                    compiled_pattern = pattern_config.compile()
+                    match = compiled_pattern.search(sentence)
+                    if match and pattern_config.replacement:  # Only if there's a replacement term
+                        unique_issues.add((match.group(), pattern_config.replacement))
         # Format issues as simple replacement instructions
         formatted_issues = [
         incorrect_sentences = []
+        # Common abbreviations that end with a period but don't end sentences
+        abbreviations = {
+            'U.S.C.', 'U.S.', 'CFR', 'e.g.', 'i.e.', 'etc.', 'vs.', 'Dr.', 'Mr.',
+            'Mrs.', 'Ms.', 'Prof.', 'Ph.D.', 'M.D.', 'B.A.', 'M.A.', 'Ph.D.'
+        }
+        # Create a regex pattern that matches these abbreviations
+        abbr_pattern = '|'.join(re.escape(abbr) for abbr in abbreviations)
         for paragraph in doc:
+            # First, protect abbreviations from being checked
+            protected_paragraph = re.sub(
+                f'({abbr_pattern})',
+                lambda m: m.group(1).replace('.', 'ABBR_DOT'),
+                paragraph
+            )
             # Split the paragraph into sentences based on common sentence-ending punctuation
+            sentences = re.split(r'(?<=[.!?]) +', protected_paragraph)
             for sentence in sentences:
+                # Restore the periods in abbreviations
+                sentence = sentence.replace('ABBR_DOT', '.')
                 if sentence.endswith('..'):
                     incorrect_sentences.append({'sentence': sentence.strip()})
             List of tuples containing (sentence, parent_paragraph)
         """
         sentences = []
+        # Common abbreviations that end with a period but don't end sentences
+        abbreviations = {
+            'U.S.C.', 'U.S.', 'CFR', 'e.g.', 'i.e.', 'etc.', 'vs.', 'Dr.', 'Mr.',
+            'Mrs.', 'Ms.', 'Prof.', 'Ph.D.', 'M.D.', 'B.A.', 'M.A.', 'Ph.D.'
+        }
+        # Legal citation patterns that shouldn't be split
+        legal_citations = [
+            r'\d+ U\.S\.C\. § \d+\([a-zA-Z0-9]*\)(?:\([a-zA-Z0-9]*\))?',  # e.g., 5 U.S.C. § 533(a)(1)
+            r'\d+ CFR § \d+\.\d+',  # e.g., 14 CFR § 1.1
+            r'\d+ CFR part \d+'  # e.g., 14 CFR part 1
+        ]
+        # Create a regex pattern that matches these abbreviations
+        abbr_pattern = '|'.join(re.escape(abbr) for abbr in abbreviations)
+        legal_pattern = '|'.join(legal_citations)
         for paragraph in doc:
             paragraph = paragraph.strip()
                 ):
                     continue
+            # First, protect legal citations from being split
+            protected_paragraph = re.sub(
+                f'({legal_pattern})',
+                lambda m: m.group(1).replace('.', 'LEGAL_DOT'),
+                paragraph
+            )
+            # Then protect abbreviations from being split
+            protected_paragraph = re.sub(
+                f'({abbr_pattern})',
+                lambda m: m.group(1).replace('.', 'ABBR_DOT'),
+                protected_paragraph
+            )
             # Split paragraph into sentences
+            para_sentences = re.split(r'(?<=[.!?])\s+', protected_paragraph)
             # Process each sentence
             for sentence in para_sentences:
+                # Restore the periods in legal citations and abbreviations
+                sentence = sentence.replace('LEGAL_DOT', '.')
+                sentence = sentence.replace('ABBR_DOT', '.')
                 sentence = sentence.strip()
                 if skip_empty and not sentence:
                     continue
                 sentences.append((sentence, paragraph))
+        return sentences
     @profile_performance
     def check_parentheses(self, doc: List[str]) -> DocumentCheckResult: