Spaces:

Hoctar77
/

DocumentCheckerTool

Sleeping

App Files Files Community

Hoctar77 committed on Nov 22, 2024

Commit

9d1e68c

verified ·

1 Parent(s): c7c4f68

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -33

app.py CHANGED Viewed

@@ -354,6 +354,16 @@ class DocumentCheckerConfig:
                     description="Ignore 'title 14, Code of Federal Regulations (14 CFR)'",
                     is_error=False
                 ),
                 PatternConfig(
                     pattern=r'\bAD Compliance Team \(AD CRT\)\b',
                     description="Ignore 'AD Compliance Team (AD CRT)'",
@@ -658,7 +668,7 @@ class FAADocumentChecker(DocumentChecker):
     PREDEFINED_ACRONYMS = {
         'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
         'MD', 'MIL', 'MO', 'No.', 'PDF', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
-        'WA', 'ZIP'
     }
     # Constructor
@@ -1078,48 +1088,47 @@ class FAADocumentChecker(DocumentChecker):
         if not self.validate_input(doc):
             return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
-        section_patterns = self.config_manager.pattern_registry.get('section_symbol', [])
         issues = []
         for paragraph in doc:
             sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip())
             for sentence in sentences:
                 sentence = sentence.strip()
-                for pattern_config in section_patterns:
-                    compiled_pattern = re.compile(pattern_config.pattern)
-                    if pattern_config.pattern == r'^§':  # Start of sentence with § symbol
-                        if compiled_pattern.match(sentence):
-                            section_ref = sentence.split()[0]  # Get the first word (§XX.XX)
-                            issues.append({
-                                'incorrect': section_ref,
-                                'correct': f"Section {section_ref.lstrip('§')}",
-                                'is_sentence_start': True  # Flag to indicate sentence start issue
-                            })
-                    elif pattern_config.pattern == r'\b14 CFR §\s*\d+\.\d+\b':  # 14 CFR § format
-                        matches = compiled_pattern.finditer(sentence)
-                        for match in matches:
-                            incorrect = match.group()
-                            # Remove § symbol without adding 'Section'
-                            correct = incorrect.replace('§ ', '')
-                            issues.append({
-                                'incorrect': incorrect,
-                                'correct': correct
-                            })
-                    elif '§' in sentence:
-                        matches = compiled_pattern.finditer(sentence)
-                        for match in matches:
-                            incorrect = match.group()
-                            correct = incorrect.replace('§', 'Section')
-                            issues.append({
-                                'incorrect': incorrect,
-                                'correct': correct
-                            })
-        return DocumentCheckResult(success=not issues, issues=issues)
     @profile_performance
     def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult:

                     description="Ignore 'title 14, Code of Federal Regulations (14 CFR)'",
                     is_error=False
                 ),
+                PatternConfig(
+                    pattern=r'\btitle 49 of the United States Code \(49 U.S.C.\)\b',
+                    description="Ignore 'title 49 of the United States Code (49 U.S.C.)'",
+                    is_error=False
+                ),
+                PatternConfig(
+                    pattern=r'\btitle 49, United States Code \(49 U.S.C.\)\b',
+                    description="Ignore 'title 49, United States Code (49 U.S.C.)'",
+                    is_error=False
+                ),
                 PatternConfig(
                     pattern=r'\bAD Compliance Team \(AD CRT\)\b',
                     description="Ignore 'AD Compliance Team (AD CRT)'",
     PREDEFINED_ACRONYMS = {
         'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
         'MD', 'MIL', 'MO', 'No.', 'PDF', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
+        'WA', 'XX', 'ZIP'
     }
     # Constructor
         if not self.validate_input(doc):
             return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
         issues = []
         for paragraph in doc:
             sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip())
             for sentence in sentences:
                 sentence = sentence.strip()
+                # Check 14 CFR citations only
+                cfr_matches = re.finditer(r'\b14 CFR §\s*(\d+\.\d+)\b', sentence)
+                for match in cfr_matches:
+                    # Skip if this is part of a U.S.C. citation
+                    if not re.search(r'U\.S\.C\.\s*§', sentence):
+                        full_match = match.group(0)
+                        section_num = match.group(1)
+                        issues.append({
+                            'incorrect': full_match,
+                            'correct': f'14 CFR {section_num}',
+                            'description': f"Replace '{full_match}' with '14 CFR {section_num}'"
+                        })
+                # Skip any checks for sections that are part of U.S.C. citations
+                if re.search(r'U\.S\.C\.\s*(?:§|§§)', sentence):
+                    continue
+                # Skip any checks for sections that are part of 14 CFR citations
+                if re.search(r'14 CFR\s*§', sentence):
+                    continue
+                # Check section symbol at start of sentence
+                if sentence.startswith('§'):
+                    match = re.match(r'^§\s*(\d+(?:\.\d+)?)', sentence)
+                    if match:
+                        section_num = match.group(1)
+                        issues.append({
+                            'incorrect': f'§ {section_num}',
+                            'correct': f'Section {section_num}',
+                            'description': f"Replace '§ {section_num}' with 'Section {section_num}'"
+                        })
+        return DocumentCheckResult(success=len(issues) == 0, issues=issues)
     @profile_performance
     def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult: