Spaces:

Hoctar77
/

DocumentCheckerTool

Sleeping

App Files Community

Hoctar77 committed on Nov 14, 2024

Commit

c996527

verified ·

1 Parent(s): ff7a3c4

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -66

app.py CHANGED Viewed

@@ -722,16 +722,6 @@ class FAADocumentChecker(DocumentChecker):
     # Core Check Methods
     @profile_performance
     def heading_title_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
-        """
-        Check headings for a specific document type.
-        Args:
-            doc (List[str]): List of document paragraphs
-            doc_type (str): Type of document being checked
-        Returns:
-            DocumentCheckResult: Result of heading check including found and missing headings
-        """
         if not self.validate_input(doc):
             self.logger.error("Invalid document input for heading check")
             return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
@@ -758,13 +748,29 @@ class FAADocumentChecker(DocumentChecker):
         headings_found = []
         required_headings_set = set(required_headings)
-        # Extract and normalize headings from document
         for para in doc:
             para_strip = para.strip()
-            # Handle both exact matches and variations with trailing periods
-            para_base = para_strip.rstrip('.')
-            if para_base in required_headings_set or para_strip in required_headings_set:
-                headings_found.append(para_strip)
         # Check if all required headings are found
         found_headings_set = set(headings_found)
@@ -796,6 +802,7 @@ class FAADocumentChecker(DocumentChecker):
         return DocumentCheckResult(success=success, issues=issues, details=details)
     @profile_performance
     def heading_title_period_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
         """
@@ -907,39 +914,22 @@ class FAADocumentChecker(DocumentChecker):
     @profile_performance
     def acronym_check(self, doc: List[str]) -> DocumentCheckResult:
-        """
-        Check if acronyms are defined at their first use, ignoring uppercase headings
-        and common exceptions.
-        """
         if not self.validate_input(doc):
             return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
-        defined_acronyms = set()
-        first_occurrences = {}  # Track first occurrence of each acronym
-        undefined_acronyms = []
         # Common words that might appear in uppercase but aren't acronyms
-        heading_words = {
-            'INFORMATION', 'GENERAL', 'SUMMARY', 'INTRODUCTION', 'BACKGROUND',
-            'DISCUSSION', 'CONCLUSION', 'APPENDIX', 'CHAPTER', 'SECTION',
-            'PURPOSE', 'APPLICABILITY', 'CANCELLATION', 'DEFINITION', 'REQUIREMENTS',
-            'AUTHORITY', 'POLICY', 'SCOPE', 'RELATED', 'MATERIAL', 'DISTRIBUTION',
-            'EXPLANATION', 'PROCEDURES', 'NOTE', 'WARNING', 'CAUTION', 'EXCEPTION',
-            'GROUPS', 'PARTS', 'TABLE', 'FIGURE', 'REFERENCES', 'DEFINITIONS'
-        }
         # Standard acronyms that don't need to be defined
-        predefined_acronyms = {
-            'CFR', 'U.S.', 'USA', 'US', 'U.S.C', 'e.g.', 'i.e.', 'FAQ', 'No.', 'ZIP', 'PDF', 'SSN',
-            'DC', 'MA', 'WA', 'TX', 'MO'
-        }
-        defined_acronyms.update(predefined_acronyms)
-        # Pattern for finding defined acronyms like "Federal Aviation Administration (FAA)"
         defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
-        # Modified acronym pattern to exclude common heading patterns
         acronym_pattern = re.compile(r'\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
         for paragraph in doc:
@@ -948,48 +938,68 @@ class FAADocumentChecker(DocumentChecker):
             if all(word.isupper() for word in words) and any(word in heading_words for word in words):
                 continue
-            # Check for definitions first
             defined_matches = defined_pattern.findall(paragraph)
             for full_term, acronym in defined_matches:
-                defined_acronyms.add(acronym)
-                # If this was previously marked as undefined, remove it
-                if acronym in first_occurrences:
-                    del first_occurrences[acronym]
             # Check for acronym usage
             usage_matches = acronym_pattern.finditer(paragraph)
             for match in usage_matches:
                 acronym = match.group()
                 # Skip if it's part of a heading or contains non-letter characters
-                if (acronym in heading_words or
                     any(not c.isalpha() for c in acronym) or
                     len(acronym) > 10):  # Usually acronyms aren't this long
                     continue
                 if acronym not in defined_acronyms:
-                    # Only process if we haven't seen this acronym before
-                    if acronym not in first_occurrences:
-                        # Find the sentence containing the first undefined acronym
-                        sentences = re.split(r'(?<=[.!?])\s+', paragraph)
-                        for sentence in sentences:
-                            if acronym in sentence:
-                                # Additional check to avoid marking uppercase headings
-                                if not (sentence.isupper() and any(word in heading_words for word in sentence.split())):
-                                    first_occurrences[acronym] = {
-                                        'acronym': acronym,
-                                        'sentence': sentence.strip()
-                                    }
-                                break
-        # Convert first occurrences to list of issues
-        undefined_acronyms = list(first_occurrences.values())
-        success = len(undefined_acronyms) == 0
-        issues = undefined_acronyms if not success else []
         return DocumentCheckResult(success=success, issues=issues)
     @profile_performance
     def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
         """
@@ -1942,6 +1952,14 @@ class DocumentCheckResultsFormatter:
                     subsequent_indent='      '
                 )
             # Handle issues with direct sentence reference
             elif 'sentence' in issue:
                 return textwrap.fill(
@@ -2405,7 +2423,7 @@ def create_interface():
                     """
             # Extract issues
-            issues_match = re.findall(r'•\s*([^•\n]+)', content)
             issues_html_section = ""
             if issues_match:
                 issues_html_section = """

     # Core Check Methods
     @profile_performance
     def heading_title_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
         if not self.validate_input(doc):
             self.logger.error("Invalid document input for heading check")
             return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
         headings_found = []
         required_headings_set = set(required_headings)
+        # Precompile a regex pattern to match headings at the start of the paragraph
+        # Escape special characters in headings and allow for optional spaces and periods
+        heading_patterns = []
+        for heading in required_headings:
+            escaped_heading = re.escape(heading.rstrip('.'))
+            pattern = rf'^\s*{escaped_heading}\.?\s*'
+            heading_patterns.append(pattern)
+        combined_pattern = re.compile('|'.join(heading_patterns), re.IGNORECASE)
         for para in doc:
             para_strip = para.strip()
+            # Check if paragraph starts with any of the required headings
+            match = combined_pattern.match(para_strip)
+            if match:
+                # Extract the matched heading
+                matched_heading = match.group().strip()
+                # Normalize the matched heading to compare with required headings
+                matched_heading_base = matched_heading.rstrip('.').strip()
+                # Find the exact heading from required headings (case-insensitive)
+                for required_heading in required_headings:
+                    if matched_heading_base.lower() == required_heading.rstrip('.').lower():
+                        headings_found.append(required_heading)
+                        break
         # Check if all required headings are found
         found_headings_set = set(headings_found)
         return DocumentCheckResult(success=success, issues=issues, details=details)
     @profile_performance
     def heading_title_period_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
         """
     @profile_performance
     def acronym_check(self, doc: List[str]) -> DocumentCheckResult:
         if not self.validate_input(doc):
             return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
         # Common words that might appear in uppercase but aren't acronyms
+        heading_words = self.config_manager.config.get('heading_words', HEADING_WORDS)
         # Standard acronyms that don't need to be defined
+        predefined_acronyms = self.config_manager.config.get('predefined_acronyms', PREDEFINED_ACRONYMS)
+        # Tracking structures
+        defined_acronyms = {}  # Stores definition info
+        used_acronyms = set()  # Stores acronyms used after definition
+        issues = []
+        # Patterns
         defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
         acronym_pattern = re.compile(r'\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
         for paragraph in doc:
             if all(word.isupper() for word in words) and any(word in heading_words for word in words):
                 continue
+            # Check for acronym definitions first
             defined_matches = defined_pattern.findall(paragraph)
             for full_term, acronym in defined_matches:
+                if acronym not in predefined_acronyms:
+                    if acronym not in defined_acronyms:
+                        defined_acronyms[acronym] = {
+                            'full_term': full_term.strip(),
+                            'defined_at': paragraph.strip(),
+                            'used': False  # Initially not used
+                        }
+                    else:
+                        # Handle duplicate definitions if necessary
+                        pass  # You may add logic for duplicate definitions
             # Check for acronym usage
             usage_matches = acronym_pattern.finditer(paragraph)
             for match in usage_matches:
                 acronym = match.group()
+                # Skip predefined acronyms
+                if acronym in predefined_acronyms:
+                    continue
                 # Skip if it's part of a heading or contains non-letter characters
+                if (acronym in heading_words or
                     any(not c.isalpha() for c in acronym) or
                     len(acronym) > 10):  # Usually acronyms aren't this long
                     continue
                 if acronym not in defined_acronyms:
+                    # Undefined acronym used
+                    issues.append({
+                        'type': 'undefined_acronym',
+                        'acronym': acronym,
+                        'sentence': paragraph.strip()
+                    })
+                else:
+                    # Mark as used
+                    defined_acronyms[acronym]['used'] = True
+                    used_acronyms.add(acronym)
+        # Check for defined but unused acronyms
+        unused_acronyms = [
+            {
+                'type': 'unused_acronym',
+                'acronym': acronym,
+                'full_term': data['full_term'],
+                'defined_at': data['defined_at']
+            }
+            for acronym, data in defined_acronyms.items()
+            if not data['used']
+        ]
+        # Combine issues
+        if unused_acronyms:
+            issues.extend(unused_acronyms)
+        success = len(issues) == 0
         return DocumentCheckResult(success=success, issues=issues)
     @profile_performance
     def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
         """
                     subsequent_indent='      '
                 )
+            # Handle unused acronym issues
+            if issue.get('type') == 'unused_acronym':
+                return textwrap.fill(
+                    f"    • Acronym '{issue['acronym']}' defined but not used again after definition.",
+                    width=76,
+                    subsequent_indent='      '
+                )
             # Handle issues with direct sentence reference
             elif 'sentence' in issue:
                 return textwrap.fill(
                     """
             # Extract issues
+            issues_match = re.findall(r'•\s*(.+?)(?=\n•|\Z)', content, re.DOTALL)
             issues_html_section = ""
             if issues_match:
                 issues_html_section = """