Spaces:

Hoctar77
/

DocumentCheckerTool

Sleeping

App Files Files Community

Hoctar77 committed on Nov 14, 2024

Commit

d4c5938

verified ·

1 Parent(s): a3c728a

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -137

app.py CHANGED Viewed

@@ -924,10 +924,10 @@ class FAADocumentChecker(DocumentChecker):
             return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
         # Common words that might appear in uppercase but aren't acronyms
-        heading_words = self.config_manager.config.get('heading_words', self.HEADING_WORDS)
         # Standard acronyms that don't need to be defined
-        predefined_acronyms = self.config_manager.config.get('predefined_acronyms', self.PREDEFINED_ACRONYMS)
         # Tracking structures
         defined_acronyms = {}  # Stores definition info
@@ -936,7 +936,7 @@ class FAADocumentChecker(DocumentChecker):
         # Patterns
         defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
-        acronym_pattern = re.compile(r'\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
         for paragraph in doc:
             # Skip lines that appear to be headings (all uppercase with common heading words)
@@ -944,56 +944,46 @@ class FAADocumentChecker(DocumentChecker):
             if all(word.isupper() for word in words) and any(word in heading_words for word in words):
                 continue
-            # Split the paragraph into sentences
-            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
-            definitions_in_paragraph = set()
-            # First pass: Identify definitions
-            for sentence in sentences:
-                defined_matches = defined_pattern.findall(sentence)
-                for full_term, acronym in defined_matches:
-                    if acronym not in predefined_acronyms:
-                        if acronym not in defined_acronyms:
-                            defined_acronyms[acronym] = {
-                                'full_term': full_term.strip(),
-                                'defined_at': sentence.strip(),
-                                'used': False  # Initially not used
-                            }
-                            definitions_in_paragraph.add(sentence)
-                        else:
-                            # Handle duplicate definitions if necessary
-                            pass  # You may add logic for duplicate definitions
-            # Second pass: Check for acronym usage in sentences that are not definitions
-            for sentence in sentences:
-                if sentence in definitions_in_paragraph:
-                    continue  # Skip definition sentences
-                usage_matches = acronym_pattern.finditer(sentence)
-                for match in usage_matches:
-                    acronym = match.group()
-                    # Skip predefined acronyms
-                    if acronym in predefined_acronyms:
-                        continue
-                    # Skip if it's part of a heading or contains non-letter characters
-                    if (acronym in heading_words or
-                        any(not c.isalpha() for c in acronym) or
-                        len(acronym) > 10):  # Usually acronyms aren't this long
-                        continue
-                    if acronym not in defined_acronyms:
-                        # Undefined acronym used
-                        issues.append({
-                            'type': 'undefined_acronym',
-                            'acronym': acronym,
-                            'sentence': sentence.strip()
-                        })
-                    else:
-                        # Mark as used
-                        defined_acronyms[acronym]['used'] = True
-                        used_acronyms.add(acronym)
         # Check for defined but unused acronyms
         unused_acronyms = [
@@ -1015,6 +1005,7 @@ class FAADocumentChecker(DocumentChecker):
         return DocumentCheckResult(success=success, issues=issues)
     @profile_performance
     def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
         """
@@ -1765,7 +1756,7 @@ class DocumentCheckResultsFormatter:
                 'solution': 'Format heading periods according to document type requirements',
                 'example_fix': {
                     'before': 'Purpose',
-                    'after': 'Purpose.'  # For ACs and Orders
                 }
             },
             'table_figure_reference_check': {
@@ -1779,8 +1770,8 @@ class DocumentCheckResultsFormatter:
             },
             'acronym_check': {
                 'title': 'Acronym Definition Issues',
-                'description': 'Ensures every acronym is properly introduced with its full term at first use. The check identifies undefined acronyms, unused acronyms, and duplicate definitions while recognizing common exceptions (like U.S.) that don\'t require definition.',
-                'solution': 'Define each acronym at its first use and use it consistently throughout the document',
                 'example_fix': {
                     'before': 'This order establishes general FAA organizational policies.',
                     'after': 'This order establishes general Federal Aviation Administration (FAA) organizational policies.'
@@ -1841,13 +1832,29 @@ class DocumentCheckResultsFormatter:
                 }
             }
         }
     def _format_colored_text(self, text: str, color: str) -> str:
-        """Helper method to format colored text with reset."""
         return f"{color}{text}{Style.RESET_ALL}"
     def _format_example(self, example_fix: Dict[str, str]) -> List[str]:
-        """Format example fixes consistently."""
         return [
             f"    ❌ Incorrect: {example_fix['before']}",
             f"    ✓ Correct: {example_fix['after']}"
@@ -1870,7 +1877,7 @@ class DocumentCheckResultsFormatter:
                     output.append(f"    • {heading}")
         return output
     def _format_period_issues(self, result: DocumentCheckResult) -> List[str]:
         """Format period check issues consistently."""
         output = []
@@ -1882,28 +1889,26 @@ class DocumentCheckResultsFormatter:
                     output.append(f"    • {issue['message']}")
         return output
     def _format_reference_issues(self, result: DocumentCheckResult) -> List[str]:
-        """Format section symbol check issues consistently with other document checks."""
         output = []
-        # Skip if no issues
-        if not result.issues:
-            return output
-        for issue in result.issues[:3]:
-            if 'sentence' in issue:
-                output.append(f"    • {issue['sentence']}")
-            elif 'matches' in issue:
-                output.append(f"    • Incorrect usage: {', '.join(issue['matches'])}")
-            else:
-                output.append(f"    • {str(issue)}")
-        if len(result.issues) > 3:
-            output.append(f"\n    ... and {len(result.issues) - 3} more similar issues.")
         return output
     def _format_caption_issues(self, result: DocumentCheckResult) -> List[str]:
         """Format caption issues consistently."""
         output = []
@@ -1913,25 +1918,55 @@ class DocumentCheckResultsFormatter:
                 output.append(f"    • {issue.get('incorrect_caption', '')} (correct format: {issue.get('correct_format', '')})")
         return output
     def _format_standard_issue(self, issue: Dict[str, Any]) -> str:
         """Format a standard issue consistently."""
         if isinstance(issue, dict):
-            # Handle unused acronym issues
-            if issue.get('type') == 'unused_acronym':
                 return textwrap.fill(
-                    f"    • Acronym '{issue['acronym']}' defined but not used again after definition.",
                     width=76,
                     subsequent_indent='      '
                 )
-            # Handle undefined acronym issues
-            if issue.get('type') == 'undefined_acronym':
                 return textwrap.fill(
-                    f"    • Undefined acronym '{issue['acronym']}' used in: {issue['sentence']}",
                     width=76,
                     subsequent_indent='      '
                 )
-            # Handle other issues with sentence
             elif 'sentence' in issue:
                 return textwrap.fill(
                     issue['sentence'],
@@ -1939,32 +1974,55 @@ class DocumentCheckResultsFormatter:
                     initial_indent='    • ',
                     subsequent_indent='      '
                 )
-            # Handle issues with description and occurrences
-            elif 'description' in issue and 'occurrences' in issue:
-                occurrences = issue['occurrences']
-                examples = []
-                for occ in occurrences[:3]:
-                    if 'sentence' in occ:
-                        examples.append(occ['sentence'])
-                    elif 'text' in occ:
-                        examples.append(occ['text'])
-                    else:
-                        examples.append(str(occ))
-                occurrences_text = '; '.join(examples)
                 return textwrap.fill(
-                    f"    • {issue['description']} - Examples: {occurrences_text}",
                     width=76,
                     subsequent_indent='      '
                 )
             else:
-                # Generic issue formatting
                 message_parts = []
                 for k, v in issue.items():
                     if k not in ['type', 'error']:
-                        message_parts.append(f"{k}: {v}")
                 return f"    • {'; '.join(message_parts)}"
-        else:
-            return f"    • {str(issue)}"
     def format_results(self, results: Dict[str, Any], doc_type: str) -> str:
         """
@@ -1977,7 +2035,7 @@ class DocumentCheckResultsFormatter:
         Returns:
             str: Formatted report with consistent styling
         """
-        # Determine caption format based on document type
         if doc_type in ["Advisory Circular", "Order"]:
             table_format = {
                 'title': 'Table Caption Format Issues',
@@ -2000,7 +2058,7 @@ class DocumentCheckResultsFormatter:
         else:
             table_format = {
                 'title': 'Table Caption Format Issues',
-                'description': f'Analyzes table captions to ensure they follow the FAA\'s sequential numbering system for {doc_type}s. Tables must be numbered consecutively throughout the document using a single number format. This straightforward numbering system maintains document organization while facilitating clear references to specific tables.',
                 'solution': 'Use the format "Table X" where X is a sequential number',
                 'example_fix': {
                     'before': 'Table 5-1. | Table A-1',
@@ -2009,7 +2067,7 @@ class DocumentCheckResultsFormatter:
             }
             figure_format = {
                 'title': 'Figure Caption Format Issues',
-                'description': f'Analyzes figure captions to ensure they follow the FAA\'s sequential numbering system for {doc_type}s. Figures must be numbered consecutively throughout the document using a single number format. This consistent numbering approach ensures clear figure identification and maintains parallel structure with table numbering.',
                 'solution': 'Use the format "Figure X" where X is a sequential number',
                 'example_fix': {
                     'before': 'Figure 5-1. | Figure A-1.',
@@ -2027,7 +2085,7 @@ class DocumentCheckResultsFormatter:
                 "types": ["Advisory Circular"],
                 "italics": True,
                 "quotes": False,
-                "description": "For Advisory Circulars, referenced document titles should be italicized but not quoted.",
                 "example": "See AC 25.1309-1B, *System Design and Analysis*, for information on X."
             },
             "quotes_only": {
@@ -2038,14 +2096,14 @@ class DocumentCheckResultsFormatter:
                 ],
                 "italics": False,
                 "quotes": True,
-                "description": "For this document type, referenced document titles should be in quotes without italics.",
                 "example": 'See AC 25.1309-1B, "System Design and Analysis," for information on X.'
             },
             "no_formatting": {
                 "types": ["Policy Statement", "Other"],
                 "italics": False,
                 "quotes": False,
-                "description": "For this document type, referenced document titles should not use italics or quotes.",
                 "example": "See AC 25.1309-1B, System Design and Analysis, for information on X."
             }
         }
@@ -2093,17 +2151,8 @@ class DocumentCheckResultsFormatter:
         # Process all check results consistently
         for check_name, result in results.items():
-            if not result.success:
-                # Get category details
-                category = self.issue_categories.get(check_name, {
-                    'title': check_name.replace('_', ' ').title(),
-                    'description': 'No description available for this check.',
-                    'solution': 'Review the issues and correct them accordingly.',
-                    'example_fix': {
-                        'before': '',
-                        'after': ''
-                    }
-                })
                 # Add extra line break before each category
                 output.append("\n")
@@ -2114,10 +2163,9 @@ class DocumentCheckResultsFormatter:
                 output.append(f"  {self._format_colored_text('How to fix: ' + category['solution'], Fore.GREEN)}")
                 # Example Fix
-                if category['example_fix']['before'] and category['example_fix']['after']:
-                    output.append(f"\n  {self._format_colored_text('Example Fix:', Fore.CYAN)}")
-                    output.extend(self._format_example(category['example_fix']))
-                    output.append("")  # Add blank line after example
                 # Actual Issues Found
                 output.append(f"  {self._format_colored_text('Issues found in your document:', Fore.CYAN)}")
@@ -2155,24 +2203,6 @@ class DocumentCheckResultsFormatter:
         output.append(f"\n{Fore.CYAN}{'='*80}{Style.RESET_ALL}\n")
         return '\n'.join(output)
-    def save_report(self, results: Dict[str, Any], filepath: str, doc_type: str) -> None:
-        """Save the formatted results to a file with proper formatting."""
-        try:
-            with open(filepath, 'w', encoding='utf-8') as f:
-                # Create a report without color codes
-                report = self.format_results(results, doc_type)
-                # Strip color codes
-                for color in [Fore.CYAN, Fore.GREEN, Fore.YELLOW, Fore.RED, Style.RESET_ALL]:
-                    report = report.replace(str(color), '')
-                # Convert markdown-style italics to alternative formatting for plain text
-                report = report.replace('*', '_')
-                f.write(report)
-        except Exception as e:
-            print(f"Error saving report: {e}")
     def save_report(self, results: Dict[str, Any], filepath: str, doc_type: str) -> None:
         """Save the formatted results to a file with proper formatting."""

             return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
         # Common words that might appear in uppercase but aren't acronyms
+        heading_words = self.config_manager.config.get('heading_words', HEADING_WORDS)
         # Standard acronyms that don't need to be defined
+        predefined_acronyms = self.config_manager.config.get('predefined_acronyms', PREDEFINED_ACRONYMS)
         # Tracking structures
         defined_acronyms = {}  # Stores definition info
         # Patterns
         defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
+        acronym_pattern = re.compile(r'(?<!\()\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
         for paragraph in doc:
             # Skip lines that appear to be headings (all uppercase with common heading words)
             if all(word.isupper() for word in words) and any(word in heading_words for word in words):
                 continue
+            # Check for acronym definitions first
+            defined_matches = defined_pattern.findall(paragraph)
+            for full_term, acronym in defined_matches:
+                if acronym not in predefined_acronyms:
+                    if acronym not in defined_acronyms:
+                        defined_acronyms[acronym] = {
+                            'full_term': full_term.strip(),
+                            'defined_at': paragraph.strip(),
+                            'used': False  # Initially not used
+                        }
+                    else:
+                        # Handle duplicate definitions if necessary
+                        pass  # You may add logic for duplicate definitions
+            # Check for acronym usage
+            usage_matches = acronym_pattern.finditer(paragraph)
+            for match in usage_matches:
+                acronym = match.group()
+                # Skip predefined acronyms
+                if acronym in predefined_acronyms:
+                    continue
+                # Skip if it's part of a heading or contains non-letter characters
+                if (acronym in heading_words or
+                    any(not c.isalpha() for c in acronym) or
+                    len(acronym) > 10):  # Usually acronyms aren't this long
+                    continue
+                if acronym not in defined_acronyms:
+                    # Undefined acronym used
+                    issues.append({
+                        'type': 'undefined_acronym',
+                        'acronym': acronym,
+                        'sentence': paragraph.strip()
+                    })
+                else:
+                    # Mark as used
+                    defined_acronyms[acronym]['used'] = True
+                    used_acronyms.add(acronym)
         # Check for defined but unused acronyms
         unused_acronyms = [
         return DocumentCheckResult(success=success, issues=issues)
     @profile_performance
     def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
         """
                 'solution': 'Format heading periods according to document type requirements',
                 'example_fix': {
                     'before': 'Purpose',
+                    'after': 'Purpose.' # For ACs and Orders
                 }
             },
             'table_figure_reference_check': {
             },
             'acronym_check': {
                 'title': 'Acronym Definition Issues',
+                'description': 'Ensures every acronym is properly introduced with its full term at first use. The check identifies undefined acronyms while recognizing common exceptions (like U.S.) that don\'t require definition.',
+                'solution': 'Define each acronym at its first use, e.g., "Federal Aviation Administration (FAA)"',
                 'example_fix': {
                     'before': 'This order establishes general FAA organizational policies.',
                     'after': 'This order establishes general Federal Aviation Administration (FAA) organizational policies.'
                 }
             }
         }
+        # Add these two helper methods here, after __init__ and before other methods
     def _format_colored_text(self, text: str, color: str) -> str:
+        """Helper method to format colored text with reset.
+        Args:
+            text: The text to be colored
+            color: The color to apply (from colorama.Fore)
+        Returns:
+            str: The colored text with reset styling
+        """
         return f"{color}{text}{Style.RESET_ALL}"
     def _format_example(self, example_fix: Dict[str, str]) -> List[str]:
+        """Format example fixes consistently.
+        Args:
+            example_fix: Dictionary containing 'before' and 'after' examples
+        Returns:
+            List[str]: Formatted example lines
+        """
         return [
             f"    ❌ Incorrect: {example_fix['before']}",
             f"    ✓ Correct: {example_fix['after']}"
                     output.append(f"    • {heading}")
         return output
     def _format_period_issues(self, result: DocumentCheckResult) -> List[str]:
         """Format period check issues consistently."""
         output = []
                     output.append(f"    • {issue['message']}")
         return output
     def _format_reference_issues(self, result: DocumentCheckResult) -> List[str]:
+        """Format reference issues consistently."""
         output = []
+        for issue in result.issues:
+            if isinstance(issue, dict):
+                reference_text = f"    • {issue['reference']} should be {issue['correct_form']}"
+                output.append(reference_text)
+                if 'sentence' in issue:
+                    context = textwrap.fill(
+                        issue['sentence'],
+                        width=76,
+                        initial_indent='      ',
+                        subsequent_indent='      '
+                    )
+                    output.append(f"{Fore.YELLOW}Context: {context}{Style.RESET_ALL}")
         return output
     def _format_caption_issues(self, result: DocumentCheckResult) -> List[str]:
         """Format caption issues consistently."""
         output = []
                 output.append(f"    • {issue.get('incorrect_caption', '')} (correct format: {issue.get('correct_format', '')})")
         return output
     def _format_standard_issue(self, issue: Dict[str, Any]) -> str:
         """Format a standard issue consistently."""
         if isinstance(issue, dict):
+            # Handle grouped issues per sentence
+            if 'incorrect_terms' in issue and 'sentence' in issue:
+                # Build the replacements text
+                replacements = '; '.join(
+                    f"'{inc}' with '{corr}'" if corr else f"Remove '{inc}'"
+                    for inc, corr in sorted(issue['incorrect_terms'])
+                )
+                # Start building the output lines
+                lines = []
+                lines.append(f"    • In: {issue['sentence']}")
+                lines.append(f"      Replace {replacements}")
+                # Format each line individually
+                formatted_lines = [
+                    textwrap.fill(line, width=76, subsequent_indent='      ')
+                    for line in lines
+                ]
+                return '\n'.join(formatted_lines)
+            # Handle issues with occurrences list
+            if 'occurrences' in issue:
+                # Format the first 3 occurrences
+                examples = issue['occurrences'][:3]
+                formatted_examples = []
+                for example in examples:
+                    if 'sentence' in example:
+                        formatted_examples.append(example['sentence'])
+                    elif isinstance(example, str):
+                        formatted_examples.append(example)
+                description = issue.get('description', '')
                 return textwrap.fill(
+                    f"    • {description} - Examples: {'; '.join(formatted_examples)}",
                     width=76,
                     subsequent_indent='      '
                 )
+            # Handle unused acronym issues
+            if issue.get('type') == 'unused_acronym':
                 return textwrap.fill(
+                    f"    • Acronym '{issue['acronym']}' defined but not used again after definition.",
                     width=76,
                     subsequent_indent='      '
                 )
+            # Handle issues with direct sentence reference
             elif 'sentence' in issue:
                 return textwrap.fill(
                     issue['sentence'],
                     initial_indent='    • ',
                     subsequent_indent='      '
                 )
+            # Handle issues with specific error messages
+            elif 'error' in issue:
+                return f"    • Error: {issue['error']}"
+            # Handle issues with description and matches
+            elif all(k in issue for k in ['issue_type', 'description', 'matches']):
+                matches_str = '; '.join(str(m) for m in issue['matches'][:3])
                 return textwrap.fill(
+                    f"    • {issue['description']} - Found: {matches_str}",
                     width=76,
                     subsequent_indent='      '
                 )
+            # Handle terminology issues
+            if all(k in issue for k in ['incorrect_term', 'correct_term', 'sentence']):
+                return textwrap.fill(
+                    f"    • Replace '{issue['incorrect_term']}' with '{issue['correct_term']}' in: "
+                    f"{issue['sentence']}",
+                    width=76,
+                    subsequent_indent='      '
+                )
+            # Handle placeholder issues
+            elif 'placeholder' in issue:
+                return textwrap.fill(
+                    f"    • Found placeholder '{issue['placeholder']}' in: {issue.get('sentence', '')}",
+                    width=76,
+                    subsequent_indent='      '
+                )
+            # Handle other dictionary formats
             else:
                 message_parts = []
                 for k, v in issue.items():
                     if k not in ['type', 'error']:
+                        if isinstance(v, list):
+                            if all(isinstance(item, dict) for item in v):
+                                # Handle list of dictionaries
+                                v_str = '; '.join(str(item.get('sentence', str(item))) for item in v[:3])
+                            else:
+                                # Handle list of strings
+                                v_str = ', '.join(str(item) for item in v[:3])
+                            message_parts.append(f"{k}: {v_str}")
+                        else:
+                            message_parts.append(f"{k}: {v}")
                 return f"    • {'; '.join(message_parts)}"
+        return f"    • {str(issue)}"
     def format_results(self, results: Dict[str, Any], doc_type: str) -> str:
         """
         Returns:
             str: Formatted report with consistent styling
         """
+         # Determine caption format based on document type
         if doc_type in ["Advisory Circular", "Order"]:
             table_format = {
                 'title': 'Table Caption Format Issues',
         else:
             table_format = {
                 'title': 'Table Caption Format Issues',
+                'description': f'Analyzes table captions to ensure they follow the FAA\'s sequential numbering system for {doc_type}s. Tables must be numbered consecutively throughout the document using a single-number format. This straightforward numbering system maintains document organization while facilitating clear references to specific tables.',
                 'solution': 'Use the format "Table X" where X is a sequential number',
                 'example_fix': {
                     'before': 'Table 5-1. | Table A-1',
             }
             figure_format = {
                 'title': 'Figure Caption Format Issues',
+                'description': f'Analyzes figure captions to ensure they follow the FAA\'s sequential numbering system for {doc_type}s. Figures must be numbered consecutively throughout the document using a single-number format. This consistent numbering approach ensures clear figure identification and maintains parallel structure with table numbering.',
                 'solution': 'Use the format "Figure X" where X is a sequential number',
                 'example_fix': {
                     'before': 'Figure 5-1. | Figure A-1.',
                 "types": ["Advisory Circular"],
                 "italics": True,
                 "quotes": False,
+                "description": "For Advisory Circulars, referenced document titles should be italicized but not quoted",
                 "example": "See AC 25.1309-1B, *System Design and Analysis*, for information on X."
             },
             "quotes_only": {
                 ],
                 "italics": False,
                 "quotes": True,
+                "description": "For this document type, referenced document titles should be in quotes without italics",
                 "example": 'See AC 25.1309-1B, "System Design and Analysis," for information on X.'
             },
             "no_formatting": {
                 "types": ["Policy Statement", "Other"],
                 "italics": False,
                 "quotes": False,
+                "description": "For this document type, referenced document titles should not use italics or quotes",
                 "example": "See AC 25.1309-1B, System Design and Analysis, for information on X."
             }
         }
         # Process all check results consistently
         for check_name, result in results.items():
+            if not result.success and check_name in self.issue_categories:
+                category = self.issue_categories[check_name]
                 # Add extra line break before each category
                 output.append("\n")
                 output.append(f"  {self._format_colored_text('How to fix: ' + category['solution'], Fore.GREEN)}")
                 # Example Fix
+                output.append(f"\n  {self._format_colored_text('Example Fix:', Fore.CYAN)}")
+                output.extend(self._format_example(category['example_fix']))
+                output.append("")  # Add blank line after example
                 # Actual Issues Found
                 output.append(f"  {self._format_colored_text('Issues found in your document:', Fore.CYAN)}")
         output.append(f"\n{Fore.CYAN}{'='*80}{Style.RESET_ALL}\n")
         return '\n'.join(output)
     def save_report(self, results: Dict[str, Any], filepath: str, doc_type: str) -> None:
         """Save the formatted results to a file with proper formatting."""