Spaces:

Hoctar77
/

DocumentCheckerTool

Sleeping

App Files Community

Hoctar77 committed on Nov 13, 2024

Commit

ff7a3c4

verified ·

1 Parent(s): 7a044f4

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -45

app.py CHANGED Viewed

@@ -1004,27 +1004,20 @@ class FAADocumentChecker(DocumentChecker):
         if not self.validate_input(doc):
             return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
-        # Get patterns from the pattern registry
         terminology_patterns = self.config_manager.pattern_registry.get('terminology', [])
         prohibited_patterns = self.config_manager.pattern_registry.get('reference_terms', [])
-        # Use a dictionary to track unique issues by sentence
-        # Key: sentence, Value: list of issues in that sentence
         sentence_issues = {}
-        # Check each paragraph for terminology issues
         for paragraph in doc:
             sentences = re.split(r'(?<=[.!?])\s+', paragraph)
             for sentence in sentences:
                 sentence = sentence.strip()
-                # Skip empty sentences
                 if not sentence:
                     continue
                 current_sentence_issues = []
-                # Check for incorrect terms that need replacement
                 for pattern_config in terminology_patterns:
                     matches = list(re.finditer(pattern_config.pattern, sentence))
                     for match in matches:
@@ -1035,7 +1028,6 @@ class FAADocumentChecker(DocumentChecker):
                             'sentence': sentence
                         })
-                # Check for prohibited phrases and constructions
                 for pattern_config in prohibited_patterns:
                     if re.search(pattern_config.pattern, sentence, re.IGNORECASE):
                         current_sentence_issues.append({
@@ -1043,32 +1035,26 @@ class FAADocumentChecker(DocumentChecker):
                             'sentence': sentence
                         })
-                # Only add if we found issues in this sentence
                 if current_sentence_issues:
-                    # Use sentence as key to prevent duplicates
                     if sentence not in sentence_issues:
                         sentence_issues[sentence] = current_sentence_issues
                     else:
                         sentence_issues[sentence].extend(current_sentence_issues)
-        # Build the issues per sentence
         unique_issues = []
         for sentence, sentence_issue_list in sentence_issues.items():
-            incorrect_terms = set()
-            descriptions = set()
             for issue in sentence_issue_list:
-                if 'incorrect_term' in issue:
-                    incorrect_terms.add((issue['incorrect_term'], issue.get('correct_term')))
-                if 'description' in issue:
-                    descriptions.add(issue['description'])
-            unique_issues.append({
-                'sentence': sentence,
-                'incorrect_terms': list(incorrect_terms),
-                'descriptions': list(descriptions),
-            })
-        return DocumentCheckResult(success=not unique_issues, issues=unique_issues)
     @profile_performance
     def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
@@ -1076,36 +1062,33 @@ class FAADocumentChecker(DocumentChecker):
         if not self.validate_input(doc):
             return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
-        # Patterns for specific issues
         section_patterns = self.config_manager.pattern_registry.get('section_symbol', [])
-        # Capture problematic sentences or phrases
-        sentences_starting_with_section_symbol = []
-        incorrect_14_CFR_section_symbol_usage = []
-        # Define patterns and check the document for issues
         for paragraph in doc:
             sentences = re.split(r'(?<=[.!?])\s+', paragraph)
-            for pattern_config in section_patterns:
-                compiled_pattern = re.compile(pattern_config.pattern)
-                if pattern_config.pattern == r'^§':
-                    for sentence in sentences:
-                        if compiled_pattern.match(sentence.strip()):
-                            sentences_starting_with_section_symbol.append(sentence.strip())
-                elif pattern_config.pattern == r'\b14 CFR §\s*\d+\.\d+\b':
-                    matches = compiled_pattern.findall(paragraph)
-                    incorrect_14_CFR_section_symbol_usage.extend(matches)
-        # Minimal output structure with only the sentences and matches needing correction
-        issues = []
-        if sentences_starting_with_section_symbol:
-            issues.extend(sentences_starting_with_section_symbol)
-        if incorrect_14_CFR_section_symbol_usage:
-            issues.extend(incorrect_14_CFR_section_symbol_usage)
-        # Return only the list of issues
         return DocumentCheckResult(success=not issues, issues=issues)
     @profile_performance

         if not self.validate_input(doc):
             return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
         terminology_patterns = self.config_manager.pattern_registry.get('terminology', [])
         prohibited_patterns = self.config_manager.pattern_registry.get('reference_terms', [])
         sentence_issues = {}
         for paragraph in doc:
             sentences = re.split(r'(?<=[.!?])\s+', paragraph)
             for sentence in sentences:
                 sentence = sentence.strip()
                 if not sentence:
                     continue
                 current_sentence_issues = []
                 for pattern_config in terminology_patterns:
                     matches = list(re.finditer(pattern_config.pattern, sentence))
                     for match in matches:
                             'sentence': sentence
                         })
                 for pattern_config in prohibited_patterns:
                     if re.search(pattern_config.pattern, sentence, re.IGNORECASE):
                         current_sentence_issues.append({
                             'sentence': sentence
                         })
                 if current_sentence_issues:
                     if sentence not in sentence_issues:
                         sentence_issues[sentence] = current_sentence_issues
                     else:
                         sentence_issues[sentence].extend(current_sentence_issues)
         unique_issues = []
         for sentence, sentence_issue_list in sentence_issues.items():
+            replacements = []
             for issue in sentence_issue_list:
+                if 'incorrect_term' in issue and issue.get('correct_term'):
+                    replacements.append(f"'{issue['incorrect_term']}' with '{issue['correct_term']}'")
+            replacement_text = "; ".join(replacements)
+            formatted_issue = {
+                'sentence': f"{sentence} ({'Replace ' + replacement_text})" if replacements else sentence
+            }
+            unique_issues.append(formatted_issue)
+        return DocumentCheckResult(success=not unique_issues, issues=unique_issues)
     @profile_performance
     def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
         if not self.validate_input(doc):
             return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
         section_patterns = self.config_manager.pattern_registry.get('section_symbol', [])
+        issues = []
         for paragraph in doc:
             sentences = re.split(r'(?<=[.!?])\s+', paragraph)
+            for sentence in sentences:
+                sentence = sentence.strip()
+                for pattern_config in section_patterns:
+                    compiled_pattern = re.compile(pattern_config.pattern)
+                    if pattern_config.pattern == r'^§':  # Start of sentence with § symbol
+                        if compiled_pattern.match(sentence):
+                            corrected_sentence = sentence.replace('§', 'Section', 1)
+                            issues.append({
+                                'sentence': f"{sentence} (Replace § with 'Section')"
+                            })
+                    elif pattern_config.pattern == r'\b14 CFR §\s*\d+\.\d+\b':  # 14 CFR § format
+                        matches = compiled_pattern.findall(sentence)
+                        for match in matches:
+                            corrected_sentence = sentence.replace('§', '', 1)
+                            issues.append({
+                                'sentence': f"{sentence} (Remove §)"
+                            })
         return DocumentCheckResult(success=not issues, issues=issues)
     @profile_performance