Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1004,27 +1004,20 @@ class FAADocumentChecker(DocumentChecker):
|
|
1004 |
if not self.validate_input(doc):
|
1005 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
1006 |
|
1007 |
-
# Get patterns from the pattern registry
|
1008 |
terminology_patterns = self.config_manager.pattern_registry.get('terminology', [])
|
1009 |
prohibited_patterns = self.config_manager.pattern_registry.get('reference_terms', [])
|
1010 |
|
1011 |
-
# Use a dictionary to track unique issues by sentence
|
1012 |
-
# Key: sentence, Value: list of issues in that sentence
|
1013 |
sentence_issues = {}
|
1014 |
|
1015 |
-
# Check each paragraph for terminology issues
|
1016 |
for paragraph in doc:
|
1017 |
sentences = re.split(r'(?<=[.!?])\s+', paragraph)
|
1018 |
for sentence in sentences:
|
1019 |
sentence = sentence.strip()
|
1020 |
-
|
1021 |
-
# Skip empty sentences
|
1022 |
if not sentence:
|
1023 |
continue
|
1024 |
|
1025 |
current_sentence_issues = []
|
1026 |
|
1027 |
-
# Check for incorrect terms that need replacement
|
1028 |
for pattern_config in terminology_patterns:
|
1029 |
matches = list(re.finditer(pattern_config.pattern, sentence))
|
1030 |
for match in matches:
|
@@ -1035,7 +1028,6 @@ class FAADocumentChecker(DocumentChecker):
|
|
1035 |
'sentence': sentence
|
1036 |
})
|
1037 |
|
1038 |
-
# Check for prohibited phrases and constructions
|
1039 |
for pattern_config in prohibited_patterns:
|
1040 |
if re.search(pattern_config.pattern, sentence, re.IGNORECASE):
|
1041 |
current_sentence_issues.append({
|
@@ -1043,32 +1035,26 @@ class FAADocumentChecker(DocumentChecker):
|
|
1043 |
'sentence': sentence
|
1044 |
})
|
1045 |
|
1046 |
-
# Only add if we found issues in this sentence
|
1047 |
if current_sentence_issues:
|
1048 |
-
# Use sentence as key to prevent duplicates
|
1049 |
if sentence not in sentence_issues:
|
1050 |
sentence_issues[sentence] = current_sentence_issues
|
1051 |
else:
|
1052 |
sentence_issues[sentence].extend(current_sentence_issues)
|
1053 |
|
1054 |
-
# Build the issues per sentence
|
1055 |
unique_issues = []
|
1056 |
for sentence, sentence_issue_list in sentence_issues.items():
|
1057 |
-
|
1058 |
-
descriptions = set()
|
1059 |
for issue in sentence_issue_list:
|
1060 |
-
if 'incorrect_term' in issue:
|
1061 |
-
|
1062 |
-
if 'description' in issue:
|
1063 |
-
descriptions.add(issue['description'])
|
1064 |
-
unique_issues.append({
|
1065 |
-
'sentence': sentence,
|
1066 |
-
'incorrect_terms': list(incorrect_terms),
|
1067 |
-
'descriptions': list(descriptions),
|
1068 |
-
})
|
1069 |
|
1070 |
-
|
|
|
|
|
|
|
|
|
1071 |
|
|
|
1072 |
|
1073 |
@profile_performance
|
1074 |
def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
|
@@ -1076,36 +1062,33 @@ class FAADocumentChecker(DocumentChecker):
|
|
1076 |
if not self.validate_input(doc):
|
1077 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
1078 |
|
1079 |
-
# Patterns for specific issues
|
1080 |
section_patterns = self.config_manager.pattern_registry.get('section_symbol', [])
|
1081 |
|
1082 |
-
|
1083 |
-
sentences_starting_with_section_symbol = []
|
1084 |
-
incorrect_14_CFR_section_symbol_usage = []
|
1085 |
|
1086 |
-
# Define patterns and check the document for issues
|
1087 |
for paragraph in doc:
|
1088 |
sentences = re.split(r'(?<=[.!?])\s+', paragraph)
|
1089 |
|
1090 |
-
for
|
1091 |
-
|
1092 |
-
|
1093 |
-
|
1094 |
-
if compiled_pattern.match(sentence.strip()):
|
1095 |
-
sentences_starting_with_section_symbol.append(sentence.strip())
|
1096 |
-
elif pattern_config.pattern == r'\b14 CFR §\s*\d+\.\d+\b':
|
1097 |
-
matches = compiled_pattern.findall(paragraph)
|
1098 |
-
incorrect_14_CFR_section_symbol_usage.extend(matches)
|
1099 |
-
|
1100 |
-
# Minimal output structure with only the sentences and matches needing correction
|
1101 |
-
issues = []
|
1102 |
-
if sentences_starting_with_section_symbol:
|
1103 |
-
issues.extend(sentences_starting_with_section_symbol)
|
1104 |
|
1105 |
-
|
1106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1107 |
|
1108 |
-
# Return only the list of issues
|
1109 |
return DocumentCheckResult(success=not issues, issues=issues)
|
1110 |
|
1111 |
@profile_performance
|
|
|
1004 |
if not self.validate_input(doc):
|
1005 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
1006 |
|
|
|
1007 |
terminology_patterns = self.config_manager.pattern_registry.get('terminology', [])
|
1008 |
prohibited_patterns = self.config_manager.pattern_registry.get('reference_terms', [])
|
1009 |
|
|
|
|
|
1010 |
sentence_issues = {}
|
1011 |
|
|
|
1012 |
for paragraph in doc:
|
1013 |
sentences = re.split(r'(?<=[.!?])\s+', paragraph)
|
1014 |
for sentence in sentences:
|
1015 |
sentence = sentence.strip()
|
|
|
|
|
1016 |
if not sentence:
|
1017 |
continue
|
1018 |
|
1019 |
current_sentence_issues = []
|
1020 |
|
|
|
1021 |
for pattern_config in terminology_patterns:
|
1022 |
matches = list(re.finditer(pattern_config.pattern, sentence))
|
1023 |
for match in matches:
|
|
|
1028 |
'sentence': sentence
|
1029 |
})
|
1030 |
|
|
|
1031 |
for pattern_config in prohibited_patterns:
|
1032 |
if re.search(pattern_config.pattern, sentence, re.IGNORECASE):
|
1033 |
current_sentence_issues.append({
|
|
|
1035 |
'sentence': sentence
|
1036 |
})
|
1037 |
|
|
|
1038 |
if current_sentence_issues:
|
|
|
1039 |
if sentence not in sentence_issues:
|
1040 |
sentence_issues[sentence] = current_sentence_issues
|
1041 |
else:
|
1042 |
sentence_issues[sentence].extend(current_sentence_issues)
|
1043 |
|
|
|
1044 |
unique_issues = []
|
1045 |
for sentence, sentence_issue_list in sentence_issues.items():
|
1046 |
+
replacements = []
|
|
|
1047 |
for issue in sentence_issue_list:
|
1048 |
+
if 'incorrect_term' in issue and issue.get('correct_term'):
|
1049 |
+
replacements.append(f"'{issue['incorrect_term']}' with '{issue['correct_term']}'")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1050 |
|
1051 |
+
replacement_text = "; ".join(replacements)
|
1052 |
+
formatted_issue = {
|
1053 |
+
'sentence': f"{sentence} ({'Replace ' + replacement_text})" if replacements else sentence
|
1054 |
+
}
|
1055 |
+
unique_issues.append(formatted_issue)
|
1056 |
|
1057 |
+
return DocumentCheckResult(success=not unique_issues, issues=unique_issues)
|
1058 |
|
1059 |
@profile_performance
|
1060 |
def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
|
|
|
1062 |
if not self.validate_input(doc):
|
1063 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
1064 |
|
|
|
1065 |
section_patterns = self.config_manager.pattern_registry.get('section_symbol', [])
|
1066 |
|
1067 |
+
issues = []
|
|
|
|
|
1068 |
|
|
|
1069 |
for paragraph in doc:
|
1070 |
sentences = re.split(r'(?<=[.!?])\s+', paragraph)
|
1071 |
|
1072 |
+
for sentence in sentences:
|
1073 |
+
sentence = sentence.strip()
|
1074 |
+
for pattern_config in section_patterns:
|
1075 |
+
compiled_pattern = re.compile(pattern_config.pattern)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1076 |
|
1077 |
+
if pattern_config.pattern == r'^§': # Start of sentence with § symbol
|
1078 |
+
if compiled_pattern.match(sentence):
|
1079 |
+
corrected_sentence = sentence.replace('§', 'Section', 1)
|
1080 |
+
issues.append({
|
1081 |
+
'sentence': f"{sentence} (Replace § with 'Section')"
|
1082 |
+
})
|
1083 |
+
|
1084 |
+
elif pattern_config.pattern == r'\b14 CFR §\s*\d+\.\d+\b': # 14 CFR § format
|
1085 |
+
matches = compiled_pattern.findall(sentence)
|
1086 |
+
for match in matches:
|
1087 |
+
corrected_sentence = sentence.replace('§', '', 1)
|
1088 |
+
issues.append({
|
1089 |
+
'sentence': f"{sentence} (Remove §)"
|
1090 |
+
})
|
1091 |
|
|
|
1092 |
return DocumentCheckResult(success=not issues, issues=issues)
|
1093 |
|
1094 |
@profile_performance
|