Hoctar77 committed on
Commit
ff7a3c4
·
verified ·
1 Parent(s): 7a044f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -45
app.py CHANGED
@@ -1004,27 +1004,20 @@ class FAADocumentChecker(DocumentChecker):
1004
  if not self.validate_input(doc):
1005
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
1006
 
1007
- # Get patterns from the pattern registry
1008
  terminology_patterns = self.config_manager.pattern_registry.get('terminology', [])
1009
  prohibited_patterns = self.config_manager.pattern_registry.get('reference_terms', [])
1010
 
1011
- # Use a dictionary to track unique issues by sentence
1012
- # Key: sentence, Value: list of issues in that sentence
1013
  sentence_issues = {}
1014
 
1015
- # Check each paragraph for terminology issues
1016
  for paragraph in doc:
1017
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
1018
  for sentence in sentences:
1019
  sentence = sentence.strip()
1020
-
1021
- # Skip empty sentences
1022
  if not sentence:
1023
  continue
1024
 
1025
  current_sentence_issues = []
1026
 
1027
- # Check for incorrect terms that need replacement
1028
  for pattern_config in terminology_patterns:
1029
  matches = list(re.finditer(pattern_config.pattern, sentence))
1030
  for match in matches:
@@ -1035,7 +1028,6 @@ class FAADocumentChecker(DocumentChecker):
1035
  'sentence': sentence
1036
  })
1037
 
1038
- # Check for prohibited phrases and constructions
1039
  for pattern_config in prohibited_patterns:
1040
  if re.search(pattern_config.pattern, sentence, re.IGNORECASE):
1041
  current_sentence_issues.append({
@@ -1043,32 +1035,26 @@ class FAADocumentChecker(DocumentChecker):
1043
  'sentence': sentence
1044
  })
1045
 
1046
- # Only add if we found issues in this sentence
1047
  if current_sentence_issues:
1048
- # Use sentence as key to prevent duplicates
1049
  if sentence not in sentence_issues:
1050
  sentence_issues[sentence] = current_sentence_issues
1051
  else:
1052
  sentence_issues[sentence].extend(current_sentence_issues)
1053
 
1054
- # Build the issues per sentence
1055
  unique_issues = []
1056
  for sentence, sentence_issue_list in sentence_issues.items():
1057
- incorrect_terms = set()
1058
- descriptions = set()
1059
  for issue in sentence_issue_list:
1060
- if 'incorrect_term' in issue:
1061
- incorrect_terms.add((issue['incorrect_term'], issue.get('correct_term')))
1062
- if 'description' in issue:
1063
- descriptions.add(issue['description'])
1064
- unique_issues.append({
1065
- 'sentence': sentence,
1066
- 'incorrect_terms': list(incorrect_terms),
1067
- 'descriptions': list(descriptions),
1068
- })
1069
 
1070
- return DocumentCheckResult(success=not unique_issues, issues=unique_issues)
 
 
 
 
1071
 
 
1072
 
1073
  @profile_performance
1074
  def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
@@ -1076,36 +1062,33 @@ class FAADocumentChecker(DocumentChecker):
1076
  if not self.validate_input(doc):
1077
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
1078
 
1079
- # Patterns for specific issues
1080
  section_patterns = self.config_manager.pattern_registry.get('section_symbol', [])
1081
 
1082
- # Capture problematic sentences or phrases
1083
- sentences_starting_with_section_symbol = []
1084
- incorrect_14_CFR_section_symbol_usage = []
1085
 
1086
- # Define patterns and check the document for issues
1087
  for paragraph in doc:
1088
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
1089
 
1090
- for pattern_config in section_patterns:
1091
- compiled_pattern = re.compile(pattern_config.pattern)
1092
- if pattern_config.pattern == r'^§':
1093
- for sentence in sentences:
1094
- if compiled_pattern.match(sentence.strip()):
1095
- sentences_starting_with_section_symbol.append(sentence.strip())
1096
- elif pattern_config.pattern == r'\b14 CFR §\s*\d+\.\d+\b':
1097
- matches = compiled_pattern.findall(paragraph)
1098
- incorrect_14_CFR_section_symbol_usage.extend(matches)
1099
-
1100
- # Minimal output structure with only the sentences and matches needing correction
1101
- issues = []
1102
- if sentences_starting_with_section_symbol:
1103
- issues.extend(sentences_starting_with_section_symbol)
1104
 
1105
- if incorrect_14_CFR_section_symbol_usage:
1106
- issues.extend(incorrect_14_CFR_section_symbol_usage)
 
 
 
 
 
 
 
 
 
 
 
 
1107
 
1108
- # Return only the list of issues
1109
  return DocumentCheckResult(success=not issues, issues=issues)
1110
 
1111
  @profile_performance
 
1004
  if not self.validate_input(doc):
1005
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
1006
 
 
1007
  terminology_patterns = self.config_manager.pattern_registry.get('terminology', [])
1008
  prohibited_patterns = self.config_manager.pattern_registry.get('reference_terms', [])
1009
 
 
 
1010
  sentence_issues = {}
1011
 
 
1012
  for paragraph in doc:
1013
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
1014
  for sentence in sentences:
1015
  sentence = sentence.strip()
 
 
1016
  if not sentence:
1017
  continue
1018
 
1019
  current_sentence_issues = []
1020
 
 
1021
  for pattern_config in terminology_patterns:
1022
  matches = list(re.finditer(pattern_config.pattern, sentence))
1023
  for match in matches:
 
1028
  'sentence': sentence
1029
  })
1030
 
 
1031
  for pattern_config in prohibited_patterns:
1032
  if re.search(pattern_config.pattern, sentence, re.IGNORECASE):
1033
  current_sentence_issues.append({
 
1035
  'sentence': sentence
1036
  })
1037
 
 
1038
  if current_sentence_issues:
 
1039
  if sentence not in sentence_issues:
1040
  sentence_issues[sentence] = current_sentence_issues
1041
  else:
1042
  sentence_issues[sentence].extend(current_sentence_issues)
1043
 
 
1044
  unique_issues = []
1045
  for sentence, sentence_issue_list in sentence_issues.items():
1046
+ replacements = []
 
1047
  for issue in sentence_issue_list:
1048
+ if 'incorrect_term' in issue and issue.get('correct_term'):
1049
+ replacements.append(f"'{issue['incorrect_term']}' with '{issue['correct_term']}'")
 
 
 
 
 
 
 
1050
 
1051
+ replacement_text = "; ".join(replacements)
1052
+ formatted_issue = {
1053
+ 'sentence': f"{sentence} ({'Replace ' + replacement_text})" if replacements else sentence
1054
+ }
1055
+ unique_issues.append(formatted_issue)
1056
 
1057
+ return DocumentCheckResult(success=not unique_issues, issues=unique_issues)
1058
 
1059
  @profile_performance
1060
  def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
 
1062
  if not self.validate_input(doc):
1063
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
1064
 
 
1065
  section_patterns = self.config_manager.pattern_registry.get('section_symbol', [])
1066
 
1067
+ issues = []
 
 
1068
 
 
1069
  for paragraph in doc:
1070
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
1071
 
1072
+ for sentence in sentences:
1073
+ sentence = sentence.strip()
1074
+ for pattern_config in section_patterns:
1075
+ compiled_pattern = re.compile(pattern_config.pattern)
 
 
 
 
 
 
 
 
 
 
1076
 
1077
+ if pattern_config.pattern == r'^§': # Start of sentence with § symbol
1078
+ if compiled_pattern.match(sentence):
1079
+ corrected_sentence = sentence.replace('§', 'Section', 1)
1080
+ issues.append({
1081
+ 'sentence': f"{sentence} (Replace § with 'Section')"
1082
+ })
1083
+
1084
+ elif pattern_config.pattern == r'\b14 CFR §\s*\d+\.\d+\b': # 14 CFR § format
1085
+ matches = compiled_pattern.findall(sentence)
1086
+ for match in matches:
1087
+ corrected_sentence = sentence.replace('§', '', 1)
1088
+ issues.append({
1089
+ 'sentence': f"{sentence} (Remove §)"
1090
+ })
1091
 
 
1092
  return DocumentCheckResult(success=not issues, issues=issues)
1093
 
1094
  @profile_performance