Hoctar77 committed on
Commit
9d1e68c
·
verified ·
1 Parent(s): c7c4f68

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -33
app.py CHANGED
@@ -354,6 +354,16 @@ class DocumentCheckerConfig:
354
  description="Ignore 'title 14, Code of Federal Regulations (14 CFR)'",
355
  is_error=False
356
  ),
 
 
 
 
 
 
 
 
 
 
357
  PatternConfig(
358
  pattern=r'\bAD Compliance Team \(AD CRT\)\b',
359
  description="Ignore 'AD Compliance Team (AD CRT)'",
@@ -658,7 +668,7 @@ class FAADocumentChecker(DocumentChecker):
658
  PREDEFINED_ACRONYMS = {
659
  'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
660
  'MD', 'MIL', 'MO', 'No.', 'PDF', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
661
- 'WA', 'ZIP'
662
  }
663
 
664
  # Constructor
@@ -1078,48 +1088,47 @@ class FAADocumentChecker(DocumentChecker):
1078
  if not self.validate_input(doc):
1079
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
1080
 
1081
- section_patterns = self.config_manager.pattern_registry.get('section_symbol', [])
1082
  issues = []
1083
-
1084
  for paragraph in doc:
1085
  sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip())
1086
 
1087
  for sentence in sentences:
1088
  sentence = sentence.strip()
1089
- for pattern_config in section_patterns:
1090
- compiled_pattern = re.compile(pattern_config.pattern)
 
 
 
 
 
 
 
 
 
 
 
1091
 
1092
- if pattern_config.pattern == r'^§': # Start of sentence with § symbol
1093
- if compiled_pattern.match(sentence):
1094
- section_ref = sentence.split()[0] # Get the first word (§XX.XX)
1095
- issues.append({
1096
- 'incorrect': section_ref,
1097
- 'correct': f"Section {section_ref.lstrip('§')}",
1098
- 'is_sentence_start': True # Flag to indicate sentence start issue
1099
- })
1100
 
1101
- elif pattern_config.pattern == r'\b14 CFR §\s*\d+\.\d+\b': # 14 CFR § format
1102
- matches = compiled_pattern.finditer(sentence)
1103
- for match in matches:
1104
- incorrect = match.group()
1105
- # Remove § symbol without adding 'Section'
1106
- correct = incorrect.replace('§ ', '')
1107
- issues.append({
1108
- 'incorrect': incorrect,
1109
- 'correct': correct
1110
- })
1111
 
1112
- elif '§' in sentence:
1113
- matches = compiled_pattern.finditer(sentence)
1114
- for match in matches:
1115
- incorrect = match.group()
1116
- correct = incorrect.replace('§', 'Section')
1117
- issues.append({
1118
- 'incorrect': incorrect,
1119
- 'correct': correct
1120
- })
 
1121
 
1122
- return DocumentCheckResult(success=not issues, issues=issues)
1123
 
1124
  @profile_performance
1125
  def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult:
 
354
  description="Ignore 'title 14, Code of Federal Regulations (14 CFR)'",
355
  is_error=False
356
  ),
357
+ PatternConfig(
358
+ pattern=r'\btitle 49 of the United States Code \(49 U.S.C.\)\b',
359
+ description="Ignore 'title 49 of the United States Code (49 U.S.C.)'",
360
+ is_error=False
361
+ ),
362
+ PatternConfig(
363
+ pattern=r'\btitle 49, United States Code \(49 U.S.C.\)\b',
364
+ description="Ignore 'title 49, United States Code (49 U.S.C.)'",
365
+ is_error=False
366
+ ),
367
  PatternConfig(
368
  pattern=r'\bAD Compliance Team \(AD CRT\)\b',
369
  description="Ignore 'AD Compliance Team (AD CRT)'",
 
668
  PREDEFINED_ACRONYMS = {
669
  'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
670
  'MD', 'MIL', 'MO', 'No.', 'PDF', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
671
+ 'WA', 'XX', 'ZIP'
672
  }
673
 
674
  # Constructor
 
1088
  if not self.validate_input(doc):
1089
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
1090
 
 
1091
  issues = []
1092
+
1093
  for paragraph in doc:
1094
  sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip())
1095
 
1096
  for sentence in sentences:
1097
  sentence = sentence.strip()
1098
+
1099
+ # Check 14 CFR citations only
1100
+ cfr_matches = re.finditer(r'\b14 CFR §\s*(\d+\.\d+)\b', sentence)
1101
+ for match in cfr_matches:
1102
+ # Skip if this is part of a U.S.C. citation
1103
+ if not re.search(r'U\.S\.C\.\s*§', sentence):
1104
+ full_match = match.group(0)
1105
+ section_num = match.group(1)
1106
+ issues.append({
1107
+ 'incorrect': full_match,
1108
+ 'correct': f'14 CFR {section_num}',
1109
+ 'description': f"Replace '{full_match}' with '14 CFR {section_num}'"
1110
+ })
1111
 
1112
+ # Skip any checks for sections that are part of U.S.C. citations
1113
+ if re.search(r'U\.S\.C\.\s*(?:§|§§)', sentence):
1114
+ continue
 
 
 
 
 
1115
 
1116
+ # Skip any checks for sections that are part of 14 CFR citations
1117
+ if re.search(r'14 CFR\s*§', sentence):
1118
+ continue
 
 
 
 
 
 
 
1119
 
1120
+ # Check section symbol at start of sentence
1121
+ if sentence.startswith('§'):
1122
+ match = re.match(r'^§\s*(\d+(?:\.\d+)?)', sentence)
1123
+ if match:
1124
+ section_num = match.group(1)
1125
+ issues.append({
1126
+ 'incorrect': f'§ {section_num}',
1127
+ 'correct': f'Section {section_num}',
1128
+ 'description': f"Replace '§ {section_num}' with 'Section {section_num}'"
1129
+ })
1130
 
1131
+ return DocumentCheckResult(success=len(issues) == 0, issues=issues)
1132
 
1133
  @profile_performance
1134
  def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult: