Hoctar77 committed on
Commit
c996527
·
verified ·
1 Parent(s): ff7a3c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -66
app.py CHANGED
@@ -722,16 +722,6 @@ class FAADocumentChecker(DocumentChecker):
722
  # Core Check Methods
723
  @profile_performance
724
  def heading_title_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
725
- """
726
- Check headings for a specific document type.
727
-
728
- Args:
729
- doc (List[str]): List of document paragraphs
730
- doc_type (str): Type of document being checked
731
-
732
- Returns:
733
- DocumentCheckResult: Result of heading check including found and missing headings
734
- """
735
  if not self.validate_input(doc):
736
  self.logger.error("Invalid document input for heading check")
737
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
@@ -758,13 +748,29 @@ class FAADocumentChecker(DocumentChecker):
758
  headings_found = []
759
  required_headings_set = set(required_headings)
760
 
761
- # Extract and normalize headings from document
 
 
 
 
 
 
 
 
762
  for para in doc:
763
  para_strip = para.strip()
764
- # Handle both exact matches and variations with trailing periods
765
- para_base = para_strip.rstrip('.')
766
- if para_base in required_headings_set or para_strip in required_headings_set:
767
- headings_found.append(para_strip)
 
 
 
 
 
 
 
 
768
 
769
  # Check if all required headings are found
770
  found_headings_set = set(headings_found)
@@ -796,6 +802,7 @@ class FAADocumentChecker(DocumentChecker):
796
 
797
  return DocumentCheckResult(success=success, issues=issues, details=details)
798
 
 
799
  @profile_performance
800
  def heading_title_period_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
801
  """
@@ -907,39 +914,22 @@ class FAADocumentChecker(DocumentChecker):
907
 
908
  @profile_performance
909
  def acronym_check(self, doc: List[str]) -> DocumentCheckResult:
910
- """
911
- Check if acronyms are defined at their first use, ignoring uppercase headings
912
- and common exceptions.
913
- """
914
  if not self.validate_input(doc):
915
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
916
 
917
- defined_acronyms = set()
918
- first_occurrences = {} # Track first occurrence of each acronym
919
- undefined_acronyms = []
920
-
921
  # Common words that might appear in uppercase but aren't acronyms
922
- heading_words = {
923
- 'INFORMATION', 'GENERAL', 'SUMMARY', 'INTRODUCTION', 'BACKGROUND',
924
- 'DISCUSSION', 'CONCLUSION', 'APPENDIX', 'CHAPTER', 'SECTION',
925
- 'PURPOSE', 'APPLICABILITY', 'CANCELLATION', 'DEFINITION', 'REQUIREMENTS',
926
- 'AUTHORITY', 'POLICY', 'SCOPE', 'RELATED', 'MATERIAL', 'DISTRIBUTION',
927
- 'EXPLANATION', 'PROCEDURES', 'NOTE', 'WARNING', 'CAUTION', 'EXCEPTION',
928
- 'GROUPS', 'PARTS', 'TABLE', 'FIGURE', 'REFERENCES', 'DEFINITIONS'
929
- }
930
 
931
  # Standard acronyms that don't need to be defined
932
- predefined_acronyms = {
933
- 'CFR', 'U.S.', 'USA', 'US', 'U.S.C', 'e.g.', 'i.e.', 'FAQ', 'No.', 'ZIP', 'PDF', 'SSN',
934
- 'DC', 'MA', 'WA', 'TX', 'MO'
935
- }
936
 
937
- defined_acronyms.update(predefined_acronyms)
 
 
 
938
 
939
- # Pattern for finding defined acronyms like "Federal Aviation Administration (FAA)"
940
  defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
941
-
942
- # Modified acronym pattern to exclude common heading patterns
943
  acronym_pattern = re.compile(r'\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
944
 
945
  for paragraph in doc:
@@ -948,48 +938,68 @@ class FAADocumentChecker(DocumentChecker):
948
  if all(word.isupper() for word in words) and any(word in heading_words for word in words):
949
  continue
950
 
951
- # Check for definitions first
952
  defined_matches = defined_pattern.findall(paragraph)
953
  for full_term, acronym in defined_matches:
954
- defined_acronyms.add(acronym)
955
- # If this was previously marked as undefined, remove it
956
- if acronym in first_occurrences:
957
- del first_occurrences[acronym]
 
 
 
 
 
 
958
 
959
  # Check for acronym usage
960
  usage_matches = acronym_pattern.finditer(paragraph)
961
  for match in usage_matches:
962
  acronym = match.group()
963
-
 
 
 
 
964
  # Skip if it's part of a heading or contains non-letter characters
965
- if (acronym in heading_words or
966
  any(not c.isalpha() for c in acronym) or
967
  len(acronym) > 10): # Usually acronyms aren't this long
968
  continue
969
 
970
  if acronym not in defined_acronyms:
971
- # Only process if we haven't seen this acronym before
972
- if acronym not in first_occurrences:
973
- # Find the sentence containing the first undefined acronym
974
- sentences = re.split(r'(?<=[.!?])\s+', paragraph)
975
- for sentence in sentences:
976
- if acronym in sentence:
977
- # Additional check to avoid marking uppercase headings
978
- if not (sentence.isupper() and any(word in heading_words for word in sentence.split())):
979
- first_occurrences[acronym] = {
980
- 'acronym': acronym,
981
- 'sentence': sentence.strip()
982
- }
983
- break
984
-
985
- # Convert first occurrences to list of issues
986
- undefined_acronyms = list(first_occurrences.values())
987
-
988
- success = len(undefined_acronyms) == 0
989
- issues = undefined_acronyms if not success else []
 
 
 
 
 
 
 
 
 
990
 
991
  return DocumentCheckResult(success=success, issues=issues)
992
 
 
993
  @profile_performance
994
  def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
995
  """
@@ -1942,6 +1952,14 @@ class DocumentCheckResultsFormatter:
1942
  subsequent_indent=' '
1943
  )
1944
 
 
 
 
 
 
 
 
 
1945
  # Handle issues with direct sentence reference
1946
  elif 'sentence' in issue:
1947
  return textwrap.fill(
@@ -2405,7 +2423,7 @@ def create_interface():
2405
  """
2406
 
2407
  # Extract issues
2408
- issues_match = re.findall(r'•\s*([^•\n]+)', content)
2409
  issues_html_section = ""
2410
  if issues_match:
2411
  issues_html_section = """
 
722
  # Core Check Methods
723
  @profile_performance
724
  def heading_title_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
 
 
 
 
 
 
 
 
 
 
725
  if not self.validate_input(doc):
726
  self.logger.error("Invalid document input for heading check")
727
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
 
748
  headings_found = []
749
  required_headings_set = set(required_headings)
750
 
751
+ # Precompile a regex pattern to match headings at the start of the paragraph
752
+ # Escape special characters in headings and allow for optional spaces and periods
753
+ heading_patterns = []
754
+ for heading in required_headings:
755
+ escaped_heading = re.escape(heading.rstrip('.'))
756
+ pattern = rf'^\s*{escaped_heading}\.?\s*'
757
+ heading_patterns.append(pattern)
758
+ combined_pattern = re.compile('|'.join(heading_patterns), re.IGNORECASE)
759
+
760
  for para in doc:
761
  para_strip = para.strip()
762
+ # Check if paragraph starts with any of the required headings
763
+ match = combined_pattern.match(para_strip)
764
+ if match:
765
+ # Extract the matched heading
766
+ matched_heading = match.group().strip()
767
+ # Normalize the matched heading to compare with required headings
768
+ matched_heading_base = matched_heading.rstrip('.').strip()
769
+ # Find the exact heading from required headings (case-insensitive)
770
+ for required_heading in required_headings:
771
+ if matched_heading_base.lower() == required_heading.rstrip('.').lower():
772
+ headings_found.append(required_heading)
773
+ break
774
 
775
  # Check if all required headings are found
776
  found_headings_set = set(headings_found)
 
802
 
803
  return DocumentCheckResult(success=success, issues=issues, details=details)
804
 
805
+
806
  @profile_performance
807
  def heading_title_period_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
808
  """
 
914
 
915
  @profile_performance
916
  def acronym_check(self, doc: List[str]) -> DocumentCheckResult:
 
 
 
 
917
  if not self.validate_input(doc):
918
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
919
 
 
 
 
 
920
  # Common words that might appear in uppercase but aren't acronyms
921
+ heading_words = self.config_manager.config.get('heading_words', HEADING_WORDS)
 
 
 
 
 
 
 
922
 
923
  # Standard acronyms that don't need to be defined
924
+ predefined_acronyms = self.config_manager.config.get('predefined_acronyms', PREDEFINED_ACRONYMS)
 
 
 
925
 
926
+ # Tracking structures
927
+ defined_acronyms = {} # Stores definition info
928
+ used_acronyms = set() # Stores acronyms used after definition
929
+ issues = []
930
 
931
+ # Patterns
932
  defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
 
 
933
  acronym_pattern = re.compile(r'\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
934
 
935
  for paragraph in doc:
 
938
  if all(word.isupper() for word in words) and any(word in heading_words for word in words):
939
  continue
940
 
941
+ # Check for acronym definitions first
942
  defined_matches = defined_pattern.findall(paragraph)
943
  for full_term, acronym in defined_matches:
944
+ if acronym not in predefined_acronyms:
945
+ if acronym not in defined_acronyms:
946
+ defined_acronyms[acronym] = {
947
+ 'full_term': full_term.strip(),
948
+ 'defined_at': paragraph.strip(),
949
+ 'used': False # Initially not used
950
+ }
951
+ else:
952
+ # Handle duplicate definitions if necessary
953
+ pass # You may add logic for duplicate definitions
954
 
955
  # Check for acronym usage
956
  usage_matches = acronym_pattern.finditer(paragraph)
957
  for match in usage_matches:
958
  acronym = match.group()
959
+
960
+ # Skip predefined acronyms
961
+ if acronym in predefined_acronyms:
962
+ continue
963
+
964
  # Skip if it's part of a heading or contains non-letter characters
965
+ if (acronym in heading_words or
966
  any(not c.isalpha() for c in acronym) or
967
  len(acronym) > 10): # Usually acronyms aren't this long
968
  continue
969
 
970
  if acronym not in defined_acronyms:
971
+ # Undefined acronym used
972
+ issues.append({
973
+ 'type': 'undefined_acronym',
974
+ 'acronym': acronym,
975
+ 'sentence': paragraph.strip()
976
+ })
977
+ else:
978
+ # Mark as used
979
+ defined_acronyms[acronym]['used'] = True
980
+ used_acronyms.add(acronym)
981
+
982
+ # Check for defined but unused acronyms
983
+ unused_acronyms = [
984
+ {
985
+ 'type': 'unused_acronym',
986
+ 'acronym': acronym,
987
+ 'full_term': data['full_term'],
988
+ 'defined_at': data['defined_at']
989
+ }
990
+ for acronym, data in defined_acronyms.items()
991
+ if not data['used']
992
+ ]
993
+
994
+ # Combine issues
995
+ if unused_acronyms:
996
+ issues.extend(unused_acronyms)
997
+
998
+ success = len(issues) == 0
999
 
1000
  return DocumentCheckResult(success=success, issues=issues)
1001
 
1002
+
1003
  @profile_performance
1004
  def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
1005
  """
 
1952
  subsequent_indent=' '
1953
  )
1954
 
1955
+ # Handle unused acronym issues
1956
+ if issue.get('type') == 'unused_acronym':
1957
+ return textwrap.fill(
1958
+ f" • Acronym '{issue['acronym']}' defined but not used again after definition.",
1959
+ width=76,
1960
+ subsequent_indent=' '
1961
+ )
1962
+
1963
  # Handle issues with direct sentence reference
1964
  elif 'sentence' in issue:
1965
  return textwrap.fill(
 
2423
  """
2424
 
2425
  # Extract issues
2426
+ issues_match = re.findall(r'•\s*(.+?)(?=\n•|\Z)', content, re.DOTALL)
2427
  issues_html_section = ""
2428
  if issues_match:
2429
  issues_html_section = """