Hoctar77 committed on
Commit
429fee2
·
verified ·
1 Parent(s): 0a842be

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -20
app.py CHANGED
@@ -55,7 +55,7 @@ HEADING_WORDS = {
55
  # Predefined Acronyms
56
  PREDEFINED_ACRONYMS = {
57
  'CFR', 'U.S.', 'USA', 'US', 'U.S.C', 'e.g.', 'i.e.', 'FAQ', 'No.', 'ZIP', 'PDF', 'SSN',
58
- 'DC', 'MA', 'WA', 'TX', 'MO'
59
  }
60
 
61
  # Configuration Constants
@@ -451,6 +451,16 @@ class DocumentCheckerConfig:
451
  """
452
  return {
453
  'terminology': [
 
 
 
 
 
 
 
 
 
 
454
  PatternConfig(
455
  pattern=r'\bUSC\b',
456
  description="USC should be U.S.C.",
@@ -924,10 +934,10 @@ class FAADocumentChecker(DocumentChecker):
924
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
925
 
926
  # Common words that might appear in uppercase but aren't acronyms
927
- heading_words = self.config_manager.config.get('heading_words', HEADING_WORDS)
928
 
929
  # Standard acronyms that don't need to be defined
930
- predefined_acronyms = self.config_manager.config.get('predefined_acronyms', PREDEFINED_ACRONYMS)
931
 
932
  # Tracking structures
933
  defined_acronyms = {} # Stores definition info
@@ -936,6 +946,7 @@ class FAADocumentChecker(DocumentChecker):
936
 
937
  # Patterns
938
  defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
 
939
  acronym_pattern = re.compile(r'(?<!\()\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
940
 
941
  for paragraph in doc:
@@ -975,37 +986,69 @@ class FAADocumentChecker(DocumentChecker):
975
 
976
  if acronym not in defined_acronyms:
977
  # Undefined acronym used
978
- issues.append({
979
- 'type': 'undefined_acronym',
980
- 'acronym': acronym,
981
- 'sentence': paragraph.strip()
982
- })
983
  else:
984
  # Mark as used
985
  defined_acronyms[acronym]['used'] = True
986
  used_acronyms.add(acronym)
987
 
988
- # Check for defined but unused acronyms
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
989
  unused_acronyms = [
990
  {
991
- 'type': 'unused_acronym',
992
  'acronym': acronym,
993
  'full_term': data['full_term'],
994
  'defined_at': data['defined_at']
995
  }
996
  for acronym, data in defined_acronyms.items()
997
- if not data['used']
998
  ]
999
 
1000
- # Combine issues
1001
- if unused_acronyms:
1002
- issues.extend(unused_acronyms)
1003
-
1004
- success = len(issues) == 0
1005
-
1006
- return DocumentCheckResult(success=success, issues=issues)
1007
-
1008
 
 
 
1009
  @profile_performance
1010
  def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
1011
  """
@@ -1706,6 +1749,7 @@ class FAADocumentChecker(DocumentChecker):
1706
  ('heading_title_check', lambda: self.heading_title_check(doc, doc_type)),
1707
  ('heading_title_period_check', lambda: self.heading_title_period_check(doc, doc_type)),
1708
  ('acronym_check', lambda: self.acronym_check(doc)),
 
1709
  ('terminology_check', lambda: self.check_terminology(doc)),
1710
  ('section_symbol_usage_check', lambda: self.check_section_symbol_usage(doc)),
1711
  ('caption_check_table', lambda: self.caption_check(doc, doc_type, 'Table')),
@@ -1777,6 +1821,15 @@ class DocumentCheckResultsFormatter:
1777
  'after': 'This order establishes general Federal Aviation Administration (FAA) organizational policies.'
1778
  }
1779
  },
 
 
 
 
 
 
 
 
 
1780
  'terminology_check': {
1781
  'title': 'Incorrect Terminology',
1782
  'description': 'Evaluates document text against the various style manuals and orders to identify non-compliant terminology, ambiguous references, and outdated phrases. This includes checking for prohibited relative references (like "above" or "below"), proper legal terminology (like "must" instead of "shall"), and consistent formatting of regulatory citations. The check ensures precise, unambiguous communication that meets current FAA documentation requirements.',
@@ -1909,6 +1962,23 @@ class DocumentCheckResultsFormatter:
1909
 
1910
  return output
1911
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1912
  def _format_caption_issues(self, result: DocumentCheckResult) -> List[str]:
1913
  """Format caption issues consistently."""
1914
  output = []
@@ -2087,7 +2157,7 @@ class DocumentCheckResultsFormatter:
2087
  "italics": True,
2088
  "quotes": False,
2089
  "description": "For Advisory Circulars, referenced document titles should be italicized but not quoted",
2090
- "example": "See AC 25.1309-1B, *System Design and Analysis*, for information on X."
2091
  },
2092
  "quotes_only": {
2093
  "types": [
@@ -2137,6 +2207,18 @@ class DocumentCheckResultsFormatter:
2137
 
2138
  output = []
2139
 
 
 
 
 
 
 
 
 
 
 
 
 
2140
  # Header
2141
  output.append(f"\n{Fore.CYAN}{'='*80}")
2142
  output.append(f"Document Check Results Summary")
@@ -2179,6 +2261,8 @@ class DocumentCheckResultsFormatter:
2179
  output.extend(self._format_reference_issues(result))
2180
  elif check_name in ['caption_check_table', 'caption_check_figure']:
2181
  output.extend(self._format_caption_issues(result))
 
 
2182
  else:
2183
  # Standard issue formatting
2184
  formatted_issues = [self._format_standard_issue(issue) for issue in result.issues[:7]]
 
55
  # Predefined Acronyms
56
  PREDEFINED_ACRONYMS = {
57
  'CFR', 'U.S.', 'USA', 'US', 'U.S.C', 'e.g.', 'i.e.', 'FAQ', 'No.', 'ZIP', 'PDF', 'SSN',
58
+ 'DC', 'MD', 'MA', 'WA', 'TX', 'MO', 'FAA IR-M', 'DOT'
59
  }
60
 
61
  # Configuration Constants
 
451
  """
452
  return {
453
  'terminology': [
454
+ PatternConfig(
455
+ pattern=r'\btitle 14 of the Code of Federal Regulations \(14 CFR\)\b',
456
+ description="Ignore 'title 14 of the Code of Federal Regulations (14 CFR)'",
457
+ is_error=False # Set to False to ignore this phrase
458
+ ),
459
+ PatternConfig(
460
+ pattern=r'\btitle 14, Code of Federal Regulations \(14 CFR\)\b',
461
+ description="Ignore 'title 14, Code of Federal Regulations (14 CFR)'",
462
+ is_error=False
463
+ ),
464
  PatternConfig(
465
  pattern=r'\bUSC\b',
466
  description="USC should be U.S.C.",
 
934
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
935
 
936
  # Common words that might appear in uppercase but aren't acronyms
937
+ heading_words = self.config_manager.config.get('heading_words', self.HEADING_WORDS)
938
 
939
  # Standard acronyms that don't need to be defined
940
+ predefined_acronyms = self.config_manager.config.get('predefined_acronyms', self.PREDEFINED_ACRONYMS)
941
 
942
  # Tracking structures
943
  defined_acronyms = {} # Stores definition info
 
946
 
947
  # Patterns
948
  defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
949
+ # Modified acronym pattern
950
  acronym_pattern = re.compile(r'(?<!\()\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
951
 
952
  for paragraph in doc:
 
986
 
987
  if acronym not in defined_acronyms:
988
  # Undefined acronym used
989
+ issues.append(acronym) # Add only the acronym, not the sentence
 
 
 
 
990
  else:
991
  # Mark as used
992
  defined_acronyms[acronym]['used'] = True
993
  used_acronyms.add(acronym)
994
 
995
+ # Define success based on whether there are any undefined acronyms
996
+ success = len(issues) == 0
997
+
998
+ # Return the result with only undefined acronyms
999
+ return DocumentCheckResult(success=success, issues=list(set(issues)))
1000
+
1001
+ @profile_performance
1002
+ def acronym_usage_check(self, doc: List[str]) -> DocumentCheckResult:
1003
+ if not self.validate_input(doc):
1004
+ return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
1005
+
1006
+ # Pattern to find acronym definitions (e.g., "Environmental Protection Agency (EPA)")
1007
+ defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
1008
+
1009
+ # Pattern to find acronym usage (e.g., "FAA", "EPA")
1010
+ acronym_pattern = re.compile(r'\b[A-Z]{2,}\b')
1011
+
1012
+ # Tracking structures
1013
+ defined_acronyms = {}
1014
+ used_acronyms = set()
1015
+
1016
+ # Step 1: Extract all defined acronyms
1017
+ for paragraph in doc:
1018
+ defined_matches = defined_pattern.findall(paragraph)
1019
+ for full_term, acronym in defined_matches:
1020
+ if acronym not in defined_acronyms:
1021
+ defined_acronyms[acronym] = {
1022
+ 'full_term': full_term.strip(),
1023
+ 'defined_at': paragraph.strip()
1024
+ }
1025
+
1026
+ # Step 2: Check for acronym usage, excluding definitions
1027
+ for paragraph in doc:
1028
+ # Remove definitions from paragraph for usage checks
1029
+ paragraph_excluding_definitions = re.sub(defined_pattern, '', paragraph)
1030
+
1031
+ usage_matches = acronym_pattern.findall(paragraph_excluding_definitions)
1032
+ for acronym in usage_matches:
1033
+ if acronym in defined_acronyms:
1034
+ used_acronyms.add(acronym)
1035
+
1036
+ # Step 3: Identify unused acronyms
1037
  unused_acronyms = [
1038
  {
 
1039
  'acronym': acronym,
1040
  'full_term': data['full_term'],
1041
  'defined_at': data['defined_at']
1042
  }
1043
  for acronym, data in defined_acronyms.items()
1044
+ if acronym not in used_acronyms
1045
  ]
1046
 
1047
+ # Success is true if no unused acronyms are found
1048
+ success = len(unused_acronyms) == 0
 
 
 
 
 
 
1049
 
1050
+ return DocumentCheckResult(success=success, issues=unused_acronyms)
1051
+
1052
  @profile_performance
1053
  def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
1054
  """
 
1749
  ('heading_title_check', lambda: self.heading_title_check(doc, doc_type)),
1750
  ('heading_title_period_check', lambda: self.heading_title_period_check(doc, doc_type)),
1751
  ('acronym_check', lambda: self.acronym_check(doc)),
1752
+ ('acronym_usage_check', lambda: self.acronym_usage_check(doc)),
1753
  ('terminology_check', lambda: self.check_terminology(doc)),
1754
  ('section_symbol_usage_check', lambda: self.check_section_symbol_usage(doc)),
1755
  ('caption_check_table', lambda: self.caption_check(doc, doc_type, 'Table')),
 
1821
  'after': 'This order establishes general Federal Aviation Administration (FAA) organizational policies.'
1822
  }
1823
  },
1824
+ 'acronym_usage_check': {
1825
+ 'title': 'Unused Acronym Definitions',
1826
+ 'description': 'Ensures all acronyms defined in the document are subsequently used. If a term is defined but not used, it should not be defined.',
1827
+ 'solution': 'Remove definitions for acronyms that are not used later in the document.',
1828
+ 'example_fix': {
1829
+ 'before': 'Airworthiness Directive (AD) requirements are critical.',
1830
+ 'after': 'Remove "Airworthiness Directive (AD)" if "AD" is not used elsewhere.'
1831
+ }
1832
+ },
1833
  'terminology_check': {
1834
  'title': 'Incorrect Terminology',
1835
  'description': 'Evaluates document text against the various style manuals and orders to identify non-compliant terminology, ambiguous references, and outdated phrases. This includes checking for prohibited relative references (like "above" or "below"), proper legal terminology (like "must" instead of "shall"), and consistent formatting of regulatory citations. The check ensures precise, unambiguous communication that meets current FAA documentation requirements.',
 
1962
 
1963
  return output
1964
 
1965
+ def _format_unused_acronym_issues(self, result: DocumentCheckResult) -> List[str]:
1966
+ """
1967
+ Format issues for unused acronyms to display only the acronym.
1968
+
1969
+ Args:
1970
+ result: The DocumentCheckResult object containing issues.
1971
+
1972
+ Returns:
1973
+ List[str]: Formatted lines displaying unused acronyms.
1974
+ """
1975
+ output = []
1976
+ for issue in result.issues:
1977
+ if isinstance(issue, dict):
1978
+ acronym = issue.get('acronym', 'Unknown Acronym')
1979
+ output.append(f" • Acronym '{acronym}' was defined but never used.")
1980
+ return output
1981
+
1982
  def _format_caption_issues(self, result: DocumentCheckResult) -> List[str]:
1983
  """Format caption issues consistently."""
1984
  output = []
 
2157
  "italics": True,
2158
  "quotes": False,
2159
  "description": "For Advisory Circulars, referenced document titles should be italicized but not quoted",
2160
+ "example": "See AC 25.1309-1B, <i>System Design and Analysis</i>, for information on X."
2161
  },
2162
  "quotes_only": {
2163
  "types": [
 
2207
 
2208
  output = []
2209
 
2210
+ self.issue_categories['acronym_usage_check'] = {
2211
+ 'title': 'Unused Acronym Definitions',
2212
+ 'description': 'Ensures all acronyms defined in the document are subsequently used.',
2213
+ 'solution': 'Remove definitions for acronyms that are not used later in the document.',
2214
+ 'example_fix': {
2215
+ 'before': 'Airworthiness Directive (AD) requirements are critical.',
2216
+ 'after': 'Remove "Airworthiness Directive (AD)" if "AD" is not used elsewhere.'
2217
+ }
2218
+ }
2219
+
2220
+ output = []
2221
+
2222
  # Header
2223
  output.append(f"\n{Fore.CYAN}{'='*80}")
2224
  output.append(f"Document Check Results Summary")
 
2261
  output.extend(self._format_reference_issues(result))
2262
  elif check_name in ['caption_check_table', 'caption_check_figure']:
2263
  output.extend(self._format_caption_issues(result))
2264
+ elif check_name == 'acronym_usage_check':
2265
+ output.extend(self._format_unused_acronym_issues(result))
2266
  else:
2267
  # Standard issue formatting
2268
  formatted_issues = [self._format_standard_issue(issue) for issue in result.issues[:7]]