Hoctar77 committed on
Commit
8db81fe
·
verified ·
1 Parent(s): 9d1e68c

Added paragraph and sentence length checks

Browse files
Files changed (1) hide show
  1. app.py +417 -23
app.py CHANGED
@@ -662,7 +662,7 @@ class FAADocumentChecker(DocumentChecker):
662
  'DISTRIBUTION', 'EXCEPTION', 'EXPLANATION', 'FIGURE', 'GENERAL', 'GROUPS',
663
  'INFORMATION', 'INSERT', 'INTRODUCTION', 'MATERIAL', 'NOTE', 'PARTS', 'PAST',
664
  'POLICY', 'PRACTICE', 'PROCEDURES', 'PURPOSE', 'RELEVANT', 'RELATED',
665
- 'REQUIREMENTS', 'SCOPE', 'SECTION', 'SUMMARY', 'TABLE', 'WARNING'
666
  }
667
 
668
  PREDEFINED_ACRONYMS = {
@@ -1457,6 +1457,7 @@ class FAADocumentChecker(DocumentChecker):
1457
 
1458
  # Patterns to ignore (aviation references)
1459
  ignore_patterns = [
 
1460
  r'\bAD \d{4}-\d{2}-\d{2}\b', # Airworthiness Directive references
1461
  r'\bSWPM \d{2}-\d{2}-\d{2}\b', # Standard Wiring Practices Manual references
1462
  r'\bAMM \d{2}-\d{2}-\d{2}\b', # Aircraft Maintenance Manual references
@@ -1667,20 +1668,21 @@ class FAADocumentChecker(DocumentChecker):
1667
  check_sequence = [
1668
  ('heading_title_check', lambda: self.heading_title_check(doc, doc_type)),
1669
  ('heading_title_period_check', lambda: self.heading_title_period_check(doc, doc_type)),
 
1670
  ('acronym_check', lambda: self.acronym_check(doc)),
1671
  ('acronym_usage_check', lambda: self.acronym_usage_check(doc)),
1672
- ('terminology_check', lambda: self.check_terminology(doc)),
1673
  ('section_symbol_usage_check', lambda: self.check_section_symbol_usage(doc)),
 
 
 
1674
  ('caption_check_table', lambda: self.caption_check(doc, doc_type, 'Table')),
1675
  ('caption_check_figure', lambda: self.caption_check(doc, doc_type, 'Figure')),
1676
  ('table_figure_reference_check', lambda: self.table_figure_reference_check(doc, doc_type)),
1677
- ('document_title_check', lambda: self.document_title_check(doc_path, doc_type) if not skip_title_check else DocumentCheckResult(success=True, issues=[])),
1678
  ('double_period_check', lambda: self.double_period_check(doc)),
1679
  ('spacing_check', lambda: self.spacing_check(doc)),
1680
- ('abbreviation_usage_check', lambda: self.check_abbreviation_usage(doc)),
1681
- ('date_formats_check', lambda: self.check_date_formats(doc)),
1682
- ('placeholders_check', lambda: self.check_placeholders(doc)),
1683
- ('parentheses_check', lambda: self.check_parentheses(doc))
1684
  ]
1685
 
1686
  # Run each check and store results
@@ -1871,6 +1873,369 @@ class FAADocumentChecker(DocumentChecker):
1871
 
1872
  return formatted_issues
1873
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1874
  class DocumentCheckResultsFormatter:
1875
 
1876
  def __init__(self):
@@ -1890,7 +2255,7 @@ class DocumentCheckResultsFormatter:
1890
  'heading_title_period_check': {
1891
  'title': 'Heading Period Format',
1892
  'description': 'Examines heading punctuation to ensure compliance with FAA document formatting standards. Some FAA documents (like Advisory Circulars and Orders) require periods at the end of headings, while others (like Federal Register Notices) don\'t.',
1893
- 'solution': 'Format heading periods according to document type requirements',
1894
  'example_fix': {
1895
  'before': 'Purpose',
1896
  'after': 'Purpose.' # For ACs and Orders
@@ -1899,7 +2264,7 @@ class DocumentCheckResultsFormatter:
1899
  'table_figure_reference_check': {
1900
  'title': 'Table and Figure References',
1901
  'description': 'Analyzes how tables and figures are referenced within your document text. Capitalize references at the beginning of sentences (e.g., "Table 2-1 shows...") and use lowercase references within sentences (e.g., "...as shown in table 2-1").',
1902
- 'solution': 'Capitalize references at start of sentences, use lowercase within sentences',
1903
  'example_fix': {
1904
  'before': 'The DTR values are specified in Table 3-1 and Figure 3-2.',
1905
  'after': 'The DTR values are specified in table 3-1 and figure 3-2.'
@@ -1908,7 +2273,7 @@ class DocumentCheckResultsFormatter:
1908
  'acronym_check': {
1909
  'title': 'Acronym Definition Issues',
1910
  'description': 'Ensures every acronym is properly introduced with its full term at first use. The check identifies undefined acronyms while recognizing common exceptions (like U.S.) that don\'t require definition.',
1911
- 'solution': 'Define each acronym at its first use, e.g., "Federal Aviation Administration (FAA)"',
1912
  'example_fix': {
1913
  'before': 'This order establishes general FAA organizational policies.',
1914
  'after': 'This order establishes general Federal Aviation Administration (FAA) organizational policies.'
@@ -1926,7 +2291,7 @@ class DocumentCheckResultsFormatter:
1926
  'terminology_check': {
1927
  'title': 'Incorrect Terminology',
1928
  'description': 'Evaluates document text against the various style manuals and orders to identify non-compliant terminology, ambiguous references, and outdated phrases. This includes checking for prohibited relative references (like "above" or "below"), proper legal terminology (like "must" instead of "shall"), and consistent formatting of regulatory citations.',
1929
- 'solution': 'Use explicit references to paragraphs, sections, tables, and figures',
1930
  'example_fix': {
1931
  'before': 'Operators shall comply with ADs to ensure aircraft safety and regulatory compliance',
1932
  'after': 'Operators must comply with ADs to ensure aircraft safety and regulatory compliance.'
@@ -1935,7 +2300,7 @@ class DocumentCheckResultsFormatter:
1935
  'section_symbol_usage_check': {
1936
  'title': 'Section Symbol (Β§) Format Issues',
1937
  'description': 'Examines the usage of section symbols (Β§) throughout your document. This includes verifying proper symbol placement in regulatory references, ensuring sections aren\'t started with the symbol, checking consistency in multiple-section citations, and validating proper CFR citations. For ACs, see FAA Order 1320.46.',
1938
- 'solution': 'Format section symbols correctly and never start sentences with them',
1939
  'example_fix': {
1940
  'before': 'Β§ 23.3 establishes design criteria.',
1941
  'after': 'Section 23.3 establishes design criteria.'
@@ -1944,7 +2309,7 @@ class DocumentCheckResultsFormatter:
1944
  'double_period_check': {
1945
  'title': 'Multiple Period Issues',
1946
  'description': 'Examines sentences for accidental double periods that often occur during document editing and revision. While double periods are sometimes found in ellipses (...) or web addresses, they should never appear at the end of standard sentences in FAA documentation.',
1947
- 'solution': 'Remove multiple periods that end sentences',
1948
  'example_fix': {
1949
  'before': 'The following ACs are related to the guidance in this document..',
1950
  'after': 'The following ACs are related to the guidance in this document.'
@@ -1971,7 +2336,7 @@ class DocumentCheckResultsFormatter:
1971
  'placeholders_check': {
1972
  'title': 'Placeholder Content',
1973
  'description': 'Identifies incomplete content and temporary placeholders that must be finalized before document publication. This includes common placeholder text (like "TBD" or "To be determined"), draft markers, and incomplete sections.',
1974
- 'solution': 'Replace all placeholder content with actual content',
1975
  'example_fix': {
1976
  'before': 'Pilots must submit the [Insert text] form to the FAA for approval.',
1977
  'after': 'Pilots must submit the Report of Eye Evaluation form 8500-7 to the FAA for approval.'
@@ -1980,11 +2345,29 @@ class DocumentCheckResultsFormatter:
1980
  'parentheses_check': {
1981
  'title': 'Parentheses Balance Check',
1982
  'description': 'Ensures that all parentheses in the document are properly paired with matching opening and closing characters.',
1983
- 'solution': 'Add missing opening or closing parentheses where indicated',
1984
  'example_fix': {
1985
  'before': 'The system (as defined in AC 25-11B performs...',
1986
  'after': 'The system (as defined in AC 25-11B) performs...'
1987
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1988
  }
1989
  }
1990
 
@@ -2093,12 +2476,18 @@ class DocumentCheckResultsFormatter:
2093
  if 'incorrect_term' in issue and 'correct_term' in issue:
2094
  return f" β€’ Replace '{issue['incorrect_term']}' with '{issue['correct_term']}'"
2095
 
 
 
 
2096
  if 'sentence' in issue:
2097
  return f" β€’ {issue['sentence']}"
2098
 
2099
  if 'description' in issue:
2100
  return f" β€’ {issue['description']}"
2101
 
 
 
 
2102
  # Fallback for other issue formats
2103
  return f" β€’ {str(issue)}"
2104
 
@@ -2324,12 +2713,14 @@ class DocumentCheckResultsFormatter:
2324
  output.extend(self._format_section_symbol_issues(result))
2325
  elif check_name == 'parentheses_check':
2326
  output.extend(self._format_parentheses_issues(result))
 
 
2327
  else:
2328
- formatted_issues = [self._format_standard_issue(issue) for issue in result.issues[:10]]
2329
  output.extend(formatted_issues)
2330
 
2331
  if len(result.issues) > 10:
2332
- output.append(f"\n ... and {len(result.issues) - 10} more similar issues.")
2333
 
2334
  return '\n'.join(output)
2335
 
@@ -2401,18 +2792,21 @@ def format_markdown_results(results: Dict[str, DocumentCheckResult], doc_type: s
2401
  check_categories = {
2402
  'heading_title_check': {'title': 'πŸ“‹ Required Headings', 'priority': 1},
2403
  'heading_title_period_check': {'title': 'πŸ” Heading Period Usage', 'priority': 1},
2404
- 'acronym_check': {'title': 'πŸ“ Acronym Definitions', 'priority': 2},
2405
- 'terminology_check': {'title': 'πŸ“– Terminology Usage', 'priority': 2},
 
2406
  'section_symbol_usage_check': {'title': 'Β§ Section Symbol Usage', 'priority': 2},
 
 
 
2407
  'caption_check_table': {'title': 'πŸ“Š Table Captions', 'priority': 3},
2408
  'caption_check_figure': {'title': 'πŸ–ΌοΈ Figure Captions', 'priority': 3},
2409
  'table_figure_reference_check': {'title': 'πŸ”— Table/Figure References', 'priority': 3},
2410
- 'document_title_check': {'title': 'πŸ“‘ Document Title Format', 'priority': 1},
2411
  'double_period_check': {'title': '⚑ Double Periods', 'priority': 4},
2412
  'spacing_check': {'title': '⌨️ Spacing Issues', 'priority': 4},
2413
- 'abbreviation_usage_check': {'title': 'πŸ“Ž Abbreviation Usage', 'priority': 3},
2414
- 'date_formats_check': {'title': 'πŸ“… Date Formats', 'priority': 3},
2415
- 'placeholders_check': {'title': '🚩 Placeholder Content', 'priority': 1}
2416
  }
2417
 
2418
  sorted_checks = sorted(
 
662
  'DISTRIBUTION', 'EXCEPTION', 'EXPLANATION', 'FIGURE', 'GENERAL', 'GROUPS',
663
  'INFORMATION', 'INSERT', 'INTRODUCTION', 'MATERIAL', 'NOTE', 'PARTS', 'PAST',
664
  'POLICY', 'PRACTICE', 'PROCEDURES', 'PURPOSE', 'RELEVANT', 'RELATED',
665
+ 'REQUIREMENTS', 'REPORT', 'SCOPE', 'SECTION', 'SUMMARY', 'TABLE', 'WARNING'
666
  }
667
 
668
  PREDEFINED_ACRONYMS = {
 
1457
 
1458
  # Patterns to ignore (aviation references)
1459
  ignore_patterns = [
1460
+ r'\bAC\s*\d+(?:[-.]\d+)*[A-Z]*\b', # AC reference pattern
1461
  r'\bAD \d{4}-\d{2}-\d{2}\b', # Airworthiness Directive references
1462
  r'\bSWPM \d{2}-\d{2}-\d{2}\b', # Standard Wiring Practices Manual references
1463
  r'\bAMM \d{2}-\d{2}-\d{2}\b', # Aircraft Maintenance Manual references
 
1668
  check_sequence = [
1669
  ('heading_title_check', lambda: self.heading_title_check(doc, doc_type)),
1670
  ('heading_title_period_check', lambda: self.heading_title_period_check(doc, doc_type)),
1671
+ ('terminology_check', lambda: self.check_terminology(doc)),
1672
  ('acronym_check', lambda: self.acronym_check(doc)),
1673
  ('acronym_usage_check', lambda: self.acronym_usage_check(doc)),
 
1674
  ('section_symbol_usage_check', lambda: self.check_section_symbol_usage(doc)),
1675
+ ('date_formats_check', lambda: self.check_date_formats(doc)),
1676
+ ('placeholders_check', lambda: self.check_placeholders(doc)),
1677
+ ('document_title_check', lambda: self.document_title_check(doc_path, doc_type) if not skip_title_check else DocumentCheckResult(success=True, issues=[])),
1678
  ('caption_check_table', lambda: self.caption_check(doc, doc_type, 'Table')),
1679
  ('caption_check_figure', lambda: self.caption_check(doc, doc_type, 'Figure')),
1680
  ('table_figure_reference_check', lambda: self.table_figure_reference_check(doc, doc_type)),
1681
+ ('parentheses_check', lambda: self.check_parentheses(doc)),
1682
  ('double_period_check', lambda: self.double_period_check(doc)),
1683
  ('spacing_check', lambda: self.spacing_check(doc)),
1684
+ ('paragraph_length_check', lambda: self.check_paragraph_length(doc)),
1685
+ ('sentence_length_check', lambda: self.check_sentence_length(doc)),
 
 
1686
  ]
1687
 
1688
  # Run each check and store results
 
1873
 
1874
  return formatted_issues
1875
 
1876
@profile_performance
def check_abbreviation_usage(self, doc: List[str]) -> DocumentCheckResult:
    """Flag full terms that are spelled out again after their acronym was defined.

    A definition is recognised in the form ``Full Term (FT)``: a parenthesised
    run of two or more capital letters, preceded by exactly as many words as
    the acronym has letters, whose initials spell the acronym.  Once a
    definition has been seen, every later sentence that contains the full term
    is reported (the defining sentence itself is never reported).

    Args:
        doc: Document paragraphs, one string per paragraph.

    Returns:
        DocumentCheckResult that succeeds when no full term is repeated after
        its definition.  Each issue carries ``issue_type``, ``full_term``,
        ``acronym``, ``sentence`` and ``definition_context``.
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])

    acronym_in_parens = re.compile(r'\(([A-Z]{2,})\)')
    word_pattern = re.compile(r'\S+')

    # acronym -> {"full_term": ..., "first_occurrence": defining sentence}
    abbreviations = {}
    inconsistent_uses = []

    def find_definitions(sentence: str) -> list:
        """Return (acronym, full_term) pairs for definitions found in the sentence."""
        found = []
        for match in acronym_in_parens.finditer(sentence):
            acronym = match.group(1)
            # Only the text before the opening parenthesis can hold the full term.
            words = list(word_pattern.finditer(sentence, 0, match.start()))
            candidate = words[-len(acronym):]
            if len(candidate) != len(acronym):
                continue
            initials = ''.join(word.group()[0] for word in candidate)
            if initials.upper() == acronym:
                # Slice the original text so internal spacing is preserved.
                full_term = sentence[candidate[0].start():candidate[-1].end()]
                found.append((acronym, full_term))
        return found

    for paragraph in doc:
        for raw_sentence in re.split(r'(?<=[.!?])\s+', paragraph):
            sentence = raw_sentence.strip()
            if not sentence:
                continue

            # Check against definitions seen so far before recording new ones,
            # so a defining sentence is never compared with itself.
            for acronym, data in abbreviations.items():
                if data["full_term"] not in sentence:
                    continue
                if sentence == data["first_occurrence"]:
                    continue
                inconsistent_uses.append({
                    'issue_type': 'full_term_after_acronym',
                    'full_term': data["full_term"],
                    'acronym': acronym,
                    'sentence': sentence,
                    'definition_context': data["first_occurrence"]
                })

            # Keep the first definition of each acronym as the reference.
            for acronym, full_term in find_definitions(sentence):
                abbreviations.setdefault(acronym, {
                    "full_term": full_term,
                    "first_occurrence": sentence
                })

    return DocumentCheckResult(success=not inconsistent_uses, issues=inconsistent_uses)
1916
+
1917
@profile_performance
def check_date_formats(self, doc: List[str]) -> DocumentCheckResult:
    """Report dates that match the registry's 'dates' patterns.

    Document and manual reference numbers (AC, AD, SWPM, AMM, SOPM and the
    generic ``XX 00-00-00`` form) are masked before matching so they are not
    mistaken for dates.  Each distinct offending text is reported once.

    Args:
        doc: Document paragraphs, one string per paragraph.

    Returns:
        DocumentCheckResult that succeeds when no date pattern matches.  Each
        issue is ``{'incorrect': <matched text>, 'correct': 'Month Day, Year'}``.
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])

    # Get patterns from registry and compile them once, not once per sentence.
    date_patterns = self.config_manager.pattern_registry.get('dates', [])
    date_regexes = [re.compile(pattern_config.pattern) for pattern_config in date_patterns]

    # Patterns to ignore (aviation references)
    ignore_patterns = [
        r'\bAC\s*\d+(?:[-.]\d+)*[A-Z]*\b',      # AC reference pattern
        r'\bAD \d{4}-\d{2}-\d{2}\b',            # Airworthiness Directive references
        r'\bSWPM \d{2}-\d{2}-\d{2}\b',          # Standard Wiring Practices Manual references
        r'\bAMM \d{2}-\d{2}-\d{2}\b',           # Aircraft Maintenance Manual references
        r'\bSOPM \d{2}-\d{2}-\d{2}\b',          # Standard Operating Procedure references
        r'\b[A-Z]{2,4} \d{2}-\d{2}-\d{2}\b'     # Generic manual reference pattern
    ]
    ignore_pattern = re.compile('|'.join(f'(?:{pattern})' for pattern in ignore_patterns))

    unique_issues = []
    seen_dates = set()

    for sentence, _paragraph in self._process_sentences(doc, skip_empty=True, skip_headings=True):
        # Mask ignored references with same-length filler so match offsets
        # still line up with the original sentence.
        working_sentence = ignore_pattern.sub(lambda m: 'X' * len(m.group()), sentence)

        for date_regex in date_regexes:
            for match in date_regex.finditer(working_sentence):
                # Get the original text from the match position
                original_date = sentence[match.start():match.end()]
                if original_date in seen_dates:
                    continue
                seen_dates.add(original_date)
                unique_issues.append({
                    'incorrect': original_date,
                    'correct': 'Month Day, Year'
                })

    return DocumentCheckResult(success=not unique_issues, issues=unique_issues)
1971
+
1972
@profile_performance
def check_placeholders(self, doc: List[str]) -> DocumentCheckResult:
    """Find placeholder text (TBD / To be determined / To be added) left in the document.

    The work is delegated to ``_process_patterns`` with the 'placeholders'
    pattern category; the nested callback below groups matches by kind.
    """
    def process_placeholders(doc: List[str], patterns: List[PatternConfig]) -> DocumentCheckResult:
        tbd_hits = []
        determined_hits = []
        added_hits = []

        # Only registry patterns whose text equals one of these keys are collected.
        buckets = {
            r'\bTBD\b': tbd_hits,
            r'\bTo be determined\b': determined_hits,
            r'\bTo be added\b': added_hits,
        }

        for sentence, _paragraph in self._process_sentences(doc, skip_empty=True, skip_headings=True):
            for config in patterns:
                regex = re.compile(config.pattern, re.IGNORECASE)
                bucket = buckets.get(config.pattern)
                if bucket is None:
                    continue
                for hit in regex.finditer(sentence):
                    bucket.append({
                        'placeholder': hit.group().strip(),
                        'sentence': sentence.strip(),
                        'description': config.description
                    })

        # One summary issue per placeholder kind that actually occurred.
        summaries = (
            ('tbd_placeholder', 'Remove TBD placeholder', tbd_hits),
            ('to_be_determined_placeholder', "Remove 'To be determined' placeholder", determined_hits),
            ('to_be_added_placeholder', "Remove 'To be added' placeholder", added_hits),
        )
        issues = [
            {'issue_type': issue_type, 'description': description, 'occurrences': hits}
            for issue_type, description, hits in summaries
            if hits
        ]

        counts = {
            'TBD': len(tbd_hits),
            'To be determined': len(determined_hits),
            'To be added': len(added_hits)
        }
        details = {
            'total_placeholders': counts['TBD'] + counts['To be determined'] + counts['To be added'],
            'placeholder_types': counts
        }

        return DocumentCheckResult(success=len(issues) == 0, issues=issues, details=details)

    return self._process_patterns(doc, 'placeholders', process_placeholders)
2038
+
2039
@profile_performance
def _process_patterns(
    self,
    doc: List[str],
    pattern_category: str,
    process_func: Optional[Callable] = None
) -> DocumentCheckResult:
    """
    Process document text against patterns from a specific category.

    Args:
        doc: List of document paragraphs
        pattern_category: Category of patterns to check against
        process_func: Optional custom processing function; when given it is
            called as ``process_func(doc, patterns)`` and its result is
            returned unchanged

    Returns:
        DocumentCheckResult with processed issues.  In the default path each
        issue has ``incorrect`` (matched text), ``description`` and
        ``replacement`` (``None`` when the pattern config has no such
        attribute); identical issues are reported once, in sorted order.
    """
    if not self.validate_input(doc):
        self.logger.error("Invalid document input for pattern check")
        return DocumentCheckResult(
            success=False,
            issues=[{'error': 'Invalid document input'}]
        )

    # Get patterns from registry
    patterns = self.config_manager.pattern_registry.get(pattern_category, [])
    if not patterns:
        self.logger.warning(f"No patterns found for category: {pattern_category}")
        return DocumentCheckResult(
            success=True,
            issues=[],
            details={'message': f'No patterns defined for {pattern_category}'}
        )

    # Use custom processing function if provided
    if process_func:
        return process_func(doc, patterns)

    # Default processing with deduplication: tuples in a set keep one copy
    # of each (matched text, description, replacement) combination.
    unique_issues = set()

    for paragraph in doc:
        for raw_sentence in re.split(r'(?<=[.!?])\s+', paragraph):
            sentence = raw_sentence.strip()
            if not sentence:
                continue

            for pattern_config in patterns:
                replacement = getattr(pattern_config, 'replacement', None)
                for match in re.finditer(pattern_config.pattern, sentence):
                    unique_issues.add((
                        match.group(),
                        pattern_config.description,
                        replacement
                    ))

    def sort_key(issue: tuple) -> tuple:
        # None cannot be ordered against str, so compare it as an empty string.
        return tuple('' if part is None else part for part in issue)

    # Convert unique issues back to the expected format, sorted for consistent output
    formatted_issues = [
        {
            'incorrect': matched_text,
            'description': description,
            'replacement': replacement
        }
        for matched_text, description, replacement in sorted(unique_issues, key=sort_key)
    ]

    return DocumentCheckResult(success=not formatted_issues, issues=formatted_issues)
2110
+
2111
@profile_performance
def check_paragraph_length(self, doc: List[str]) -> DocumentCheckResult:
    """
    Check for overly long paragraphs that may need to be split up.

    A paragraph is flagged when it has more than 6 sentences or would occupy
    more than 8 lines when wrapped at 80 characters on word boundaries.

    Args:
        doc (List[str]): List of document paragraphs

    Returns:
        DocumentCheckResult: Results of paragraph length check; each issue is
        a message string quoting the first sentence of a flagged paragraph
    """
    # Local import keeps this method self-contained without touching the
    # module's import block.
    import textwrap

    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])

    issues = []

    for paragraph in doc:
        stripped = paragraph.strip()
        if not stripped:  # Skip empty paragraphs
            continue

        # Count sentences (split on period, exclamation, question mark followed by space)
        sentences = re.split(r'(?<=[.!?])\s+', stripped)

        # Count lines by wrapping at 80 characters.  Words are never broken,
        # so an over-long word sits on its own line and no empty line is
        # counted ahead of it.
        lines = textwrap.wrap(
            ' '.join(paragraph.split()),
            width=80,
            break_long_words=False,
            break_on_hyphens=False
        )

        # Check if paragraph exceeds either threshold
        if len(sentences) > 6 or len(lines) > 8:
            # Get first sentence for context
            first_sentence = sentences[0].strip()
            issues.append(f"Review the paragraph that starts with: \"{first_sentence}\"")

    return DocumentCheckResult(success=not issues, issues=issues)
2153
+
2154
@profile_performance
def check_sentence_length(self, doc: List[str]) -> DocumentCheckResult:
    """
    Flag sentences longer than 35 words and gather sentence-length statistics.

    Paragraphs that match any of the skip patterns below (notes, list items,
    captions, Latin abbreviations, technical or regulatory references) are
    left out entirely, both from the issues and from the statistics.

    Args:
        doc (List[str]): List of document paragraphs

    Returns:
        DocumentCheckResult:
            - success: True when no sentence exceeds 35 words
            - issues: dicts with ``sentence`` and ``word_count``
            - details: ``total_sentences``, ``long_sentences``,
              ``max_length`` and ``avg_length`` (rounded to one decimal)
    """
    if not self.validate_input(doc):
        return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])

    # Skip patterns for technical content that might naturally be longer
    skip_regex = re.compile(
        '|'.join([
            r'^(?:Note:|Warning:|Caution:)',   # Notes and warnings
            r'^\d+\.',                          # Numbered lists
            r'^\([a-z]\)',                      # Letter lists
            r'^\([0-9]\)',                      # Number lists in parentheses
            r'^Table \d',                       # Table captions
            r'^Figure \d',                      # Figure captions
            r'(?:e\.g\.|i\.e\.|viz\.,)',        # Latin abbreviations often used in complex sentences
            r'\b(?:AC|AD|TSO|SFAR)\s+\d',       # Technical references
            r'\d+\s*(?:CFR|U\.S\.C\.)',         # Regulatory references
        ]),
        re.IGNORECASE
    )

    long_sentence_issues = []
    word_counts = []  # one entry per sentence that was examined

    for paragraph in doc:
        trimmed = paragraph.strip()
        if not trimmed or skip_regex.search(paragraph):
            continue

        # Split into sentences while preserving punctuation
        for piece in re.split(r'(?<=[.!?])\s+', trimmed):
            text = piece.strip()
            if not text:
                continue

            # Count words (splitting on whitespace)
            n_words = len(text.split())
            word_counts.append(n_words)

            # Flag sentences over 35 words
            if n_words > 35:
                long_sentence_issues.append({
                    'sentence': text,
                    'word_count': n_words
                })

    n_sentences = len(word_counts)
    average = round(sum(word_counts) / n_sentences, 1) if n_sentences > 0 else 0
    sentence_stats = {
        'total_sentences': n_sentences,
        'long_sentences': len(long_sentence_issues),
        'max_length': max(word_counts, default=0),
        'avg_length': average
    }

    return DocumentCheckResult(
        success=len(long_sentence_issues) == 0,
        issues=long_sentence_issues,
        details=sentence_stats
    )
2238
+
2239
  class DocumentCheckResultsFormatter:
2240
 
2241
  def __init__(self):
 
2255
  'heading_title_period_check': {
2256
  'title': 'Heading Period Format',
2257
  'description': 'Examines heading punctuation to ensure compliance with FAA document formatting standards. Some FAA documents (like Advisory Circulars and Orders) require periods at the end of headings, while others (like Federal Register Notices) don\'t.',
2258
+ 'solution': 'Format heading periods according to document type requirements.',
2259
  'example_fix': {
2260
  'before': 'Purpose',
2261
  'after': 'Purpose.' # For ACs and Orders
 
2264
  'table_figure_reference_check': {
2265
  'title': 'Table and Figure References',
2266
  'description': 'Analyzes how tables and figures are referenced within your document text. Capitalize references at the beginning of sentences (e.g., "Table 2-1 shows...") and use lowercase references within sentences (e.g., "...as shown in table 2-1").',
2267
+ 'solution': 'Capitalize references at start of sentences, use lowercase within sentences.',
2268
  'example_fix': {
2269
  'before': 'The DTR values are specified in Table 3-1 and Figure 3-2.',
2270
  'after': 'The DTR values are specified in table 3-1 and figure 3-2.'
 
2273
  'acronym_check': {
2274
  'title': 'Acronym Definition Issues',
2275
  'description': 'Ensures every acronym is properly introduced with its full term at first use. The check identifies undefined acronyms while recognizing common exceptions (like U.S.) that don\'t require definition.',
2276
+ 'solution': 'Define each acronym at its first use, e.g., "Federal Aviation Administration (FAA)".',
2277
  'example_fix': {
2278
  'before': 'This order establishes general FAA organizational policies.',
2279
  'after': 'This order establishes general Federal Aviation Administration (FAA) organizational policies.'
 
2291
  'terminology_check': {
2292
  'title': 'Incorrect Terminology',
2293
  'description': 'Evaluates document text against the various style manuals and orders to identify non-compliant terminology, ambiguous references, and outdated phrases. This includes checking for prohibited relative references (like "above" or "below"), proper legal terminology (like "must" instead of "shall"), and consistent formatting of regulatory citations.',
2294
+ 'solution': 'Use explicit references to paragraphs, sections, tables, and figures.',
2295
  'example_fix': {
2296
  'before': 'Operators shall comply with ADs to ensure aircraft safety and regulatory compliance',
2297
  'after': 'Operators must comply with ADs to ensure aircraft safety and regulatory compliance.'
 
2300
  'section_symbol_usage_check': {
2301
  'title': 'Section Symbol (Β§) Format Issues',
2302
  'description': 'Examines the usage of section symbols (Β§) throughout your document. This includes verifying proper symbol placement in regulatory references, ensuring sections aren\'t started with the symbol, checking consistency in multiple-section citations, and validating proper CFR citations. For ACs, see FAA Order 1320.46.',
2303
+ 'solution': 'Format section symbols correctly and never start sentences with them.',
2304
  'example_fix': {
2305
  'before': 'Β§ 23.3 establishes design criteria.',
2306
  'after': 'Section 23.3 establishes design criteria.'
 
2309
  'double_period_check': {
2310
  'title': 'Multiple Period Issues',
2311
  'description': 'Examines sentences for accidental double periods that often occur during document editing and revision. While double periods are sometimes found in ellipses (...) or web addresses, they should never appear at the end of standard sentences in FAA documentation.',
2312
+ 'solution': 'Remove multiple periods that end sentences.',
2313
  'example_fix': {
2314
  'before': 'The following ACs are related to the guidance in this document..',
2315
  'after': 'The following ACs are related to the guidance in this document.'
 
2336
  'placeholders_check': {
2337
  'title': 'Placeholder Content',
2338
  'description': 'Identifies incomplete content and temporary placeholders that must be finalized before document publication. This includes common placeholder text (like "TBD" or "To be determined"), draft markers, and incomplete sections.',
2339
+ 'solution': 'Replace all placeholder content with actual content.',
2340
  'example_fix': {
2341
  'before': 'Pilots must submit the [Insert text] form to the FAA for approval.',
2342
  'after': 'Pilots must submit the Report of Eye Evaluation form 8500-7 to the FAA for approval.'
 
2345
  'parentheses_check': {
2346
  'title': 'Parentheses Balance Check',
2347
  'description': 'Ensures that all parentheses in the document are properly paired with matching opening and closing characters.',
2348
+ 'solution': 'Add missing opening or closing parentheses where indicated.',
2349
  'example_fix': {
2350
  'before': 'The system (as defined in AC 25-11B performs...',
2351
  'after': 'The system (as defined in AC 25-11B) performs...'
2352
  }
2353
+ },
2354
+ 'paragraph_length_check': {
2355
+ 'title': 'Paragraph Length Issues',
2356
+ 'description': 'Flags paragraphs exceeding 6 sentences or 8 lines to enhance readability and clarity. While concise paragraphs are encouraged, with each focusing on a single idea or related points, exceeding these limits doesn\'t necessarily indicate a problem. Some content may appropriately extend beyond 8 lines, especially if it includes necessary details. Boilerplate language or template text exceeding these limits is not subject to modification or division.',
2357
+ 'solution': 'Where possible, split long paragraphs into smaller sections, ensuring each focuses on one primary idea. If restructuring is not feasible or the content is boilerplate text, no changes are needed.',
2358
+ 'example_fix': {
2359
+ 'before': 'A very long paragraph covering multiple topics and spanning many lines...',
2360
+ 'after': 'Multiple shorter paragraphs or restructured paragraphs, each focused on a single topic or related points.'
2361
+ }
2362
+ },
2363
+ 'sentence_length_check': {
2364
+ 'title': 'Sentence Length Issues',
2365
+ 'description': 'Analyzes sentence length to ensure readability. While the ideal length varies with content complexity, sentences over 35 words often become difficult to follow. Technical content, regulatory references, notes, warnings, and list items are excluded from this check.',
2366
+ 'solution': 'Break long sentences into smaller ones where possible, focusing on one main point per sentence. Consider using lists for complex items.',
2367
+ 'example_fix': {
2368
+ 'before': 'The operator must ensure that all required maintenance procedures are performed in accordance with the manufacturer\'s specifications and that proper documentation is maintained throughout the entire process to demonstrate compliance with regulatory requirements.',
2369
+ 'after': 'The operator must ensure all required maintenance procedures are performed according to manufacturer specifications. Additionally, proper documentation must be maintained to demonstrate regulatory compliance.'
2370
+ }
2371
  }
2372
  }
2373
 
 
2476
  if 'incorrect_term' in issue and 'correct_term' in issue:
2477
  return f" β€’ Replace '{issue['incorrect_term']}' with '{issue['correct_term']}'"
2478
 
2479
+ if 'sentence' in issue and 'word_count' in issue: # For sentence length check
2480
+ return f" β€’ Review this sentence: \"{issue['sentence']}\""
2481
+
2482
  if 'sentence' in issue:
2483
  return f" β€’ {issue['sentence']}"
2484
 
2485
  if 'description' in issue:
2486
  return f" β€’ {issue['description']}"
2487
 
2488
+ if 'type' in issue and issue['type'] == 'long_paragraph':
2489
+ return f" β€’ {issue['message']}"
2490
+
2491
  # Fallback for other issue formats
2492
  return f" β€’ {str(issue)}"
2493
 
 
2713
  output.extend(self._format_section_symbol_issues(result))
2714
  elif check_name == 'parentheses_check':
2715
  output.extend(self._format_parentheses_issues(result))
2716
+ elif check_name == 'paragraph_length_check':
2717
+ output.extend(self._format_paragraph_length_issues(result))
2718
  else:
2719
+ formatted_issues = [self._format_standard_issue(issue) for issue in result.issues[:15]]
2720
  output.extend(formatted_issues)
2721
 
2722
  if len(result.issues) > 10:
2723
+ output.append(f"\n ... and {len(result.issues) - 15} more similar issues.")
2724
 
2725
  return '\n'.join(output)
2726
 
 
2792
  check_categories = {
2793
  'heading_title_check': {'title': 'πŸ“‹ Required Headings', 'priority': 1},
2794
  'heading_title_period_check': {'title': 'πŸ” Heading Period Usage', 'priority': 1},
2795
+ 'terminology_check': {'title': 'πŸ“– Terminology Usage', 'priority': 1},
2796
+ 'acronym_check': {'title': 'πŸ“ Acronym Definitions', 'priority': 1},
2797
+ 'acronym_usage_check': {'title': 'πŸ“Ž Acronym Usage', 'priority': 1},
2798
  'section_symbol_usage_check': {'title': 'Β§ Section Symbol Usage', 'priority': 2},
2799
+ 'date_formats_check': {'title': 'πŸ“… Date Formats', 'priority': 2},
2800
+ 'placeholders_check': {'title': '🚩 Placeholder Content', 'priority': 2},
2801
+ 'document_title_check': {'title': 'πŸ“‘ Document Title Format', 'priority': 2},
2802
  'caption_check_table': {'title': 'πŸ“Š Table Captions', 'priority': 3},
2803
  'caption_check_figure': {'title': 'πŸ–ΌοΈ Figure Captions', 'priority': 3},
2804
  'table_figure_reference_check': {'title': 'πŸ”— Table/Figure References', 'priority': 3},
2805
+ 'parentheses_check': {'title': 'πŸ”— Parentheses Usage', 'priority': 4},
2806
  'double_period_check': {'title': '⚑ Double Periods', 'priority': 4},
2807
  'spacing_check': {'title': '⌨️ Spacing Issues', 'priority': 4},
2808
+ 'paragraph_length_check': {'title': 'πŸ“ Paragraph Length', 'priority': 5},
2809
+ 'sentence_length_check': {'title': 'πŸ“ Sentence Length', 'priority': 5}
 
2810
  }
2811
 
2812
  sorted_checks = sorted(