Hoctar77 committed on
Commit
0656ea9
·
verified ·
1 Parent(s): bb6584a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +429 -459
app.py CHANGED
@@ -11,7 +11,12 @@ import io
11
  import os
12
  import traceback
13
  from datetime import datetime
 
14
 
 
 
 
 
15
  @dataclass
16
  class DocumentCheckResult:
17
  """Structured result for document checks."""
@@ -26,7 +31,6 @@ def profile_performance(func):
26
  start_time = time.time()
27
  result = func(*args, **kwargs)
28
  end_time = time.time()
29
- # Get logger from the class instance (first argument)
30
  logger = args[0].logger if hasattr(args[0], 'logger') else logging.getLogger(__name__)
31
  logger.info(
32
  f"Performance: {func.__name__} took {end_time - start_time:.4f} seconds"
@@ -37,7 +41,6 @@ def profile_performance(func):
37
  class DocumentCheckerConfig:
38
  """Configuration management for document checks."""
39
  def __init__(self, config_path: Optional[str] = None):
40
- """Initialize configuration with optional config file."""
41
  self.config = self._load_config(config_path)
42
  self.logger = self._setup_logger()
43
 
@@ -104,6 +107,26 @@ class DocumentCheckerConfig:
104
  ],
105
  "skip_title_check": False
106
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  "Other": {
108
  "required_headings": [],
109
  "skip_title_check": True
@@ -164,35 +187,27 @@ class DocumentChecker:
164
  return []
165
 
166
  class FAADocumentChecker(DocumentChecker):
 
167
  def __init__(self, config_path: Optional[str] = None):
168
  super().__init__(config_path)
169
 
170
- @profile_performance # Use the decorator directly
171
  def heading_title_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
172
  """Check headings for a specific document type."""
173
  if not self.validate_input(doc):
174
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
175
 
176
- # Use configuration for document-specific headings
177
- checks = self.config_manager.config['document_types'].get(
178
- doc_type, {}
179
- )
180
  required_headings = checks.get('required_headings', [])
181
-
182
  headings_found = []
183
-
184
- # Create a set for faster lookup
185
  required_headings_set = set(required_headings)
186
 
187
  for para in doc:
188
  para_strip = para.strip()
189
- # Check if the paragraph is in the required headings list
190
  if para_strip in required_headings_set:
191
  headings_found.append(para_strip)
192
 
193
- # Check if all required headings are found
194
  all_headings_present = set(headings_found) == required_headings_set
195
-
196
  issues = []
197
  if not all_headings_present:
198
  missing_headings = required_headings_set - set(headings_found)
@@ -206,23 +221,13 @@ class FAADocumentChecker(DocumentChecker):
206
  'required_headings': required_headings
207
  }
208
  )
209
-
210
  @profile_performance
211
  def heading_title_period_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
212
- """
213
- Check if headings end with periods according to document type requirements.
214
-
215
- Args:
216
- doc (List[str]): List of document paragraphs
217
- doc_type (str): Type of document being checked
218
-
219
- Returns:
220
- DocumentCheckResult: Result of the heading period check
221
- """
222
  if not self.validate_input(doc):
223
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
224
 
225
- # Define document types requiring periods in headings
226
  period_required = {
227
  "Advisory Circular": True,
228
  "Airworthiness Criteria": False,
@@ -237,10 +242,7 @@ class FAADocumentChecker(DocumentChecker):
237
  "Other": False
238
  }
239
 
240
- # Get whether periods are required for this document type
241
  should_have_period = period_required.get(doc_type, False)
242
-
243
- # Get the headings configuration for this document type
244
  checks = self.config_manager.config['document_types'].get(doc_type, {})
245
  required_headings = checks.get('required_headings', [])
246
  required_headings_set = set(required_headings)
@@ -250,7 +252,6 @@ class FAADocumentChecker(DocumentChecker):
250
 
251
  for para in doc:
252
  para_strip = para.strip()
253
- # Check only if paragraph is a heading
254
  if para_strip in required_headings_set:
255
  ends_with_period = para_strip.endswith('.')
256
 
@@ -283,78 +284,85 @@ class FAADocumentChecker(DocumentChecker):
283
  'needs_period': should_have_period
284
  })
285
 
286
- success = len(issues) == 0
287
-
288
  return DocumentCheckResult(
289
- success=success,
290
- issues=issues,
291
- details={
292
- 'document_type': doc_type,
293
- 'periods_required': should_have_period,
294
- 'checked_headings': checked_headings
295
- }
296
- )
297
 
298
  @profile_performance
299
  def acronym_check(self, doc: List[str]) -> DocumentCheckResult:
300
- """Check if acronyms are defined at their first use, only flagging the first instance of undefined acronyms."""
301
  if not self.validate_input(doc):
302
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
303
 
304
  defined_acronyms = set()
305
- first_occurrences = {} # Track first occurrence of each acronym
306
- undefined_acronyms = []
 
 
 
 
 
 
 
307
 
308
- acronym_pattern = re.compile(r'\b[A-Z]{2,}\b')
309
- defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
 
 
310
 
311
- # Predefined acronyms
312
- defined_acronyms.add("14 CFR")
 
313
 
314
  for paragraph in doc:
315
- # Check for definitions first
 
 
 
316
  defined_matches = defined_pattern.findall(paragraph)
317
  for full_term, acronym in defined_matches:
318
  defined_acronyms.add(acronym)
319
- # If this was previously marked as undefined, remove it since we found its definition
320
  if acronym in first_occurrences:
321
  del first_occurrences[acronym]
322
 
323
- # Check for acronyms in the paragraph
324
- usage_matches = acronym_pattern.findall(paragraph)
325
- for acronym in usage_matches:
326
- if acronym not in defined_acronyms:
327
- # Only process if we haven't seen this acronym before
 
 
 
 
328
  if acronym not in first_occurrences:
329
- # Find the sentence containing the first undefined acronym
330
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
331
  for sentence in sentences:
332
  if acronym in sentence:
333
- first_occurrences[acronym] = {
334
- 'acronym': acronym,
335
- 'sentence': sentence.strip()
336
- }
 
337
  break
338
 
339
- # Convert first occurrences to list of issues
340
  undefined_acronyms = list(first_occurrences.values())
341
-
342
  success = len(undefined_acronyms) == 0
343
  issues = undefined_acronyms if not success else []
344
 
345
- return DocumentCheckResult(success=success, issues=issues)
346
 
347
  @profile_performance
348
  def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
349
- """
350
- Check document terminology for:
351
- 1. Legal reference formatting and preferred terms
352
- 2. Prohibited phrases and constructions
353
- """
354
  if not self.validate_input(doc):
355
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
356
 
357
- # Dictionary of terms that should be replaced with preferred alternatives
358
  term_replacements = {
359
  r'\bUSC\b': 'U.S.C.',
360
  r'\bCFR Part\b': 'CFR part',
@@ -368,11 +376,10 @@ class FAADocumentChecker(DocumentChecker):
368
  r'\bflight crew\b': 'flightcrew'
369
  }
370
 
371
- # Prohibited phrases that should be flagged
372
  prohibited_phrases = [
373
  r'\babove\b',
374
  r'\bbelow\b',
375
- r'(?:^|(?<=[.!?]\s))There\s+(?:is|are)\b' # Matches 'There is/are' at start of sentences
376
  ]
377
 
378
  issues = []
@@ -380,7 +387,6 @@ class FAADocumentChecker(DocumentChecker):
380
  for paragraph in doc:
381
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
382
  for sentence in sentences:
383
- # Check for incorrect terms that need replacement
384
  for incorrect_pattern, correct_term in term_replacements.items():
385
  matches = re.finditer(incorrect_pattern, sentence)
386
  for match in matches:
@@ -392,7 +398,6 @@ class FAADocumentChecker(DocumentChecker):
392
  'sentence': sentence.strip()
393
  })
394
 
395
- # Check for prohibited phrases
396
  for phrase_pattern in prohibited_phrases:
397
  match = re.search(phrase_pattern, sentence, re.IGNORECASE)
398
  if match:
@@ -402,65 +407,46 @@ class FAADocumentChecker(DocumentChecker):
402
  'sentence': sentence.strip()
403
  })
404
 
405
- success = len(issues) == 0
406
- return DocumentCheckResult(success=success, issues=issues)
407
 
408
  @profile_performance
409
  def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
410
- """Check for various section symbol (Β§) usage issues."""
411
  if not self.validate_input(doc):
412
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
413
 
414
  issues = []
415
-
416
- # Patterns to identify issues
417
  sentences_starting_with_section_symbol = []
418
  incorrect_14_CFR_section_symbol_usage = []
419
  single_section_symbol_multiple_sections = []
420
  missing_section_symbol_in_multiple_sections = []
421
 
422
- # Pattern to find '14 CFR §25.25'
423
  pattern_14_CFR_section = re.compile(r'\b14 CFR Β§\s*\d+\.\d+\b')
424
-
425
- # Patterns for multiple sections with single '§'
426
  pattern_single_section_symbol_and = re.compile(r'Β§\s*\d+\.\d+\s+and\s+\d+\.\d+')
427
  pattern_single_section_symbol_or = re.compile(r'Β§\s*\d+\.\d+\s+or\s+\d+\.\d+')
428
  pattern_single_section_symbol_through = re.compile(r'Β§\s*\d+\.\d+\s+through\s+\d+\.\d+')
429
-
430
- # Pattern for missing '§' before subsequent sections with 'or'
431
  pattern_missing_section_symbol_or = re.compile(r'Β§\s*\d+\.\d+\s+or\s+Β§?\s*\d+\.\d+')
432
 
433
  for paragraph in doc:
434
- # Check for sentences starting with '§'
435
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
436
  for sentence in sentences:
437
  if sentence.strip().startswith('Β§'):
438
  sentences_starting_with_section_symbol.append(sentence.strip())
439
 
440
- # Check for '14 CFR §25.25' usage
441
  matches_14_CFR = pattern_14_CFR_section.findall(paragraph)
442
- for match in matches_14_CFR:
443
- incorrect_14_CFR_section_symbol_usage.append(match)
444
 
445
- # Check for single '§' with multiple sections using 'and'
446
  matches_and = pattern_single_section_symbol_and.findall(paragraph)
447
- for match in matches_and:
448
- single_section_symbol_multiple_sections.append(match)
449
 
450
- # Check for single '§' with multiple sections using 'or'
451
  matches_or = pattern_single_section_symbol_or.findall(paragraph)
452
- for match in matches_or:
453
- single_section_symbol_multiple_sections.append(match)
454
 
455
- # Check for single '§' with multiple sections using 'through'
456
  matches_through = pattern_single_section_symbol_through.findall(paragraph)
457
- for match in matches_through:
458
- single_section_symbol_multiple_sections.append(match)
459
 
460
- # Check for missing '§' before subsequent sections with 'or'
461
  matches_missing_or = pattern_missing_section_symbol_or.findall(paragraph)
462
- for match in matches_missing_or:
463
- missing_section_symbol_in_multiple_sections.append(match)
464
 
465
  if sentences_starting_with_section_symbol:
466
  issues.append({
@@ -483,17 +469,14 @@ class FAADocumentChecker(DocumentChecker):
483
  'matches': missing_section_symbol_in_multiple_sections
484
  })
485
 
486
- success = len(issues) == 0
487
-
488
- return DocumentCheckResult(success=success, issues=issues)
489
 
490
  @profile_performance
491
  def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult:
492
- """Check for correctly formatted captions (Table or Figure)."""
493
  if not self.validate_input(doc):
494
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
495
 
496
- # Determine the caption pattern based on document type
497
  if doc_type in ["Advisory Circular", "Order"]:
498
  caption_pattern = re.compile(rf'^{caption_type}\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]', re.IGNORECASE)
499
  correct_format = f"{caption_type} X-Y"
@@ -505,18 +488,15 @@ class FAADocumentChecker(DocumentChecker):
505
  in_toc = False
506
 
507
  for paragraph in doc:
508
- # Check for start or end of Table of Contents (TOC)
509
  if "Table of Contents" in paragraph or "Contents" in paragraph:
510
  in_toc = True
511
  continue
512
  elif in_toc and paragraph.strip() == "":
513
- in_toc = False # Assume blank line marks the end of TOC
514
 
515
- # If within TOC, skip this paragraph
516
  if in_toc:
517
  continue
518
 
519
- # Only check paragraphs that start with "Table" or "Figure" for proper caption format
520
  paragraph_strip = paragraph.strip()
521
  if paragraph_strip.lower().startswith(caption_type.lower()):
522
  if not caption_pattern.match(paragraph_strip):
@@ -525,24 +505,17 @@ class FAADocumentChecker(DocumentChecker):
525
  'correct_format': correct_format
526
  })
527
 
528
- success = len(incorrect_captions) == 0
529
-
530
- return DocumentCheckResult(success=success, issues=incorrect_captions)
531
 
532
  @profile_performance
533
  def table_figure_reference_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
534
- """
535
- Check for incorrect references to tables and figures in the document.
536
- References should be lowercase within sentences and capitalized at sentence start.
537
- """
538
  if not self.validate_input(doc):
539
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
540
 
541
  incorrect_references = []
542
 
543
- # Define patterns based on document type
544
  if doc_type in ["Advisory Circular", "Order"]:
545
- # Matches both capitalized and lowercase variations
546
  table_pattern = r'\b[Tt]able\s+\d+-\d+\b'
547
  figure_pattern = r'\b[Ff]igure\s+\d+-\d+\b'
548
  correct_mid_table_format = "table X-Y"
@@ -562,11 +535,9 @@ class FAADocumentChecker(DocumentChecker):
562
 
563
  for paragraph in doc:
564
  paragraph_strip = paragraph.strip()
565
- # Exclude captions
566
  starts_with_table_or_figure = paragraph_strip.lower().startswith('table') or paragraph_strip.lower().startswith('figure')
567
 
568
  if not starts_with_table_or_figure:
569
- # Split into sentences while preserving the original text
570
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
571
 
572
  for sentence in sentences:
@@ -576,13 +547,9 @@ class FAADocumentChecker(DocumentChecker):
576
  matches = table_ref_pattern.finditer(sentence)
577
  for match in matches:
578
  ref = match.group()
579
- # Get the text before the reference
580
  text_before = sentence[:match.start()].strip()
581
-
582
- # Determine if reference is at start of sentence
583
  is_sentence_start = text_before == ""
584
 
585
- # Check if capitalization is correct
586
  if is_sentence_start and not ref.startswith('Table'):
587
  incorrect_references.append({
588
  'incorrect_ref': ref,
@@ -602,13 +569,9 @@ class FAADocumentChecker(DocumentChecker):
602
  matches = figure_ref_pattern.finditer(sentence)
603
  for match in matches:
604
  ref = match.group()
605
- # Get the text before the reference
606
  text_before = sentence[:match.start()].strip()
607
-
608
- # Determine if reference is at start of sentence
609
  is_sentence_start = text_before == ""
610
 
611
- # Check if capitalization is correct
612
  if is_sentence_start and not ref.startswith('Figure'):
613
  incorrect_references.append({
614
  'incorrect_ref': ref,
@@ -624,14 +587,12 @@ class FAADocumentChecker(DocumentChecker):
624
  'issue': "Figure reference within sentence should be lowercase"
625
  })
626
 
627
- success = len(incorrect_references) == 0
628
- return DocumentCheckResult(success=success, issues=incorrect_references)
629
 
630
  @profile_performance
631
- def document_title_check(self, doc_path, doc_type: str) -> DocumentCheckResult:
632
  """Check for correct formatting of document titles."""
633
  try:
634
- # Handle both file paths and BytesIO objects
635
  if isinstance(doc_path, (str, bytes, io.BytesIO)):
636
  doc = Document(doc_path)
637
  else:
@@ -639,11 +600,8 @@ class FAADocumentChecker(DocumentChecker):
639
  success=False,
640
  issues=[{'error': 'Invalid document input type'}]
641
  )
642
-
643
- # Rest of the method remains the same
644
- incorrect_titles = []
645
 
646
- # Define formatting rules for different document types
647
  formatting_rules = {
648
  "Advisory Circular": {"italics": True, "quotes": False},
649
  "Airworthiness Criteria": {"italics": False, "quotes": True},
@@ -657,25 +615,23 @@ class FAADocumentChecker(DocumentChecker):
657
  "Technical Standard Order": {"italics": False, "quotes": True},
658
  "Other": {"italics": False, "quotes": False}
659
  }
660
-
661
  if doc_type not in formatting_rules:
662
  self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
663
  return DocumentCheckResult(success=True, issues=[])
664
-
665
  required_format = formatting_rules[doc_type]
666
  ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')
667
-
668
  for paragraph in doc.paragraphs:
669
  text = paragraph.text
670
  matches = ac_pattern.finditer(text)
671
-
672
  for match in matches:
673
- full_match = match.group(0)
674
  title_text = match.group(2).strip()
675
  title_start = match.start(2)
676
- title_end = match.end(2)
677
  title_in_quotes = any(q in title_text for q in ['"', "'", '"', '"', ''', '''])
678
-
679
  title_is_italicized = False
680
  current_pos = 0
681
  for run in paragraph.runs:
@@ -686,36 +642,36 @@ class FAADocumentChecker(DocumentChecker):
686
  title_is_italicized = run.italic
687
  break
688
  current_pos += run_length
689
-
690
  formatting_incorrect = False
691
  issue_message = []
692
-
693
  if required_format["italics"] and not title_is_italicized:
694
  formatting_incorrect = True
695
  issue_message.append("should be italicized")
696
  elif not required_format["italics"] and title_is_italicized:
697
  formatting_incorrect = True
698
  issue_message.append("should not be italicized")
699
-
700
  if required_format["quotes"] and not title_in_quotes:
701
  formatting_incorrect = True
702
  issue_message.append("should be in quotes")
703
  elif not required_format["quotes"] and title_in_quotes:
704
  formatting_incorrect = True
705
  issue_message.append("should not be in quotes")
706
-
707
  if formatting_incorrect:
708
  incorrect_titles.append({
709
  'text': title_text,
710
  'issue': ', '.join(issue_message),
711
  'sentence': text.strip()
712
  })
713
-
714
  return DocumentCheckResult(
715
  success=len(incorrect_titles) == 0,
716
  issues=incorrect_titles
717
  )
718
-
719
  except Exception as e:
720
  self.logger.error(f"Error in document_title_check: {e}")
721
  return DocumentCheckResult(
@@ -732,7 +688,6 @@ class FAADocumentChecker(DocumentChecker):
732
  incorrect_sentences = []
733
 
734
  for paragraph in doc:
735
- # Split the paragraph into sentences based on common sentence-ending punctuation
736
  sentences = re.split(r'(?<=[.!?]) +', paragraph)
737
  for sentence in sentences:
738
  if sentence.endswith('..'):
@@ -750,13 +705,17 @@ class FAADocumentChecker(DocumentChecker):
750
 
751
  incorrect_spacing = []
752
 
753
- # Regex patterns to find incorrect spacing
754
  patterns = [
755
- (re.compile(r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)', re.IGNORECASE), "Missing space between document type and number"),
756
- (re.compile(r'(?<!\s)(Β§|Β§Β§)(\d+\.\d+)', re.IGNORECASE), "Missing space after section symbol (Β§)"),
757
- (re.compile(r'(?<!\s)Part(\d+)', re.IGNORECASE), "Missing space between 'Part' and number"),
758
- (re.compile(r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))', re.IGNORECASE), "Missing space before paragraph indication"),
759
- (re.compile(r'\s{2,}'), "Double spaces between words")
 
 
 
 
 
760
  ]
761
 
762
  for paragraph in doc:
@@ -784,79 +743,67 @@ class FAADocumentChecker(DocumentChecker):
784
  for paragraph in doc:
785
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
786
  for sentence in sentences:
787
- # Find definitions like "Federal Aviation Administration (FAA)"
788
  defined_matches = re.findall(r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)', sentence)
789
  for full_term, acronym in defined_matches:
790
  if acronym not in abbreviations:
791
  abbreviations[acronym] = {"full_term": full_term.strip(), "defined": True}
792
 
793
- # Check for full term usage after definition
794
  for acronym, data in abbreviations.items():
795
  full_term = data["full_term"]
796
  if full_term in sentence:
797
- # Ignore first usage where it's defined
798
  if data["defined"]:
799
- data["defined"] = False # Mark it as now defined
800
  else:
801
- # Only flag subsequent occurrences
802
  issues.append({
803
  'full_term': full_term,
804
  'acronym': acronym,
805
  'sentence': sentence.strip()
806
  })
807
 
808
- success = len(issues) == 0
809
-
810
- return DocumentCheckResult(success=success, issues=issues)
811
 
812
  @profile_performance
813
  def check_date_formats(self, doc: List[str]) -> DocumentCheckResult:
814
- """Check for inconsistent date formats while ignoring aviation reference numbers."""
815
  if not self.validate_input(doc):
816
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
817
 
818
  date_issues = []
819
-
820
- # Patterns to ignore (aviation references)
821
  ignore_patterns = [
822
- r'\bAD \d{4}-\d{2}-\d{2}\b', # Airworthiness Directive references
823
- r'\bSWPM \d{2}-\d{2}-\d{2}\b', # Standard Wiring Practices Manual references
824
- r'\bAMM \d{2}-\d{2}-\d{2}\b', # Aircraft Maintenance Manual references
825
- r'\bSOPM \d{2}-\d{2}-\d{2}\b', # Standard Operating Procedure references
826
- r'\b[A-Z]{2,4} \d{2}-\d{2}-\d{2}\b' # Generic manual reference pattern
827
  ]
828
 
829
- # Combine ignore patterns into one
830
  ignore_regex = '|'.join(ignore_patterns)
831
  ignore_pattern = re.compile(ignore_regex)
832
 
833
- # Correct date pattern: 'Month Day, Year' e.g., 'January 1, 2020'
834
  correct_date_pattern = re.compile(r'\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b')
835
 
836
- # Incorrect date patterns
837
  date_patterns = [
838
- (re.compile(r'(?<![\w/-])\d{1,2}/\d{1,2}/\d{2,4}(?![\w/-])'), "Use 'Month Day, Year' format instead of 'MM/DD/YYYY'"),
839
- (re.compile(r'(?<![\w/-])\d{1,2}-\d{1,2}-\d{2,4}(?![\w/-])'), "Use 'Month Day, Year' format instead of 'MM-DD-YYYY'"),
840
- (re.compile(r'(?<![\w/-])\d{4}-\d{1,2}-\d{1,2}(?![\w/-])'), "Use 'Month Day, Year' format instead of 'YYYY-MM-DD'")
 
 
 
841
  ]
842
 
843
  for paragraph in doc:
844
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
845
  for sentence in sentences:
846
- # First, identify and temporarily remove text that should be ignored
847
  ignored_matches = list(ignore_pattern.finditer(sentence))
848
  working_sentence = sentence
849
 
850
- # Replace ignored patterns with placeholders
851
  for match in reversed(ignored_matches):
852
  start, end = match.span()
853
  working_sentence = working_sentence[:start] + 'X' * (end - start) + working_sentence[end:]
854
 
855
- # Now check for date patterns in the modified sentence
856
  for pattern, issue in date_patterns:
857
  matches = pattern.finditer(working_sentence)
858
  for match in matches:
859
- # Get the original text from the match position
860
  original_date = sentence[match.start():match.end()]
861
  date_issues.append({
862
  'date': original_date,
@@ -864,8 +811,7 @@ class FAADocumentChecker(DocumentChecker):
864
  'sentence': sentence.strip()
865
  })
866
 
867
- success = len(date_issues) == 0
868
- return DocumentCheckResult(success=success, issues=date_issues)
869
 
870
  @profile_performance
871
  def check_placeholders(self, doc: List[str]) -> DocumentCheckResult:
@@ -890,30 +836,18 @@ class FAADocumentChecker(DocumentChecker):
890
  'sentence': sentence.strip()
891
  })
892
 
893
- success = len(issues) == 0
894
-
895
- return DocumentCheckResult(success=success, issues=issues)
896
 
897
  def run_all_checks(self, doc_path: str, doc_type: str, template_type: Optional[str] = None) -> Dict[str, DocumentCheckResult]:
898
- """
899
- Run all checks on the document.
900
-
901
- Args:
902
- doc_path (str): Path to the document.
903
- doc_type (str): Type of the document.
904
- template_type (str, optional): Template type, if applicable.
905
-
906
- Returns:
907
- Dict[str, DocumentCheckResult]: Dictionary of check names to results.
908
- """
909
  # Read the document
910
  doc = self.extract_paragraphs(doc_path)
911
 
912
- # Retrieve any specific flags
913
  checks_config = self.config_manager.config['document_types'].get(doc_type, {})
914
  skip_title_check = checks_config.get('skip_title_check', False)
915
 
916
- # Run checks
917
  results = {}
918
  results['heading_title_check'] = self.heading_title_check(doc, doc_type)
919
  results['heading_title_period_check'] = self.heading_title_period_check(doc, doc_type)
@@ -935,18 +869,10 @@ class FAADocumentChecker(DocumentChecker):
935
 
936
  return results
937
 
938
- @dataclass
939
- class DocumentCheckResult:
940
- """Structured result for document checks."""
941
- success: bool
942
- issues: List[Dict[str, Any]]
943
- details: Optional[Dict[str, Any]] = None
944
-
945
- def format_check_results(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
946
- """Format check results into a Markdown string for display."""
947
  output = []
948
 
949
- # Add header with timestamp
950
  current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
951
  output.extend([
952
  f"# Document Check Results - {current_time}",
@@ -954,7 +880,6 @@ def format_check_results(results: Dict[str, DocumentCheckResult], doc_type: str)
954
  "---\n"
955
  ])
956
 
957
- # Count issues
958
  total_issues = sum(1 for r in results.values() if not r.success)
959
 
960
  if total_issues == 0:
@@ -963,73 +888,28 @@ def format_check_results(results: Dict[str, DocumentCheckResult], doc_type: str)
963
 
964
  output.append(f"❗ Found issues in {total_issues} check categories\n")
965
 
966
- # Define check categories and their display names
967
  check_categories = {
968
- 'heading_title_check': {
969
- 'title': 'πŸ“‹ Required Headings',
970
- 'priority': 1
971
- },
972
- 'heading_title_period_check': {
973
- 'title': 'πŸ” Heading Period Usage',
974
- 'priority': 1
975
- },
976
- 'acronym_check': {
977
- 'title': 'πŸ“ Acronym Definitions',
978
- 'priority': 2
979
- },
980
- 'terminology_check': {
981
- 'title': 'πŸ“– Terminology Usage',
982
- 'priority': 2
983
- },
984
- 'section_symbol_usage_check': {
985
- 'title': 'Β§ Section Symbol Usage',
986
- 'priority': 2
987
- },
988
- 'caption_check_table': {
989
- 'title': 'πŸ“Š Table Captions',
990
- 'priority': 3
991
- },
992
- 'caption_check_figure': {
993
- 'title': 'πŸ–ΌοΈ Figure Captions',
994
- 'priority': 3
995
- },
996
- 'table_figure_reference_check': {
997
- 'title': 'πŸ”— Table/Figure References',
998
- 'priority': 3
999
- },
1000
- 'document_title_check': {
1001
- 'title': 'πŸ“‘ Document Title Format',
1002
- 'priority': 1
1003
- },
1004
- 'double_period_check': {
1005
- 'title': '⚑ Double Periods',
1006
- 'priority': 4
1007
- },
1008
- 'spacing_check': {
1009
- 'title': '⌨️ Spacing Issues',
1010
- 'priority': 4
1011
- },
1012
- 'abbreviation_usage_check': {
1013
- 'title': 'πŸ“Ž Abbreviation Usage',
1014
- 'priority': 3
1015
- },
1016
- 'date_formats_check': {
1017
- 'title': 'πŸ“… Date Formats',
1018
- 'priority': 3
1019
- },
1020
- 'placeholders_check': {
1021
- 'title': '🚩 Placeholder Content',
1022
- 'priority': 1
1023
- }
1024
  }
1025
 
1026
- # Sort checks by priority
1027
  sorted_checks = sorted(
1028
  [(name, result) for name, result in results.items()],
1029
  key=lambda x: check_categories.get(x[0], {'priority': 999})['priority']
1030
  )
1031
 
1032
- # Process each check result
1033
  for check_name, result in sorted_checks:
1034
  if not result.success:
1035
  category = check_categories.get(check_name, {'title': check_name.replace('_', ' ').title()})
@@ -1037,9 +917,8 @@ def format_check_results(results: Dict[str, DocumentCheckResult], doc_type: str)
1037
  output.append(f"### {category['title']}")
1038
 
1039
  if isinstance(result.issues, list):
1040
- for issue in result.issues[:5]: # Show first 5 issues
1041
  if isinstance(issue, dict):
1042
- # Format dictionary issues
1043
  for key, value in issue.items():
1044
  if isinstance(value, list):
1045
  output.extend([f"- {item}" for item in value])
@@ -1048,13 +927,11 @@ def format_check_results(results: Dict[str, DocumentCheckResult], doc_type: str)
1048
  else:
1049
  output.append(f"- {issue}")
1050
 
1051
- # Show count of remaining issues
1052
  if len(result.issues) > 5:
1053
  output.append(f"\n*...and {len(result.issues) - 5} more similar issues*")
1054
 
1055
- output.append("") # Add spacing between sections
1056
 
1057
- # Add summary and recommendations
1058
  output.extend([
1059
  "## πŸ“‹ Summary and Recommendations",
1060
  "",
@@ -1073,206 +950,299 @@ def format_check_results(results: Dict[str, DocumentCheckResult], doc_type: str)
1073
 
1074
  return "\n".join(output)
1075
 
1076
- def process_document(file_obj, doc_type: str, template_type: Optional[str] = None) -> str:
1077
- """Process document and run all checks."""
1078
- try:
1079
- # Initialize checker
1080
- checker = FAADocumentChecker()
1081
-
1082
- # Convert file object to BytesIO if needed
1083
- if isinstance(file_obj, bytes):
1084
- file_obj = io.BytesIO(file_obj)
1085
-
1086
- # Extract paragraphs
1087
- doc = Document(file_obj)
1088
- paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
1089
-
1090
- # Rewind file object
1091
- file_obj.seek(0)
1092
-
1093
- # Run all checks
1094
- results = checker.run_all_checks(file_obj, doc_type, template_type)
1095
-
1096
- # Format results for display
1097
- return format_check_results(results, doc_type)
1098
-
1099
- except Exception as e:
1100
- logging.error(f"Error processing document: {str(e)}")
1101
- traceback.print_exc()
1102
- return f"❌ Error processing document: {str(e)}\n\nPlease ensure the file is a valid .docx document and try again."
1103
-
1104
- def format_results_for_gradio(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
1105
- """Format the results for display in Gradio."""
1106
- output = ["# Document Check Results\n"]
1107
-
1108
- # Map check names to display titles
1109
- check_titles = {
1110
- 'heading_check': "Required Headings Check",
1111
- 'heading_period_check': "Heading Period Check",
1112
- 'acronym_check': "Acronym Check",
1113
- 'terminology_check': "Terminology Check",
1114
- 'section_symbol_check': "Section Symbol Usage",
1115
- 'table_caption_check': "Table Caption Format",
1116
- 'figure_caption_check': "Figure Caption Format",
1117
- 'references_check': "Table and Figure References",
1118
- 'title_check': "Document Title Style",
1119
- 'double_period_check': "Double Period Check",
1120
- 'spacing_check': "Spacing Check",
1121
- 'abbreviation_check': "Abbreviation Usage",
1122
- 'date_check': "Date Format Check",
1123
- 'placeholder_check': "Placeholder Check"
1124
- }
1125
 
1126
- for check_name, result in results.items():
1127
- title = check_titles.get(check_name, check_name.replace('_', ' ').title())
1128
- output.append(f"## {title}")
 
 
 
 
 
 
 
 
 
 
 
 
 
1129
 
1130
- if result.success:
1131
- output.append("✅ All checks passed.\n")
1132
- else:
1133
- output.append("❌ Issues found:")
1134
- for issue in result.issues:
1135
- if isinstance(issue, dict):
1136
- for key, value in issue.items():
1137
- if isinstance(value, list):
1138
- for item in value:
1139
- output.append(f"- {item}")
1140
- else:
1141
- output.append(f"- {key}: {value}")
1142
- else:
1143
- output.append(f"- {issue}")
1144
- output.append("")
1145
 
1146
- if result.details:
1147
- output.append("Additional Details:")
1148
- for key, value in result.details.items():
1149
- if isinstance(value, list):
1150
- output.append(f"- {key}:")
1151
- for item in value:
1152
- output.append(f" - {item}")
1153
- else:
1154
- output.append(f"- {key}: {value}")
1155
- output.append("")
1156
 
1157
- return "\n".join(output)
 
1158
 
1159
- def create_interface():
1160
- """Create and configure the Gradio interface."""
1161
-
1162
- document_types = [
1163
- "Advisory Circular",
1164
- "Airworthiness Criteria",
1165
- "Deviation Memo",
1166
- "Exemption",
1167
- "Federal Register Notice",
1168
- "Order",
1169
- "Policy Statement",
1170
- "Rule",
1171
- "Special Condition",
1172
- "Technical Standard Order",
1173
- "Other"
1174
- ]
1175
-
1176
- template_types = ["Short AC template AC", "Long AC template AC"]
1177
-
1178
- # Custom CSS for better styling
1179
- custom_css = """
1180
- .gradio-container {
1181
- font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
1182
- }
1183
- .container {
1184
- max-width: 900px;
1185
- margin: auto;
1186
- }
1187
- .alert {
1188
- padding: 1rem;
1189
- margin-bottom: 1rem;
1190
- border-radius: 0.5rem;
1191
- background-color: #f8f9fa;
1192
- border: 1px solid #dee2e6;
1193
- }
1194
- """
1195
-
1196
- with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
1197
- gr.Markdown(
1198
- """
1199
- # 📑 Document Checker Tool
1200
-
1201
- ### Purpose
1202
- This tool checks Word documents for compliance with U.S. federal documentation standards.
1203
-
1204
- ### How to Use
1205
- 1. Upload your Word document (.docx format)
1206
- 2. Select the document type
1207
- 3. Click "Check Document"
1208
-
1209
- > **Note:** Please ensure your document is clean (no track changes or comments)
1210
- """
1211
- )
1212
 
1213
- with gr.Row():
1214
- with gr.Column(scale=1):
1215
- file_input = gr.File(
1216
- label="📎 Upload Word Document (.docx)",
1217
- file_types=[".docx"],
1218
- type="binary"
1219
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1220
 
1221
- doc_type = gr.Dropdown(
1222
- choices=document_types,
1223
- label="📋 Document Type",
1224
- value="Advisory Circular",
1225
- info="Select the type of document you're checking"
1226
- )
1227
 
1228
- template_type = gr.Radio(
1229
- choices=template_types,
1230
- label="📑 Template Type",
1231
- visible=False,
1232
- info="Only applicable for Advisory Circulars"
1233
- )
1234
 
1235
- submit_btn = gr.Button(
1236
- "πŸ” Check Document",
1237
- variant="primary"
1238
- )
 
 
 
 
 
1239
 
1240
- with gr.Column(scale=2):
1241
- results = gr.Markdown(
1242
- label="Check Results",
1243
- value="Results will appear here after processing...",
1244
- elem_classes=["results-panel"]
1245
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1246
 
1247
- # Update template type visibility based on document type
1248
- def update_template_visibility(doc_type):
1249
- return gr.update(visible=doc_type == "Advisory Circular")
 
 
1250
 
1251
- doc_type.change(
1252
- fn=update_template_visibility,
1253
- inputs=[doc_type],
1254
- outputs=[template_type]
1255
- )
 
 
 
1256
 
1257
- # Handle document processing
1258
- submit_btn.click(
1259
- fn=process_document,
1260
- inputs=[file_input, doc_type, template_type],
1261
- outputs=[results]
1262
- )
1263
 
1264
- gr.Markdown(
1265
- """
1266
- ### 📌 Important Notes
1267
- - This tool is in development; you may encounter false positives
1268
- - For questions or feedback, contact Eric Putnam
1269
- - Results are not stored or saved
1270
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1271
  )
1272
-
1273
- return demo
1274
 
1275
- # Initialize and launch the interface
1276
- if __name__ == "__main__":
1277
- demo = create_interface()
1278
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  import os
12
  import traceback
13
  from datetime import datetime
14
+ import gc
15
 
16
+ # Import your document checker classes
17
+ from main import FAADocumentChecker, DocumentCheckResult
18
+
19
+ # Core data structures and utilities
20
  @dataclass
21
  class DocumentCheckResult:
22
  """Structured result for document checks."""
 
31
  start_time = time.time()
32
  result = func(*args, **kwargs)
33
  end_time = time.time()
 
34
  logger = args[0].logger if hasattr(args[0], 'logger') else logging.getLogger(__name__)
35
  logger.info(
36
  f"Performance: {func.__name__} took {end_time - start_time:.4f} seconds"
 
41
  class DocumentCheckerConfig:
42
  """Configuration management for document checks."""
43
  def __init__(self, config_path: Optional[str] = None):
 
44
  self.config = self._load_config(config_path)
45
  self.logger = self._setup_logger()
46
 
 
107
  ],
108
  "skip_title_check": False
109
  },
110
+ "Airworthiness Criteria": {
111
+ "required_headings": [],
112
+ "skip_title_check": True
113
+ },
114
+ "Deviation Memo": {
115
+ "required_headings": [],
116
+ "skip_title_check": True
117
+ },
118
+ "Exemption": {
119
+ "required_headings": [],
120
+ "skip_title_check": True
121
+ },
122
+ "Rule": {
123
+ "required_headings": [],
124
+ "skip_title_check": True
125
+ },
126
+ "Special Condition": {
127
+ "required_headings": [],
128
+ "skip_title_check": True
129
+ },
130
  "Other": {
131
  "required_headings": [],
132
  "skip_title_check": True
 
187
  return []
188
 
189
  class FAADocumentChecker(DocumentChecker):
190
+ """Main document checker implementation with all check methods."""
191
  def __init__(self, config_path: Optional[str] = None):
192
  super().__init__(config_path)
193
 
194
+ @profile_performance
195
  def heading_title_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
196
  """Check headings for a specific document type."""
197
  if not self.validate_input(doc):
198
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
199
 
200
+ checks = self.config_manager.config['document_types'].get(doc_type, {})
 
 
 
201
  required_headings = checks.get('required_headings', [])
 
202
  headings_found = []
 
 
203
  required_headings_set = set(required_headings)
204
 
205
  for para in doc:
206
  para_strip = para.strip()
 
207
  if para_strip in required_headings_set:
208
  headings_found.append(para_strip)
209
 
 
210
  all_headings_present = set(headings_found) == required_headings_set
 
211
  issues = []
212
  if not all_headings_present:
213
  missing_headings = required_headings_set - set(headings_found)
 
221
  'required_headings': required_headings
222
  }
223
  )
224
+
225
  @profile_performance
226
  def heading_title_period_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
227
+ """Check if headings end with periods according to document type requirements."""
 
 
 
 
 
 
 
 
 
228
  if not self.validate_input(doc):
229
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
230
 
 
231
  period_required = {
232
  "Advisory Circular": True,
233
  "Airworthiness Criteria": False,
 
242
  "Other": False
243
  }
244
 
 
245
  should_have_period = period_required.get(doc_type, False)
 
 
246
  checks = self.config_manager.config['document_types'].get(doc_type, {})
247
  required_headings = checks.get('required_headings', [])
248
  required_headings_set = set(required_headings)
 
252
 
253
  for para in doc:
254
  para_strip = para.strip()
 
255
  if para_strip in required_headings_set:
256
  ends_with_period = para_strip.endswith('.')
257
 
 
284
  'needs_period': should_have_period
285
  })
286
 
 
 
287
  return DocumentCheckResult(
288
+ success=len(issues) == 0,
289
+ issues=issues,
290
+ details={
291
+ 'document_type': doc_type,
292
+ 'periods_required': should_have_period,
293
+ 'checked_headings': checked_headings
294
+ }
295
+ )
296
 
297
  @profile_performance
298
  def acronym_check(self, doc: List[str]) -> DocumentCheckResult:
299
+ """Check if acronyms are defined at their first use."""
300
  if not self.validate_input(doc):
301
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
302
 
303
  defined_acronyms = set()
304
+ first_occurrences = {}
305
+ heading_words = {
306
+ 'INFORMATION', 'GENERAL', 'SUMMARY', 'INTRODUCTION', 'BACKGROUND',
307
+ 'DISCUSSION', 'CONCLUSION', 'APPENDIX', 'CHAPTER', 'SECTION',
308
+ 'PURPOSE', 'APPLICABILITY', 'CANCELLATION', 'DEFINITION', 'REQUIREMENTS',
309
+ 'AUTHORITY', 'POLICY', 'SCOPE', 'RELATED', 'MATERIAL', 'DISTRIBUTION',
310
+ 'EXPLANATION', 'PROCEDURES', 'NOTE', 'WARNING', 'CAUTION', 'EXCEPTION',
311
+ 'GROUPS', 'PARTS', 'TABLE', 'FIGURE', 'REFERENCES', 'DEFINITIONS'
312
+ }
313
 
314
+ predefined_acronyms = {
315
+ 'CFR', 'U.S.', 'USA', 'US', 'U.S.C', 'e.g.', 'i.e.', 'FAQ', 'No.', 'ZIP', 'PDF', 'SSN',
316
+ 'DC', 'MA', 'WA', 'TX', 'MO'
317
+ }
318
 
319
+ defined_acronyms.update(predefined_acronyms)
320
+ defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
321
+ acronym_pattern = re.compile(r'\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
322
 
323
  for paragraph in doc:
324
+ words = paragraph.strip().split()
325
+ if all(word.isupper() for word in words) and any(word in heading_words for word in words):
326
+ continue
327
+
328
  defined_matches = defined_pattern.findall(paragraph)
329
  for full_term, acronym in defined_matches:
330
  defined_acronyms.add(acronym)
 
331
  if acronym in first_occurrences:
332
  del first_occurrences[acronym]
333
 
334
+ usage_matches = acronym_pattern.finditer(paragraph)
335
+ for match in usage_matches:
336
+ acronym = match.group()
337
+
338
+ if (acronym not in defined_acronyms and
339
+ acronym not in heading_words and
340
+ not any(not c.isalpha() for c in acronym) and
341
+ len(acronym) <= 10):
342
+
343
  if acronym not in first_occurrences:
 
344
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
345
  for sentence in sentences:
346
  if acronym in sentence:
347
+ if not (sentence.isupper() and any(word in heading_words for word in sentence.split())):
348
+ first_occurrences[acronym] = {
349
+ 'acronym': acronym,
350
+ 'sentence': sentence.strip()
351
+ }
352
  break
353
 
 
354
  undefined_acronyms = list(first_occurrences.values())
 
355
  success = len(undefined_acronyms) == 0
356
  issues = undefined_acronyms if not success else []
357
 
358
+ return DocumentCheckResult(success=success, issues=issues)
359
 
360
  @profile_performance
361
  def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
362
+ """Check document terminology for consistency and preferred terms."""
 
 
 
 
363
  if not self.validate_input(doc):
364
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
365
 
 
366
  term_replacements = {
367
  r'\bUSC\b': 'U.S.C.',
368
  r'\bCFR Part\b': 'CFR part',
 
376
  r'\bflight crew\b': 'flightcrew'
377
  }
378
 
 
379
  prohibited_phrases = [
380
  r'\babove\b',
381
  r'\bbelow\b',
382
+ r'(?:^|(?<=[.!?]\s))There\s+(?:is|are)\b'
383
  ]
384
 
385
  issues = []
 
387
  for paragraph in doc:
388
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
389
  for sentence in sentences:
 
390
  for incorrect_pattern, correct_term in term_replacements.items():
391
  matches = re.finditer(incorrect_pattern, sentence)
392
  for match in matches:
 
398
  'sentence': sentence.strip()
399
  })
400
 
 
401
  for phrase_pattern in prohibited_phrases:
402
  match = re.search(phrase_pattern, sentence, re.IGNORECASE)
403
  if match:
 
407
  'sentence': sentence.strip()
408
  })
409
 
410
+ return DocumentCheckResult(success=len(issues) == 0, issues=issues)
 
411
 
412
  @profile_performance
413
  def check_section_symbol_usage(self, doc: List[str]) -> DocumentCheckResult:
414
+ """Check for correct usage of section symbols (Β§)."""
415
  if not self.validate_input(doc):
416
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
417
 
418
  issues = []
 
 
419
  sentences_starting_with_section_symbol = []
420
  incorrect_14_CFR_section_symbol_usage = []
421
  single_section_symbol_multiple_sections = []
422
  missing_section_symbol_in_multiple_sections = []
423
 
 
424
  pattern_14_CFR_section = re.compile(r'\b14 CFR §\s*\d+\.\d+\b')
 
 
425
  pattern_single_section_symbol_and = re.compile(r'§\s*\d+\.\d+\s+and\s+\d+\.\d+')
426
  pattern_single_section_symbol_or = re.compile(r'§\s*\d+\.\d+\s+or\s+\d+\.\d+')
427
  pattern_single_section_symbol_through = re.compile(r'§\s*\d+\.\d+\s+through\s+\d+\.\d+')
 
 
428
  pattern_missing_section_symbol_or = re.compile(r'§\s*\d+\.\d+\s+or\s+§?\s*\d+\.\d+')
429
 
430
  for paragraph in doc:
 
431
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
432
  for sentence in sentences:
433
  if sentence.strip().startswith('§'):
434
  sentences_starting_with_section_symbol.append(sentence.strip())
435
 
 
436
  matches_14_CFR = pattern_14_CFR_section.findall(paragraph)
437
+ incorrect_14_CFR_section_symbol_usage.extend(matches_14_CFR)
 
438
 
 
439
  matches_and = pattern_single_section_symbol_and.findall(paragraph)
440
+ single_section_symbol_multiple_sections.extend(matches_and)
 
441
 
 
442
  matches_or = pattern_single_section_symbol_or.findall(paragraph)
443
+ single_section_symbol_multiple_sections.extend(matches_or)
 
444
 
 
445
  matches_through = pattern_single_section_symbol_through.findall(paragraph)
446
+ single_section_symbol_multiple_sections.extend(matches_through)
 
447
 
 
448
  matches_missing_or = pattern_missing_section_symbol_or.findall(paragraph)
449
+ missing_section_symbol_in_multiple_sections.extend(matches_missing_or)
 
450
 
451
  if sentences_starting_with_section_symbol:
452
  issues.append({
 
469
  'matches': missing_section_symbol_in_multiple_sections
470
  })
471
 
472
+ return DocumentCheckResult(success=len(issues) == 0, issues=issues)
 
 
473
 
474
  @profile_performance
475
  def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult:
476
+ """Check for correctly formatted captions."""
477
  if not self.validate_input(doc):
478
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
479
 
 
480
  if doc_type in ["Advisory Circular", "Order"]:
481
  caption_pattern = re.compile(rf'^{caption_type}\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]', re.IGNORECASE)
482
  correct_format = f"{caption_type} X-Y"
 
488
  in_toc = False
489
 
490
  for paragraph in doc:
 
491
  if "Table of Contents" in paragraph or "Contents" in paragraph:
492
  in_toc = True
493
  continue
494
  elif in_toc and paragraph.strip() == "":
495
+ in_toc = False
496
 
 
497
  if in_toc:
498
  continue
499
 
 
500
  paragraph_strip = paragraph.strip()
501
  if paragraph_strip.lower().startswith(caption_type.lower()):
502
  if not caption_pattern.match(paragraph_strip):
 
505
  'correct_format': correct_format
506
  })
507
 
508
+ return DocumentCheckResult(success=len(incorrect_captions) == 0, issues=incorrect_captions)
 
 
509
 
510
  @profile_performance
511
  def table_figure_reference_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
512
+ """Check for correct references to tables and figures."""
 
 
 
513
  if not self.validate_input(doc):
514
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
515
 
516
  incorrect_references = []
517
 
 
518
  if doc_type in ["Advisory Circular", "Order"]:
 
519
  table_pattern = r'\b[Tt]able\s+\d+-\d+\b'
520
  figure_pattern = r'\b[Ff]igure\s+\d+-\d+\b'
521
  correct_mid_table_format = "table X-Y"
 
535
 
536
  for paragraph in doc:
537
  paragraph_strip = paragraph.strip()
 
538
  starts_with_table_or_figure = paragraph_strip.lower().startswith('table') or paragraph_strip.lower().startswith('figure')
539
 
540
  if not starts_with_table_or_figure:
 
541
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
542
 
543
  for sentence in sentences:
 
547
  matches = table_ref_pattern.finditer(sentence)
548
  for match in matches:
549
  ref = match.group()
 
550
  text_before = sentence[:match.start()].strip()
 
 
551
  is_sentence_start = text_before == ""
552
 
 
553
  if is_sentence_start and not ref.startswith('Table'):
554
  incorrect_references.append({
555
  'incorrect_ref': ref,
 
569
  matches = figure_ref_pattern.finditer(sentence)
570
  for match in matches:
571
  ref = match.group()
 
572
  text_before = sentence[:match.start()].strip()
 
 
573
  is_sentence_start = text_before == ""
574
 
 
575
  if is_sentence_start and not ref.startswith('Figure'):
576
  incorrect_references.append({
577
  'incorrect_ref': ref,
 
587
  'issue': "Figure reference within sentence should be lowercase"
588
  })
589
 
590
+ return DocumentCheckResult(success=len(incorrect_references) == 0, issues=incorrect_references)
 
591
 
592
  @profile_performance
593
+ def document_title_check(self, doc_path: str, doc_type: str) -> DocumentCheckResult:
594
  """Check for correct formatting of document titles."""
595
  try:
 
596
  if isinstance(doc_path, (str, bytes, io.BytesIO)):
597
  doc = Document(doc_path)
598
  else:
 
600
  success=False,
601
  issues=[{'error': 'Invalid document input type'}]
602
  )
 
 
 
603
 
604
+ incorrect_titles = []
605
  formatting_rules = {
606
  "Advisory Circular": {"italics": True, "quotes": False},
607
  "Airworthiness Criteria": {"italics": False, "quotes": True},
 
615
  "Technical Standard Order": {"italics": False, "quotes": True},
616
  "Other": {"italics": False, "quotes": False}
617
  }
618
+
619
  if doc_type not in formatting_rules:
620
  self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
621
  return DocumentCheckResult(success=True, issues=[])
622
+
623
  required_format = formatting_rules[doc_type]
624
  ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')
625
+
626
  for paragraph in doc.paragraphs:
627
  text = paragraph.text
628
  matches = ac_pattern.finditer(text)
629
+
630
  for match in matches:
 
631
  title_text = match.group(2).strip()
632
  title_start = match.start(2)
 
633
  title_in_quotes = any(q in title_text for q in ['"', "'", '"', '"', ''', '''])
634
+
635
  title_is_italicized = False
636
  current_pos = 0
637
  for run in paragraph.runs:
 
642
  title_is_italicized = run.italic
643
  break
644
  current_pos += run_length
645
+
646
  formatting_incorrect = False
647
  issue_message = []
648
+
649
  if required_format["italics"] and not title_is_italicized:
650
  formatting_incorrect = True
651
  issue_message.append("should be italicized")
652
  elif not required_format["italics"] and title_is_italicized:
653
  formatting_incorrect = True
654
  issue_message.append("should not be italicized")
655
+
656
  if required_format["quotes"] and not title_in_quotes:
657
  formatting_incorrect = True
658
  issue_message.append("should be in quotes")
659
  elif not required_format["quotes"] and title_in_quotes:
660
  formatting_incorrect = True
661
  issue_message.append("should not be in quotes")
662
+
663
  if formatting_incorrect:
664
  incorrect_titles.append({
665
  'text': title_text,
666
  'issue': ', '.join(issue_message),
667
  'sentence': text.strip()
668
  })
669
+
670
  return DocumentCheckResult(
671
  success=len(incorrect_titles) == 0,
672
  issues=incorrect_titles
673
  )
674
+
675
  except Exception as e:
676
  self.logger.error(f"Error in document_title_check: {e}")
677
  return DocumentCheckResult(
 
688
  incorrect_sentences = []
689
 
690
  for paragraph in doc:
 
691
  sentences = re.split(r'(?<=[.!?]) +', paragraph)
692
  for sentence in sentences:
693
  if sentence.endswith('..'):
 
705
 
706
  incorrect_spacing = []
707
 
 
708
  patterns = [
709
+ (re.compile(r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)', re.IGNORECASE),
710
+ "Missing space between document type and number"),
711
+ (re.compile(r'(?<!\s)(§|§§)(\d+\.\d+)', re.IGNORECASE),
712
+ "Missing space after section symbol (Β§)"),
713
+ (re.compile(r'(?<!\s)Part(\d+)', re.IGNORECASE),
714
+ "Missing space between 'Part' and number"),
715
+ (re.compile(r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))', re.IGNORECASE),
716
+ "Missing space before paragraph indication"),
717
+ (re.compile(r'\s{2,}'),
718
+ "Double spaces between words")
719
  ]
720
 
721
  for paragraph in doc:
 
743
  for paragraph in doc:
744
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
745
  for sentence in sentences:
 
746
  defined_matches = re.findall(r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)', sentence)
747
  for full_term, acronym in defined_matches:
748
  if acronym not in abbreviations:
749
  abbreviations[acronym] = {"full_term": full_term.strip(), "defined": True}
750
 
 
751
  for acronym, data in abbreviations.items():
752
  full_term = data["full_term"]
753
  if full_term in sentence:
 
754
  if data["defined"]:
755
+ data["defined"] = False
756
  else:
 
757
  issues.append({
758
  'full_term': full_term,
759
  'acronym': acronym,
760
  'sentence': sentence.strip()
761
  })
762
 
763
+ return DocumentCheckResult(success=len(issues) == 0, issues=issues)
 
 
764
 
765
  @profile_performance
766
  def check_date_formats(self, doc: List[str]) -> DocumentCheckResult:
767
+ """Check for inconsistent date formats."""
768
  if not self.validate_input(doc):
769
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
770
 
771
  date_issues = []
 
 
772
  ignore_patterns = [
773
+ r'\bAD \d{4}-\d{2}-\d{2}\b',
774
+ r'\bSWPM \d{2}-\d{2}-\d{2}\b',
775
+ r'\bAMM \d{2}-\d{2}-\d{2}\b',
776
+ r'\bSOPM \d{2}-\d{2}-\d{2}\b',
777
+ r'\b[A-Z]{2,4} \d{2}-\d{2}-\d{2}\b'
778
  ]
779
 
 
780
  ignore_regex = '|'.join(ignore_patterns)
781
  ignore_pattern = re.compile(ignore_regex)
782
 
 
783
  correct_date_pattern = re.compile(r'\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b')
784
 
 
785
  date_patterns = [
786
+ (re.compile(r'(?<![\w/-])\d{1,2}/\d{1,2}/\d{2,4}(?![\w/-])'),
787
+ "Use 'Month Day, Year' format instead of 'MM/DD/YYYY'"),
788
+ (re.compile(r'(?<![\w/-])\d{1,2}-\d{1,2}-\d{2,4}(?![\w/-])'),
789
+ "Use 'Month Day, Year' format instead of 'MM-DD-YYYY'"),
790
+ (re.compile(r'(?<![\w/-])\d{4}-\d{1,2}-\d{1,2}(?![\w/-])'),
791
+ "Use 'Month Day, Year' format instead of 'YYYY-MM-DD'")
792
  ]
793
 
794
  for paragraph in doc:
795
  sentences = re.split(r'(?<=[.!?])\s+', paragraph)
796
  for sentence in sentences:
 
797
  ignored_matches = list(ignore_pattern.finditer(sentence))
798
  working_sentence = sentence
799
 
 
800
  for match in reversed(ignored_matches):
801
  start, end = match.span()
802
  working_sentence = working_sentence[:start] + 'X' * (end - start) + working_sentence[end:]
803
 
 
804
  for pattern, issue in date_patterns:
805
  matches = pattern.finditer(working_sentence)
806
  for match in matches:
 
807
  original_date = sentence[match.start():match.end()]
808
  date_issues.append({
809
  'date': original_date,
 
811
  'sentence': sentence.strip()
812
  })
813
 
814
+ return DocumentCheckResult(success=len(date_issues) == 0, issues=date_issues)
 
815
 
816
  @profile_performance
817
  def check_placeholders(self, doc: List[str]) -> DocumentCheckResult:
 
836
  'sentence': sentence.strip()
837
  })
838
 
839
+ return DocumentCheckResult(success=len(issues) == 0, issues=issues)
 
 
840
 
841
  def run_all_checks(self, doc_path: str, doc_type: str, template_type: Optional[str] = None) -> Dict[str, DocumentCheckResult]:
842
+ """Run all document checks."""
 
 
 
 
 
 
 
 
 
 
843
  # Read the document
844
  doc = self.extract_paragraphs(doc_path)
845
 
846
+ # Get configuration flags
847
  checks_config = self.config_manager.config['document_types'].get(doc_type, {})
848
  skip_title_check = checks_config.get('skip_title_check', False)
849
 
850
+ # Run all checks
851
  results = {}
852
  results['heading_title_check'] = self.heading_title_check(doc, doc_type)
853
  results['heading_title_period_check'] = self.heading_title_period_check(doc, doc_type)
 
869
 
870
  return results
871
 
872
+ def format_markdown_results(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
873
+ """Format check results into a Markdown string for Gradio display."""
 
 
 
 
 
 
 
874
  output = []
875
 
 
876
  current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
877
  output.extend([
878
  f"# Document Check Results - {current_time}",
 
880
  "---\n"
881
  ])
882
 
 
883
  total_issues = sum(1 for r in results.values() if not r.success)
884
 
885
  if total_issues == 0:
 
888
 
889
  output.append(f"❗ Found issues in {total_issues} check categories\n")
890
 
 
891
  check_categories = {
892
+ 'heading_title_check': {'title': '📋 Required Headings', 'priority': 1},
893
+ 'heading_title_period_check': {'title': 'πŸ” Heading Period Usage', 'priority': 1},
894
+ 'acronym_check': {'title': 'πŸ“ Acronym Definitions', 'priority': 2},
895
+ 'terminology_check': {'title': '📖 Terminology Usage', 'priority': 2},
896
+ 'section_symbol_usage_check': {'title': '§ Section Symbol Usage', 'priority': 2},
897
+ 'caption_check_table': {'title': '📊 Table Captions', 'priority': 3},
898
+ 'caption_check_figure': {'title': '🖼️ Figure Captions', 'priority': 3},
899
+ 'table_figure_reference_check': {'title': '🔗 Table/Figure References', 'priority': 3},
900
+ 'document_title_check': {'title': '📑 Document Title Format', 'priority': 1},
901
+ 'double_period_check': {'title': '⚑ Double Periods', 'priority': 4},
902
+ 'spacing_check': {'title': '⌨️ Spacing Issues', 'priority': 4},
903
+ 'abbreviation_usage_check': {'title': '📎 Abbreviation Usage', 'priority': 3},
904
+ 'date_formats_check': {'title': '📅 Date Formats', 'priority': 3},
905
+ 'placeholders_check': {'title': '🚩 Placeholder Content', 'priority': 1}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
906
  }
907
 
 
908
  sorted_checks = sorted(
909
  [(name, result) for name, result in results.items()],
910
  key=lambda x: check_categories.get(x[0], {'priority': 999})['priority']
911
  )
912
 
 
913
  for check_name, result in sorted_checks:
914
  if not result.success:
915
  category = check_categories.get(check_name, {'title': check_name.replace('_', ' ').title()})
 
917
  output.append(f"### {category['title']}")
918
 
919
  if isinstance(result.issues, list):
920
+ for issue in result.issues[:5]:
921
  if isinstance(issue, dict):
 
922
  for key, value in issue.items():
923
  if isinstance(value, list):
924
  output.extend([f"- {item}" for item in value])
 
927
  else:
928
  output.append(f"- {issue}")
929
 
 
930
  if len(result.issues) > 5:
931
  output.append(f"\n*...and {len(result.issues) - 5} more similar issues*")
932
 
933
+ output.append("")
934
 
 
935
  output.extend([
936
  "## πŸ“‹ Summary and Recommendations",
937
  "",
 
950
 
951
  return "\n".join(output)
952
 
953
+ def create_interface():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
954
 
955
+ def process_document(file_obj, doc_type: str, template_type: Optional[str] = None) -> str:
956
+ """Process document and run all checks."""
957
+ try:
958
+ checker = FAADocumentChecker()
959
+
960
+ if isinstance(file_obj, bytes):
961
+ file_obj = io.BytesIO(file_obj)
962
+
963
+ results = checker.run_all_checks(file_obj, doc_type, template_type)
964
+ return format_markdown_results(results, doc_type)
965
+
966
+ except Exception as e:
967
+ logging.error(f"Error processing document: {str(e)}")
968
+ traceback.print_exc()
969
+ return f"""
970
+ # ❌ Error Processing Document
971
 
972
+ **Error Details:** {str(e)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
973
 
974
+ Please ensure:
975
+ 1. The file is a valid .docx document
976
+ 2. The file is not corrupted or password protected
977
+ 3. The file is properly formatted
 
 
 
 
 
 
978
 
979
+ Try again after checking these issues. If the problem persists, contact support.
980
+ """
981
 
982
def create_interface():
    """Build the Gradio Blocks interface for the document checker.

    The layout has an input column (file upload, document type, template
    type, submit button, quick tips) next to a results column, followed by
    static notes. The template selector is shown only while the document
    type is "Advisory Circular".

    Returns:
        The assembled ``gr.Blocks`` app. It is not launched here.
    """
    advisory_circular = "Advisory Circular"
    default_doc_type = advisory_circular

    document_types = [
        "Advisory Circular",
        "Airworthiness Criteria",
        "Deviation Memo",
        "Exemption",
        "Federal Register Notice",
        "Order",
        "Policy Statement",
        "Rule",
        "Special Condition",
        "Technical Standard Order",
        "Other"
    ]

    template_types = ["Short AC template AC", "Long AC template AC"]

    custom_css = """
    .gradio-container {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .container {
        max-width: 900px;
        margin: auto;
    }
    .alert {
        padding: 1rem;
        margin-bottom: 1rem;
        border-radius: 0.5rem;
        background-color: #f8f9fa;
        border: 1px solid #dee2e6;
    }
    .results-panel {
        max-height: 800px;
        overflow-y: auto;
        padding: 1rem;
        background-color: #ffffff;
        border-radius: 0.5rem;
        border: 1px solid #e9ecef;
        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
    }
    .info-box {
        background-color: #e7f3ff;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
    }
    .warning-box {
        background-color: #fff3cd;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
    }
    """

    with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
        gr.Markdown(
            """
            # πŸ“‘ Document Checker Tool

            ### Purpose
            This tool checks Word documents for compliance with U.S. federal documentation standards.

            ### Features
            - βœ“ Heading format and structure checks
            - βœ“ Terminology and style verification
            - βœ“ Figure and table caption validation
            - βœ“ Section symbol and abbreviation usage
            - βœ“ Date format standardization

            ### Supported Document Types
            - Advisory Circulars
            - Orders
            - Policy Statements
            - Federal Register Notices
            - Technical Standard Orders
            - And more...
            """
        )

        with gr.Row():
            # Left column: inputs and usage tips.
            with gr.Column(scale=1):
                with gr.Group():
                    file_input = gr.File(
                        label="πŸ“Ž Upload Word Document (.docx)",
                        file_types=[".docx"],
                        type="binary",
                        elem_classes="file-input"
                    )

                    doc_type = gr.Dropdown(
                        choices=document_types,
                        label="πŸ“‹ Document Type",
                        value=default_doc_type,
                        info="Select the type of document you're checking",
                        elem_classes="doc-type-select"
                    )

                    template_type = gr.Radio(
                        choices=template_types,
                        label="πŸ“‘ Template Type",
                        # The change handler below only runs after the user
                        # edits the dropdown, so the initial visibility has to
                        # agree with the dropdown's default value.
                        visible=default_doc_type == advisory_circular,
                        info="Only applicable for Advisory Circulars",
                        elem_classes="template-type-select"
                    )

                    submit_btn = gr.Button(
                        "πŸ” Check Document",
                        variant="primary",
                        elem_classes="submit-button"
                    )

                with gr.Group():
                    gr.Markdown(
                        """
                        ### πŸ“Œ Quick Tips
                        1. Ensure document is clean (no track changes)
                        2. Save any pending changes before upload
                        3. Check document type selection
                        4. Review results in priority order
                        """,
                        elem_classes="tips-section"
                    )

            # Right column: rendered check results.
            with gr.Column(scale=2):
                results = gr.Markdown(
                    label="Check Results",
                    value="Results will appear here after processing...",
                    elem_classes=["results-panel"]
                )

        def update_template_visibility(selected_doc_type):
            """Show the template selector only for Advisory Circulars."""
            return gr.update(visible=selected_doc_type == advisory_circular)

        doc_type.change(
            fn=update_template_visibility,
            inputs=[doc_type],
            outputs=[template_type]
        )

        submit_btn.click(
            fn=process_document,
            inputs=[file_input, doc_type, template_type],
            outputs=[results]
        )

        gr.Markdown(
            """
            ### πŸ“Œ Important Notes
            - This tool is in development; you may encounter false positives
            - For questions or feedback, contact Eric Putnam
            - Results are not stored or saved

            ### πŸ”‘ Key Benefits
            - Saves time on manual document review
            - Ensures consistency across documents
            - Helps maintain compliance with standards
            - Identifies common issues early

            ### πŸ’‘ Tips for Best Results
            1. Address high-priority issues first
            2. Use search/replace for consistent fixes
            3. Re-run checks after making changes
            4. Keep your document templates updated
            """
        )

    return demo
1151
+
1152
def format_markdown_results(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
    """Render the outcome of all checks as Markdown for the Gradio results panel.

    Args:
        results: Mapping of check name to its result. Only each result's
            ``success`` and ``issues`` attributes are read.
        doc_type: Document type shown in the report header.

    Returns:
        A Markdown report: a timestamped header, then either a success note
        or one section per failed check (ordered by priority, at most five
        issues listed each) followed by a fixed summary.
    """
    max_listed = 5          # issues shown per check before truncating
    unknown_priority = 999  # checks missing from the table sort last

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lines = [
        f"# Document Check Results - {timestamp}",
        f"## Document Type: {doc_type}",
        "---\n",
    ]

    failed = [(name, outcome) for name, outcome in results.items() if not outcome.success]

    if not failed:
        lines.append("βœ… **All checks passed successfully!**\n")
        return "\n".join(lines)

    lines.append(f"❗ Found issues in {len(failed)} check categories\n")

    # check name -> (section title, display priority; lower comes first)
    catalog = {
        'heading_title_check': ('πŸ“‹ Required Headings', 1),
        'heading_title_period_check': ('πŸ” Heading Period Usage', 1),
        'acronym_check': ('πŸ“ Acronym Definitions', 2),
        'terminology_check': ('πŸ“– Terminology Usage', 2),
        'section_symbol_usage_check': ('Β§ Section Symbol Usage', 2),
        'caption_check_table': ('πŸ“Š Table Captions', 3),
        'caption_check_figure': ('πŸ–ΌοΈ Figure Captions', 3),
        'table_figure_reference_check': ('πŸ”— Table/Figure References', 3),
        'document_title_check': ('πŸ“‘ Document Title Format', 1),
        'double_period_check': ('⚑ Double Periods', 4),
        'spacing_check': ('⌨️ Spacing Issues', 4),
        'abbreviation_usage_check': ('πŸ“Ž Abbreviation Usage', 3),
        'date_formats_check': ('πŸ“… Date Formats', 3),
        'placeholders_check': ('🚩 Placeholder Content', 1),
    }

    def priority_of(entry):
        """Return the display priority for a (check name, result) pair."""
        check_name = entry[0]
        if check_name in catalog:
            return catalog[check_name][1]
        return unknown_priority

    # Stable sort: checks sharing a priority keep their order from `results`.
    failed.sort(key=priority_of)

    for check_name, outcome in failed:
        if check_name in catalog:
            title = catalog[check_name][0]
        else:
            title = check_name.replace('_', ' ').title()
        lines.append(f"### {title}")

        found = outcome.issues
        if isinstance(found, list):
            for entry in found[:max_listed]:
                if not isinstance(entry, dict):
                    lines.append(f"- {entry}")
                    continue
                for label, detail in entry.items():
                    if isinstance(detail, list):
                        # List values are expanded one bullet per item,
                        # without repeating the key.
                        for item in detail:
                            lines.append(f"- {item}")
                    else:
                        lines.append(f"- {label}: {detail}")

            hidden = len(found) - max_listed
            if hidden > 0:
                lines.append(f"\n*...and {hidden} more similar issues*")

        lines.append("")

    lines += [
        "## πŸ“‹ Summary and Recommendations",
        "",
        "### Priority Order for Fixes:",
        "1. πŸ”΄ Critical: Heading formats, required content, and document structure",
        "2. 🟑 Important: Terminology, acronyms, and references",
        "3. 🟒 Standard: Formatting, spacing, and style consistency",
        "",
        "### Next Steps:",
        "1. Address issues in priority order",
        "2. Use search/replace for consistent fixes",
        "3. Re-run checker after making changes",
        "4. Update your document template if needed",
        "",
    ]

    return "\n".join(lines)
1232
+
1233
# Script entry point: configure logging, then build and serve the interface
if __name__ == "__main__":
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)

    launch_options = {
        "share": False,            # switch to True to generate a public link
        "server_name": "0.0.0.0",  # allows external access
        "server_port": 7860,       # default Gradio port
        "debug": True,
    }

    demo = create_interface()
    demo.launch(**launch_options)