Hoctar77 committed on
Commit
dbf4df0
·
verified ·
1 Parent(s): b8f25d8

January 2025 updates

Files changed (1)
  1. app.py +997 -333
app.py CHANGED
@@ -9,11 +9,14 @@ import logging
9
  import traceback
10
  from datetime import datetime
11
  from enum import Enum, auto
12
- from typing import Dict, List, Any, Tuple, Optional, Pattern, Callable
13
  from dataclasses import dataclass
14
  from functools import wraps
15
  from abc import ABC, abstractmethod
16
  # import tempfile # For creating temporary files
17
 
18
  # Third-party imports
19
  import gradio as gr
@@ -342,285 +345,34 @@ class DocumentCheckerConfig:
342
  Returns:
343
  Dict[str, List[PatternConfig]]: Dictionary of pattern configurations by category
344
  """
345
- patterns = {
346
- 'terminology': [
347
- PatternConfig(
348
- pattern=r'\btitle 14 of the Code of Federal Regulations \(14 CFR\)\b',
349
- description="Ignore 'title 14 of the Code of Federal Regulations (14 CFR)'",
350
- is_error=False # Set to False to ignore this phrase
351
- ),
352
- PatternConfig(
353
- pattern=r'\btitle 14, Code of Federal Regulations \(14 CFR\)\b',
354
- description="Ignore 'title 14, Code of Federal Regulations (14 CFR)'",
355
- is_error=False
356
- ),
357
- PatternConfig(
358
- pattern=r'\btitle 49 of the United States Code \(49 U.S.C.\)\b',
359
- description="Ignore 'title 49 of the United States Code (49 U.S.C.)'",
360
- is_error=False
361
- ),
362
- PatternConfig(
363
- pattern=r'\btitle 49, United States Code \(49 U.S.C.\)\b',
364
- description="Ignore 'title 49, United States Code (49 U.S.C.)'",
365
- is_error=False
366
- ),
367
- PatternConfig(
368
- pattern=r'\bAD Compliance Team \(AD CRT\)\b',
369
- description="Ignore 'AD Compliance Team (AD CRT)'",
370
- is_error=False
371
- ),
372
- PatternConfig(
373
- pattern=r'\bUSC\b',
374
- description="USC should be U.S.C.", # Per GPO Style Manual
375
- is_error=True,
376
- replacement="U.S.C."
377
- ),
378
- PatternConfig(
379
- pattern=r'\bCFR Part\b',
380
- description="CFR Part should be CFR part (lowercase)", # Per FAA Order 1320.46
381
- is_error=True,
382
- replacement="CFR part"
383
- ),
384
- PatternConfig(
385
- pattern=r'\bC\.F\.R\.\b',
386
- description="C.F.R. should be CFR", # GPO Style Manual
387
- is_error=True,
388
- replacement="CFR"
389
- ),
390
- PatternConfig(
391
- pattern=r'\bWe\b',
392
- description="'We' should be 'The FAA'",
393
- is_error=True,
394
- replacement="The FAA"
395
- ),
396
- PatternConfig(
397
- pattern=r'\bwe\b',
398
- description="'we' should be 'the FAA'",
399
- is_error=True,
400
- replacement="the FAA"
401
- ),
402
- PatternConfig(
403
- pattern=r'\bcancelled\b',
404
- description="'cancelled' should be 'canceled'", # Per GPO Style Manual
405
- is_error=True,
406
- replacement="canceled"
407
- ),
408
- PatternConfig(
409
- pattern=r'\bshall\b',
410
- description="'shall' should be 'must'", # Per FAA Order 1320.46
411
- is_error=True,
412
- replacement="must"
413
- ),
414
- PatternConfig(
415
- pattern=r'\b\&\b',
416
- description="'&' should be 'and'", # Per April 17, 2024 email from Judith Watson: use 'and' instead of an ampersand
417
- is_error=True,
418
- replacement="and"
419
- ),
420
- PatternConfig(
421
- pattern=r'\bflight crew\b',
422
- description="'flight crew' should be 'flightcrew'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
423
- is_error=True,
424
- replacement="flightcrew"
425
- ),
426
- PatternConfig(
427
- pattern=r'\bchairman\b',
428
- description="'chairman' should be 'chair'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
429
- is_error=True,
430
- replacement="chair"
431
- ),
432
- PatternConfig(
433
- pattern=r'\bflagman\b',
434
- description="'flagman' should be 'flagger' or 'flagperson'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
435
- is_error=True,
436
- replacement="flagperson"
437
- ),
438
- PatternConfig(
439
- pattern=r'\bman\b',
440
- description="'man' should be 'individual' or 'person'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
441
- is_error=True,
442
- replacement="person"
443
- ),
444
- PatternConfig(
445
- pattern=r'\bmanmade\b',
446
- description="'manmade' should be 'personmade'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
447
- is_error=True,
448
- replacement="personmade"
449
- ),
450
- PatternConfig(
451
- pattern=r'\bmanpower\b',
452
- description="'manpower' should be 'labor force'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
453
- is_error=True,
454
- replacement="labor force"
455
- ),
456
- PatternConfig(
457
- pattern=r'\bnotice to airman\b',
458
- description="'notice to airman' should be 'notice to air missions'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
459
- is_error=True,
460
- replacement="notice to air missions"
461
- ),
462
- PatternConfig(
463
- pattern=r'\bnotice to airmen\b',
464
- description="'notice to airmen' should be 'notice to air missions'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
465
- is_error=True,
466
- replacement="notice to air missions"
467
- ),
468
- PatternConfig(
469
- pattern=r'\bcockpit\b',
470
- description="'cockpit' should be 'flight deck'", # Per AIR-600 Quick Reference Guide for Authors, Reviewers, and Writers/Editors
471
- is_error=True,
472
- replacement="flight deck"
473
- ),
474
- PatternConfig(
475
- pattern=r'\bA321 neo\b',
476
- description="'A321 neo' should be 'A321neo'", # Per TCDS
477
- is_error=True,
478
- replacement="A321neo"
479
- )
480
- ],
481
- 'section_symbol': [
482
- PatternConfig(
483
- pattern=r'^§',
484
- description="Don't start a sentence with the section symbol. Write out 'Section'",
485
- is_error=True
486
- ),
487
- PatternConfig(
488
- pattern=r'\b14 CFR §\s*\d+\.\d+\b',
489
- description="14 CFR should not use section symbol",
490
- is_error=True
491
- ),
492
- PatternConfig(
493
- pattern=r'§\s*\d+\.\d+\s+(?:and|or)\s+\d+\.\d+',
494
- description="Missing section symbol in multiple sections",
495
- is_error=True
496
- ),
497
- PatternConfig(
498
- pattern=r'§\s*\d+\.\d+\s+through\s+\d+\.\d+',
499
- description="Missing section symbol in range of sections",
500
- is_error=True
501
- ),
502
- PatternConfig(
503
- pattern=r'§\s*\d+\.\d+\s+or\s+§?\s*\d+\.\d+',
504
- description="Inconsistent section symbol usage with 'or'",
505
- is_error=True
506
- )
507
- ],
508
- 'spacing': [
509
- PatternConfig(
510
- pattern=r'([^\s]+)[ ]{2,}([^\s]+)', # Capture words before and after double space
511
- description="Remove double spacing between '{0}' and '{1}'",
512
- is_error=True
513
- ),
514
- PatternConfig(
515
- pattern=r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*[A-Z]?)', # Capture doc type and number
516
- description="Add space between '{0}' and '{1}'",
517
- is_error=True
518
- ),
519
- PatternConfig(
520
- pattern=r'(§|§§)(\d+\.\d+)', # Removed (?<!\s) to catch all section symbols
521
- description="Add space after '{0}' before '{1}'",
522
- is_error=True
523
- ),
524
- PatternConfig(
525
- pattern=r'(?<!\s)(Part)(\d+)', # Capture 'Part' and number
526
- description="Add space between '{0}' and '{1}'",
527
- is_error=True
528
- )
529
- ],
530
- 'dates': [
531
- PatternConfig(
532
- pattern=r'(?<![\w/-])\d{1,2}/\d{1,2}/\d{2,4}(?![\w/-])',
533
- description="Use 'Month Day, Year' format instead of MM/DD/YYYY",
534
- is_error=True
535
- ),
536
- PatternConfig(
537
- pattern=r'(?<![\w/-])\d{1,2}-\d{1,2}-\d{2,4}(?![\w/-])',
538
- description="Use 'Month Day, Year' format instead of MM-DD-YYYY",
539
- is_error=True
540
- ),
541
- PatternConfig(
542
- pattern=r'(?<![\w/-])\d{4}-\d{1,2}-\d{1,2}(?![\w/-])',
543
- description="Use 'Month Day, Year' format instead of YYYY-MM-DD",
544
- is_error=True
545
- )
546
- ],
547
- 'placeholders': [
548
- PatternConfig(
549
- pattern=r'\bTBD\b',
550
- description="Remove TBD placeholder",
551
- is_error=True
552
- ),
553
- PatternConfig(
554
- pattern=r'\bTo be determined\b',
555
- description="Remove 'To be determined' placeholder",
556
- is_error=True
557
- ),
558
- PatternConfig(
559
- pattern=r'\bTo be added\b',
560
- description="Remove 'To be added' placeholder",
561
- is_error=True
562
- )
563
- ],
564
- 'reference_terms': [
565
- PatternConfig(
566
- pattern=r'\babove\b',
567
- description="Avoid using 'above' for references",
568
- is_error=True
569
- ),
570
- PatternConfig(
571
- pattern=r'\bbelow\b',
572
- description="Avoid using 'below' for references",
573
- is_error=True
574
- ),
575
- PatternConfig(
576
- pattern=r'(?:^|(?<=[.!?]\s))There\s+(?:is|are)\b',
577
- description="Avoid starting sentences with 'There is/are'",
578
- is_error=True
579
- )
580
- ],
581
- 'periods': [
582
- PatternConfig(
583
- pattern=r'\.\.',
584
- description="Remove double periods",
585
- is_error=True
586
- )
587
- ],
588
- 'table_figure_references': [
589
- PatternConfig(
590
- pattern=r'(?<!^)(?<![.!?])\s+[T]able\s+\d+(?:-\d+)?',
591
- description="Table reference within sentence should be lowercase",
592
- is_error=True
593
- ),
594
- PatternConfig(
595
- pattern=r'(?<!^)(?<![.!?])\s+[F]igure\s+\d+(?:-\d+)?',
596
- description="Figure reference within sentence should be lowercase",
597
- is_error=True
598
- ),
599
- PatternConfig(
600
- pattern=r'^[t]able\s+\d+(?:-\d+)?',
601
- description="Table reference at start of sentence should be capitalized",
602
- is_error=True
603
- ),
604
- PatternConfig(
605
- pattern=r'^[f]igure\s+\d+(?:-\d+)?',
606
- description="Figure reference at start of sentence should be capitalized",
607
- is_error=True
608
- )
609
- ],
610
- 'parentheses': [
611
- PatternConfig(
612
- pattern=r'\([^)]*$', # Finds opening parenthesis without closing
613
- description="Missing closing parenthesis",
614
- is_error=True
615
- ),
616
- PatternConfig(
617
- pattern=r'[^(]*\)', # Finds closing parenthesis without opening
618
- description="Missing opening parenthesis",
619
- is_error=True
620
- )
621
- ]
622
- }
623
- return patterns
624
 
625
  def profile_performance(func):
626
  """Decorator to profile function performance."""
@@ -667,7 +419,7 @@ class FAADocumentChecker(DocumentChecker):
667
 
668
  PREDEFINED_ACRONYMS = {
669
  'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
670
- 'MD', 'MIL', 'MO', 'No.', 'PDF', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
671
  'WA', 'XX', 'ZIP'
672
  }
673
 
@@ -903,9 +655,13 @@ class FAADocumentChecker(DocumentChecker):
903
 
904
  @profile_performance
905
  def acronym_check(self, doc: List[str]) -> DocumentCheckResult:
 
906
  if not self.validate_input(doc):
907
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
908
 
909
  # Common words that might appear in uppercase but aren't acronyms
910
  heading_words = self.config_manager.config.get('heading_words', self.HEADING_WORDS)
911
 
@@ -932,12 +688,13 @@ class FAADocumentChecker(DocumentChecker):
932
  defined_acronyms = {} # Stores definition info
933
  used_acronyms = set() # Stores acronyms used after definition
934
  reported_acronyms = set() # Stores acronyms that have already been noted as issues
935
- issues = []
936
 
937
  # Patterns
938
  defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
939
  acronym_pattern = re.compile(r'(?<!\()\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
940
 
941
  for paragraph in doc:
942
  # Skip lines that appear to be headings
943
  words = paragraph.strip().split()
@@ -973,9 +730,10 @@ class FAADocumentChecker(DocumentChecker):
973
  if any(start <= start_pos <= end for start, end in ignored_spans):
974
  continue
975
 
976
- # Skip predefined acronyms and other checks
977
  if (acronym in predefined_acronyms or
978
  acronym in heading_words or
 
979
  any(not c.isalpha() for c in acronym) or
980
  len(acronym) > 10):
981
  continue
@@ -1674,12 +1432,16 @@ class FAADocumentChecker(DocumentChecker):
1674
 
1675
  # Define order of checks for better organization
1676
  check_sequence = [
 
1677
  ('heading_title_check', lambda: self.heading_title_check(doc, doc_type)),
1678
  ('heading_title_period_check', lambda: self.heading_title_period_check(doc, doc_type)),
1679
  ('terminology_check', lambda: self.check_terminology(doc)),
1680
  ('acronym_check', lambda: self.acronym_check(doc)),
1681
  ('acronym_usage_check', lambda: self.acronym_usage_check(doc)),
1682
  ('section_symbol_usage_check', lambda: self.check_section_symbol_usage(doc)),
1683
  ('date_formats_check', lambda: self.check_date_formats(doc)),
1684
  ('placeholders_check', lambda: self.check_placeholders(doc)),
1685
  ('document_title_check', lambda: self.document_title_check(doc_path, doc_type) if not skip_title_check else DocumentCheckResult(success=True, issues=[])),
@@ -2243,6 +2005,770 @@ class FAADocumentChecker(DocumentChecker):
2243
  issues=issues,
2244
  details=sentence_stats
2245
  )
2246
 
2247
  class DocumentCheckResultsFormatter:
2248
 
@@ -2385,6 +2911,50 @@ class DocumentCheckResultsFormatter:
2385
  'before': 'See AC 25.1309-1B, System Design and Analysis, for information on X.',
2386
  'after': 'See AC 25.1309-1B, <i>System Design and Analysis</i>, for information on X.'
2387
  }
2388
  }
2389
  }
2390
 
@@ -2472,15 +3042,24 @@ class DocumentCheckResultsFormatter:
2472
  return formatted_issues
2473
 
2474
  def _format_reference_issues(self, result: DocumentCheckResult) -> List[str]:
2475
- """Format reference-related issues with clear replacement instructions."""
2476
- output = []
2477
 
2478
- if result.issues:
2479
- for issue in result.issues:
2480
- if 'reference' in issue and 'correct_form' in issue:
2481
- output.append(f" • Replace '{issue['reference']}' with '{issue['correct_form']}'")
2482
 
2483
- return output
2484
 
2485
  def _format_standard_issue(self, issue: Dict[str, Any]) -> str:
2486
  """Format standard issues consistently."""
@@ -2753,19 +3332,38 @@ class DocumentCheckResultsFormatter:
2753
  output.extend(self._format_section_symbol_issues(result))
2754
  elif check_name == 'parentheses_check':
2755
  output.extend(self._format_parentheses_issues(result))
2756
- elif check_name == 'paragraph_length_check':
2757
- output.extend(self._format_paragraph_length_issues(result))
2758
- elif check_name == 'sentence_length_check':
2759
- formatted_issues = [self._format_standard_issue(issue) for issue in result.issues[:15]]
2760
- output.extend(formatted_issues)
2761
-
2762
- if len(result.issues) > 15:
2763
- output.append(f"\n ... and {len(result.issues) - 15} more similar issues.")
2764
  else:
2765
  formatted_issues = [self._format_standard_issue(issue) for issue in result.issues[:15]]
2766
  output.extend(formatted_issues)
2767
 
2768
- if len(result.issues) > 15:
2769
  output.append(f"\n ... and {len(result.issues) - 15} more similar issues.")
2770
 
2771
  return '\n'.join(output)
@@ -2788,33 +3386,29 @@ class DocumentCheckResultsFormatter:
2788
  except Exception as e:
2789
  print(f"Error saving report: {e}")
2790
 
2791
- def process_document(file_obj, doc_type: str, template_type: Optional[str] = None) -> str:
2792
- """Process document and run all checks."""
2793
- try:
2794
- print(f"Processing document at {time.time()}") # Debug print
2795
- checker = FAADocumentChecker()
2796
 
2797
- if isinstance(file_obj, bytes):
2798
- file_obj = io.BytesIO(file_obj)
2799
-
2800
- results = checker.run_all_checks(file_obj, doc_type, template_type)
2801
- return format_markdown_results(results, doc_type)
2802
 
2803
- except Exception as e:
2804
- logging.error(f"Error processing document: {str(e)}")
2805
- traceback.print_exc()
2806
- return f"""
2807
- # ❌ Error Processing Document
2808
-
2809
- **Error Details:** {str(e)}
2810
-
2811
- Please ensure:
2812
- 1. The file is a valid .docx document
2813
- 2. The file is not corrupted or password protected
2814
- 3. The file is properly formatted
2815
-
2816
- Try again after checking these issues. If the problem persists, contact support.
2817
- """
2818
 
2819
  def format_markdown_results(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
2820
  """Format check results into a Markdown string for Gradio display."""
@@ -2842,6 +3436,9 @@ def format_markdown_results(results: Dict[str, DocumentCheckResult], doc_type: s
2842
'acronym_check': {'title': '📝 Acronym Definitions', 'priority': 1},
2843
'acronym_usage_check': {'title': '📎 Acronym Usage', 'priority': 1},
2844
'section_symbol_usage_check': {'title': '§ Section Symbol Usage', 'priority': 2},
2845
'date_formats_check': {'title': '📅 Date Formats', 'priority': 2},
2846
  'placeholders_check': {'title': '🚩 Placeholder Content', 'priority': 2},
2847
'document_title_check': {'title': '📑 Document Title Format', 'priority': 2},
@@ -2852,7 +3449,8 @@ def format_markdown_results(results: Dict[str, DocumentCheckResult], doc_type: s
2852
  'double_period_check': {'title': '⚑ Double Periods', 'priority': 4},
2853
  'spacing_check': {'title': '⌨️ Spacing Issues', 'priority': 4},
2854
'paragraph_length_check': {'title': '📏 Paragraph Length', 'priority': 5},
2855
- 'sentence_length_check': {'title': '📏 Sentence Length', 'priority': 5}
 
2856
  }
2857
 
2858
  sorted_checks = sorted(
@@ -2948,6 +3546,60 @@ def create_interface():
2948
  title = parts[0].strip()
2949
  content = parts[1].strip()
2950
 
2951
  # Extract description and solution
2952
  description_parts = content.split('How to fix:', 1)
2953
  description = description_parts[0].strip()
@@ -2991,13 +3643,12 @@ def create_interface():
2991
  <h3 class="font-medium text-gray-800 mb-2">Issues found in your document:</h3>
2992
  <ul class="list-none space-y-2">
2993
  """
2994
- for issue in issues_match[:7]:
2995
- # Remove any existing bullet points from the issue text
2996
clean_issue = issue.strip().lstrip('•').strip()
2997
  issues_html_section += f"""
2998
<li class="text-gray-600 ml-4">• {clean_issue}</li>
2999
  """
3000
- if len(issues_match) > 7:
3001
  issues_html_section += f"""
3002
  <li class="text-gray-500 italic ml-4">... and {len(issues_match) - 7} more similar issues.</li>
3003
  """
@@ -3028,7 +3679,21 @@ def create_interface():
3028
  </div>
3029
  """
3030
 
3031
- # Format summary section
3032
  summary_html = f"""
3033
  <div class="bg-white rounded-lg shadow-sm mb-6 overflow-hidden">
3034
  <div class="bg-gray-50 px-6 py-4 border-b">
@@ -3058,12 +3723,13 @@ def create_interface():
3058
  </div>
3059
  """
3060
 
3061
- # Final HTML with styling
3062
  full_html = f"""
3063
  <div class="mx-auto p-4" style="font-family: system-ui, -apple-system, sans-serif;">
3064
  <style>
3065
  .text-2xl {{ font-size: 1.5rem; line-height: 2rem; }}
3066
  .text-lg {{ font-size: 1.125rem; }}
 
3067
  .font-bold {{ font-weight: 700; }}
3068
  .font-semibold {{ font-weight: 600; }}
3069
  .font-medium {{ font-weight: 500; }}
@@ -3095,9 +3761,7 @@ def create_interface():
3095
  .overflow-hidden {{ overflow: hidden; }}
3096
  .list-none {{ list-style-type: none; }}
3097
  .space-y-4 > * + * {{ margin-top: 1rem; }}
3098
- .text-red-600 {{ color: #dc2626; }}
3099
- .text-amber-600 {{ color: #d97706; }}
3100
- .text-green-600 {{ color: #059669; }}
3101
  </style>
3102
  {header_html}
3103
  {issues_html}
 
9
  import traceback
10
  from datetime import datetime
11
  from enum import Enum, auto
12
+ from typing import Dict, List, Any, Tuple, Optional, Pattern, Callable, Set
13
  from dataclasses import dataclass
14
  from functools import wraps
15
  from abc import ABC, abstractmethod
16
  # import tempfile # For creating temporary files
17
+ import requests
18
+ from concurrent.futures import ThreadPoolExecutor, as_completed
19
+ from pathlib import Path
20
 
21
  # Third-party imports
22
  import gradio as gr
 
345
  Returns:
346
  Dict[str, List[PatternConfig]]: Dictionary of pattern configurations by category
347
  """
348
+ try:
349
+ # Get the directory containing the current file
350
+ current_dir = os.path.dirname(os.path.abspath(__file__))
351
+ patterns_file = os.path.join(current_dir, 'patterns.json')
352
+
353
+ # Load patterns from JSON file
354
+ with open(patterns_file, 'r') as f:
355
+ patterns_data = json.load(f)
356
+
357
+ # Convert JSON data to PatternConfig objects
358
+ patterns = {}
359
+ for category, pattern_list in patterns_data.items():
360
+ patterns[category] = [
361
+ PatternConfig(
362
+ pattern=p['pattern'],
363
+ description=p['description'],
364
+ is_error=p['is_error'],
365
+ replacement=p.get('replacement'),
366
+ keep_together=p.get('keep_together', False)
367
+ ) for p in pattern_list
368
+ ]
369
+
370
+ return patterns
371
+
372
+ except Exception as e:
373
+ self.logger.error(f"Error loading patterns: {e}")
374
+ # Return empty patterns dictionary if file loading fails
375
+ return {}
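The loader above assumes a patterns.json file sitting next to app.py, holding one list of rule objects per category with the keys that get mapped into PatternConfig (pattern, description, is_error, plus the optional replacement and keep_together). A minimal sketch of writing and reading back a file of that shape, using a single illustrative 'shall'/'must' rule:

import json
import os

# Hypothetical minimal patterns.json; the keys mirror what the loader maps into PatternConfig.
sample_patterns = {
    "terminology": [
        {
            "pattern": r"\bshall\b",
            "description": "'shall' should be 'must'",
            "is_error": True,
            "replacement": "must"
        }
    ]
}

patterns_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "patterns.json")
with open(patterns_file, "w") as f:
    json.dump(sample_patterns, f, indent=2)

# Read it back the same way the loader does.
with open(patterns_file, "r") as f:
    patterns_data = json.load(f)
print(patterns_data["terminology"][0]["replacement"])  # must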
376
 
377
  def profile_performance(func):
378
  """Decorator to profile function performance."""
 
419
 
420
  PREDEFINED_ACRONYMS = {
421
  'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
422
+ 'MD', 'MIL', 'MO', 'No.', 'PDF', 'SAE', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
423
  'WA', 'XX', 'ZIP'
424
  }
425
 
 
655
 
656
  @profile_performance
657
  def acronym_check(self, doc: List[str]) -> DocumentCheckResult:
658
+ """Check for acronyms and their definitions."""
659
  if not self.validate_input(doc):
660
  return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
661
 
662
+ # Load valid words
663
+ valid_words = self._load_valid_words()
664
+
665
  # Common words that might appear in uppercase but aren't acronyms
666
  heading_words = self.config_manager.config.get('heading_words', self.HEADING_WORDS)
667
 
 
688
  defined_acronyms = {} # Stores definition info
689
  used_acronyms = set() # Stores acronyms used after definition
690
  reported_acronyms = set() # Stores acronyms that have already been noted as issues
 
691
 
692
  # Patterns
693
  defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
694
  acronym_pattern = re.compile(r'(?<!\()\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
695
 
696
+ issues = []
697
+
698
  for paragraph in doc:
699
  # Skip lines that appear to be headings
700
  words = paragraph.strip().split()
 
730
  if any(start <= start_pos <= end for start, end in ignored_spans):
731
  continue
732
 
733
+ # Skip predefined acronyms, valid words, and other checks
734
  if (acronym in predefined_acronyms or
735
  acronym in heading_words or
736
+ acronym.lower() in valid_words or # Check against valid words list
737
  any(not c.isalpha() for c in acronym) or
738
  len(acronym) > 10):
739
  continue
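For context on the valid-words filter added above, the two regexes compiled earlier in acronym_check split definition sites from later uses: defined_pattern captures a spelled-out term followed by its parenthesized acronym, while acronym_pattern matches stand-alone uppercase runs that are not immediately preceded by an opening parenthesis. A small standalone sketch with an illustrative sentence:

import re

defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
acronym_pattern = re.compile(r'(?<!\()\b[A-Z]{2,}\b(?!\s*[:.]\s*)')

sample = "Federal Aviation Administration (FAA) rules apply; the FAA publishes updates."

# Definition site: ('Federal Aviation Administration', 'FAA')
print(defined_pattern.search(sample).groups())

# The parenthesized FAA is excluded by the lookbehind; only the later use is returned.
print(acronym_pattern.findall(sample))  # ['FAA']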
 
1432
 
1433
  # Define order of checks for better organization
1434
  check_sequence = [
1435
+ ('readability_check', lambda: self.check_readability(doc)),
1436
  ('heading_title_check', lambda: self.heading_title_check(doc, doc_type)),
1437
  ('heading_title_period_check', lambda: self.heading_title_period_check(doc, doc_type)),
1438
  ('terminology_check', lambda: self.check_terminology(doc)),
1439
  ('acronym_check', lambda: self.acronym_check(doc)),
1440
  ('acronym_usage_check', lambda: self.acronym_usage_check(doc)),
1441
  ('section_symbol_usage_check', lambda: self.check_section_symbol_usage(doc)),
1442
+ ('508_compliance_check', lambda: self.check_508_compliance(doc_path)),
1443
+ ('cross_references_check', lambda: self.check_cross_references(doc_path)),
1444
+ ('hyperlink_check', lambda: self.check_hyperlinks(doc)),
1445
  ('date_formats_check', lambda: self.check_date_formats(doc)),
1446
  ('placeholders_check', lambda: self.check_placeholders(doc)),
1447
  ('document_title_check', lambda: self.document_title_check(doc_path, doc_type) if not skip_title_check else DocumentCheckResult(success=True, issues=[])),
 
2005
  issues=issues,
2006
  details=sentence_stats
2007
  )
2008
+
2009
+ @profile_performance
2010
+ def check_508_compliance(self, doc_path: str) -> DocumentCheckResult:
2011
+ """
2012
+ Perform Section 508 compliance checks focusing on image alt text and heading structure.
2013
+ """
2014
+ try:
2015
+ doc = Document(doc_path)
2016
+ issues = []
2017
+ images_with_alt = 0
2018
+ heading_structure = {}
2019
+ heading_issues = [] # Separate list for heading-specific issues
2020
+ hyperlink_issues = [] # New list for hyperlink issues
2021
+
2022
+ # Image alt text check
2023
+ for shape in doc.inline_shapes:
2024
+ alt_text = None
2025
+ if hasattr(shape, '_inline') and hasattr(shape._inline, 'docPr'):
2026
+ docPr = shape._inline.docPr
2027
+ alt_text = docPr.get('descr') or docPr.get('title')
2028
+
2029
+ if alt_text:
2030
+ images_with_alt += 1
2031
+ else:
2032
+ issues.append({
2033
+ 'category': 'image_alt_text',
2034
+ 'message': 'Image is missing descriptive alt text.',
2035
+ 'context': 'Ensure all images have descriptive alt text.'
2036
+ })
2037
+
2038
+ # Enhanced heading structure check
2039
+ headings = []
2040
+
2041
+ for paragraph in doc.paragraphs:
2042
+ if paragraph.style.name.startswith('Heading'):
2043
+ try:
2044
+ level = int(paragraph.style.name.split()[-1])
2045
+ text = paragraph.text.strip()
2046
+
2047
+ if not text:
2048
+ continue
2049
+
2050
+ headings.append((text, level))
2051
+ heading_structure[level] = heading_structure.get(level, 0) + 1
2052
+
2053
+ except ValueError:
2054
+ continue
2055
+
2056
+ # Check heading hierarchy
2057
+ if headings:
2058
+ min_level = min(level for _, level in headings)
2059
+
2060
+ if min_level > 1:
2061
+ heading_issues.append({
2062
+ 'severity': 'error',
2063
+ 'type': 'missing_h1',
2064
+ 'message': 'Document should start with a Heading 1',
2065
+ 'context': f"First heading found is level {headings[0][1]}: '{headings[0][0]}'",
2066
+ 'recommendation': 'Add a Heading 1 at the start of the document'
2067
+ })
2068
+
2069
+ # Check for skipped levels
2070
+ previous_heading = None
2071
+ for text, level in headings:
2072
+ if previous_heading:
2073
+ prev_text, prev_level = previous_heading
2074
+
2075
+ # Only check for skipped levels when going deeper
2076
+ if level > prev_level + 1:
2077
+ missing_levels = list(range(prev_level + 1, level))
2078
+ heading_issues.append({
2079
+ 'severity': 'error',
2080
+ 'type': 'skipped_levels',
2081
+ 'message': f"Skipped heading level(s) {', '.join(map(str, missing_levels))} - Found H{level} '{text}' after H{prev_level} '{prev_text}'. Add H{prev_level + 1} before this section.",
2082
+ })
2083
+
2084
+ previous_heading = (text, level)
2085
+
2086
+ # Enhanced Hyperlink Accessibility Check
2087
+ for paragraph in doc.paragraphs:
2088
+ # Check both hyperlink fields and runs with hyperlink formatting
2089
+ hyperlinks = []
2090
+
2091
+ # Method 1: Check for hyperlink fields
2092
+ if hasattr(paragraph, '_element') and hasattr(paragraph._element, 'xpath'):
2093
+ hyperlinks.extend(paragraph._element.xpath('.//w:hyperlink'))
2094
+
2095
+ # Method 2: Check for hyperlink style runs
2096
+ for run in paragraph.runs:
2097
+ if hasattr(run, '_element') and hasattr(run._element, 'rPr'):
2098
+ if run._element.rPr is not None:
2099
+ if run._element.rPr.xpath('.//w:rStyle[@w:val="Hyperlink"]'):
2100
+ hyperlinks.append(run)
2101
+
2102
+ # Method 3: Check for direct hyperlink elements
2103
+ if hasattr(run, '_r'):
2104
+ if run._r.xpath('.//w:hyperlink'):
2105
+ hyperlinks.append(run)
2106
+
2107
+ # Process found hyperlinks
2108
+ for hyperlink in hyperlinks:
2109
+ # Extract link text based on element type
2110
+ if hasattr(hyperlink, 'text'): # For run objects
2111
+ link_text = hyperlink.text.strip()
2112
+ else: # For hyperlink elements
2113
+ link_text = ''.join([t.text for t in hyperlink.xpath('.//w:t')])
2114
+
2115
+ if not link_text: # Skip empty links
2116
+ continue
2117
+
2118
+ # Check for accessibility issues
2119
+ non_descriptive = [
2120
+ 'click here', 'here', 'link', 'this link', 'more',
2121
+ 'read more', 'learn more', 'click', 'see this',
2122
+ 'see here', 'go', 'url', 'this', 'page'
2123
+ ]
2124
+
2125
+ if any(phrase == link_text.lower() for phrase in non_descriptive):
2126
+ hyperlink_issues.append({
2127
+ 'category': 'hyperlink_accessibility',
2128
+ 'severity': 'warning',
2129
+ 'message': 'Non-descriptive hyperlink text detected',
2130
+ 'context': f'Link text: "{link_text}"',
2131
+ 'recommendation': 'Replace with descriptive text that indicates the link destination',
2132
+ 'user_message': f'Replace non-descriptive link text "{link_text}" with text that clearly indicates where the link will take the user'
2133
+ })
2134
+ elif len(link_text.strip()) < 4: # Check for very short link text
2135
+ hyperlink_issues.append({
2136
+ 'category': 'hyperlink_accessibility',
2137
+ 'severity': 'warning',
2138
+ 'message': 'Hyperlink text may be too short to be meaningful',
2139
+ 'context': f'Link text: "{link_text}"',
2140
+ 'recommendation': 'Use longer, more descriptive text that indicates the link destination',
2141
+ 'user_message': f'Link text "{link_text}" is too short - use descriptive text that clearly indicates the link destination'
2142
+ })
2143
+ elif link_text.lower().startswith(('http', 'www', 'ftp')):
2144
+ hyperlink_issues.append({
2145
+ 'category': 'hyperlink_accessibility',
2146
+ 'severity': 'warning',
2147
+ 'message': 'Raw URL used as link text',
2148
+ 'context': f'Link text: "{link_text}"',
2149
+ 'recommendation': 'Replace the URL with descriptive text that indicates the link destination',
2150
+ 'user_message': f'Replace the URL "{link_text}" with meaningful text that describes the link destination'
2151
+ })
2152
+
2153
+ # Add hyperlink issues to main issues list
2154
+ if hyperlink_issues:
2155
+ issues.extend(hyperlink_issues)
2156
+
2157
+ # Combine all issues
2158
+ if heading_issues:
2159
+ issues.extend([{
2160
+ 'category': '508_compliance_heading_structure',
2161
+ **issue
2162
+ } for issue in heading_issues])
2163
+
2164
+ # Enhanced details with heading structure information
2165
+ details = {
2166
+ 'total_images': len(doc.inline_shapes),
2167
+ 'images_with_alt': images_with_alt,
2168
+ 'heading_structure': {
2169
+ 'total_headings': len(headings),
2170
+ 'levels_found': dict(sorted(heading_structure.items())),
2171
+ 'hierarchy_depth': max(heading_structure.keys()) if heading_structure else 0,
2172
+ 'heading_sequence': [(text[:50] + '...' if len(text) > 50 else text, level)
2173
+ for text, level in headings],
2174
+ 'issues_found': len(heading_issues)
2175
+ },
2176
+ 'hyperlink_accessibility': { # New details section
2177
+ 'total_issues': len(hyperlink_issues),
2178
+ 'non_descriptive_links': sum(1 for issue in hyperlink_issues
2179
+ if 'Non-descriptive' in issue['message']),
2180
+ 'raw_urls': sum(1 for issue in hyperlink_issues
2181
+ if 'Raw URL' in issue['message'])
2182
+ }
2183
+ }
2184
+
2185
+ return DocumentCheckResult(
2186
+ success=len(issues) == 0,
2187
+ issues=issues,
2188
+ details=details
2189
+ )
2190
+
2191
+ except Exception as e:
2192
+ self.logger.error(f"Error during 508 compliance check: {str(e)}")
2193
+ return DocumentCheckResult(
2194
+ success=False,
2195
+ issues=[{
2196
+ 'category': 'error',
2197
+ 'message': f'Error performing 508 compliance check: {str(e)}'
2198
+ }]
2199
+ )
2200
+
2201
+ def _format_compliance_issues(self, result: DocumentCheckResult) -> List[str]:
2202
+ """Format compliance issues with clear, user-friendly descriptions."""
2203
+ formatted_issues = []
2204
+
2205
+ for issue in result.issues:
2206
+ if issue.get('category') == '508_compliance_heading_structure':
2207
+ # Existing heading structure formatting...
2208
+ message = issue.get('message', 'No description provided')
2209
+ context = issue.get('context', 'No context provided').strip()
2210
+ recommendation = issue.get('recommendation', 'No recommendation provided').strip()
2211
+ formatted_issues.append(
2212
+ f" • {message}. Context: {context}. Recommendation: {recommendation}"
2213
+ )
2214
+ elif issue.get('category') == 'image_alt_text':
2215
+ # Existing alt text formatting...
2216
+ formatted_issues.append(
2217
+ f" • {issue.get('message', 'No description provided')}. {issue.get('context', '')}"
2218
+ )
2219
+ elif issue.get('category') == 'hyperlink_accessibility':
2220
+ # Use the new user-friendly message
2221
+ formatted_issues.append(
2222
+ f" • {issue.get('user_message', issue.get('message', 'No description provided'))}"
2223
+ )
2224
+ elif 'context' in issue and issue['context'].startswith('Link text:'):
2225
+ # This catches the hyperlink issues that might not have the category set
2226
+ link_text = issue['context'].replace('Link text:', '').strip().strip('"')
2227
+ if any(phrase == link_text.lower() for phrase in ['here', 'click here', 'more', 'link']):
2228
+ f" • Replace non-descriptive link text \"{link_text}\" with text that clearly indicates where the link will take the user"
2229
+ f" β€’ Replace non-descriptive link text \"{link_text}\" with text that clearly indicates where the link will take the user"
2230
+ )
2231
+ elif link_text.lower().startswith(('http', 'www', 'ftp')):
2232
+ f" • Replace the URL \"{link_text}\" with meaningful text that describes the link destination"
2233
+ f" β€’ Replace the URL \"{link_text}\" with meaningful text that describes the link destination"
2234
+ )
2235
+ elif len(link_text) < 4:
2236
+ f" • Link text \"{link_text}\" is too short - use descriptive text that clearly indicates the link destination"
2237
+ f" β€’ Link text \"{link_text}\" is too short - use descriptive text that clearly indicates the link destination"
2238
+ )
2239
+ else:
2240
+ formatted_issues.append(f" • {issue.get('message', 'No description provided')} {issue['context']}")
2241
+ else:
2242
+ # Generic formatting for other issues
2243
+ message = issue.get('message', 'No description provided')
2244
+ context = issue.get('context', '').strip()
2245
+ formatted_issues.append(
2246
+ f" • {message} {context}"
2247
+ )
2248
+
2249
+ return formatted_issues
2250
+
2251
+ @profile_performance
2252
+ def check_hyperlinks(self, doc: List[str]) -> DocumentCheckResult:
2253
+ """
2254
+ Enhanced hyperlink checker that identifies potentially broken URLs.
2255
+
2256
+ Args:
2257
+ doc: List of document paragraphs.
2258
+
2259
+ Returns:
2260
+ DocumentCheckResult with any potentially broken links.
2261
+ """
2262
+ if not self.validate_input(doc):
2263
+ return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
2264
+
2265
+ issues = []
2266
+ checked_urls = set()
2267
+
2268
+ # URL pattern - matches http/https URLs
2269
+ url_pattern = re.compile(
2270
+ r'https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&//=]*'
2271
+ )
2272
+
2273
+ # Helper function to check a single URL
2274
+ def check_url(url):
2275
+ try:
2276
+ response = requests.head(url, timeout=5, allow_redirects=True, headers={'User-Agent': 'CheckerTool/1.0'})
2277
+ if response.status_code >= 400:
2278
+ return {
2279
+ 'url': url,
2280
+ 'message': f"Broken link: {url} (HTTP {response.status_code})"
2281
+ }
2282
+ except requests.RequestException:
2283
+ return {
2284
+ 'url': url,
2285
+ 'message': f"Check the link or internet connection: {url} (connection error)"
2286
+ }
2287
+ return None
2288
+
2289
+ # Extract and deduplicate URLs
2290
+ for paragraph in doc:
2291
+ urls = {match.group() for match in url_pattern.finditer(paragraph)}
2292
+ checked_urls.update(urls)
2293
+
2294
+ # Concurrently check URLs
2295
+ with ThreadPoolExecutor(max_workers=10) as executor:
2296
+ future_to_url = {executor.submit(check_url, url): url for url in checked_urls}
2297
+ for future in as_completed(future_to_url):
2298
+ issue = future.result()
2299
+ if issue:
2300
+ issues.append(issue)
2301
+
2302
+ return DocumentCheckResult(
2303
+ success=len(issues) == 0,
2304
+ issues=issues,
2305
+ details={
2306
+ 'total_urls_checked': len(checked_urls),
2307
+ 'broken_urls': len(issues)
2308
+ }
2309
+ )
2310
+
2311
+ def _load_valid_words(self) -> Set[str]:
2312
+ """
2313
+ Load valid English words from valid_words.txt file.
2314
+
2315
+ Returns:
2316
+ Set[str]: Set of valid English words in lowercase
2317
+ """
2318
+ try:
2319
+ # Get the directory containing the current file
2320
+ current_dir = os.path.dirname(os.path.abspath(__file__))
2321
+ words_file = os.path.join(current_dir, 'valid_words.txt')
2322
+
2323
+ # Load words from file
2324
+ with open(words_file, 'r') as f:
2325
+ words = {line.strip().lower() for line in f if line.strip()}
2326
+
2327
+ return words
2328
+
2329
+ except Exception as e:
2330
+ self.logger.warning(f"Error loading word list: {e}")
2331
+ return set() # Return empty set as fallback
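_load_valid_words assumes a plain-text valid_words.txt beside app.py with one word per line; entries are lowercased and blank lines skipped. A brief sketch using a placeholder three-word list rather than a real dictionary:

import os

# Placeholder list; the real valid_words.txt would carry a full English word list.
sample_words = ["plan", "part", "area"]

words_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "valid_words.txt")
with open(words_file, "w") as f:
    f.write("\n".join(sample_words))

# Same loading logic as _load_valid_words: strip, lowercase, skip blank lines.
with open(words_file, "r") as f:
    valid_words = {line.strip().lower() for line in f if line.strip()}

print("PLAN".lower() in valid_words)  # True, so an all-caps PLAN would be skipped by acronym_check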
2332
+
2333
+ @profile_performance
2334
+ def check_cross_references(self, doc_path: str) -> DocumentCheckResult:
2335
+ """
2336
+ Check for missing cross-referenced elements in the document.
2337
+ """
2338
+ try:
2339
+ doc = Document(doc_path)
2340
+ except Exception as e:
2341
+ self.logger.error(f"Error reading the document: {e}")
2342
+ return DocumentCheckResult(success=False, issues=[{'error': str(e)}], details={})
2343
+
2344
+ heading_structure = self._extract_paragraph_numbering(doc)
2345
+ valid_sections = {number for number, _ in heading_structure}
2346
+ tables = set()
2347
+ figures = set()
2348
+ issues = []
2349
+
2350
+ # Skip patterns for external references
2351
+ skip_patterns = [
2352
+ r'(?:U\.S\.C\.|USC)\s+(?:§+\s*)?(?:Section|section)?\s*\d+',
2353
+ r'Section\s+\d+(?:\([a-z]\))*\s+of\s+(?:the\s+)?(?:United States Code|U\.S\.C\.)',
2354
+ r'Section\s+\d+(?:\([a-z]\))*\s+of\s+Title\s+\d+',
2355
+ r'(?:Section|§)\s*\d+(?:\([a-z]\))*\s+of\s+the\s+Act',
2356
+ r'Section\s+\d+\([a-z]\)',
2357
+ r'§\s*\d+\([a-z]\)',
2358
+ r'\d+\s*(?:CFR|C\.F\.R\.)',
2359
+ r'Part\s+\d+(?:\.[0-9]+)*\s+of\s+Title\s+\d+',
2360
+ r'Public\s+Law\s+\d+[-–]\d+',
2361
+ r'Title\s+\d+,\s+Section\s+\d+(?:\([a-z]\))*',
2362
+ r'\d+\s+U\.S\.C\.\s+\d+(?:\([a-z]\))*',
2363
+ ]
2364
+ skip_regex = re.compile('|'.join(skip_patterns), re.IGNORECASE)
2365
+
2366
+ try:
2367
+ # Extract tables and figures
2368
+ for para in doc.paragraphs:
2369
+ text = para.text.strip() if hasattr(para, 'text') else ''
2370
+
2371
+ # Table extraction
2372
+ if text.lower().startswith('table'):
2373
+ matches = [
2374
+ re.match(r'^table\s+(\d{1,2}(?:-\d+)?)\b', text, re.IGNORECASE),
2375
+ re.match(r'^table\s+(\d{1,2}(?:\.\d+)?)\b', text, re.IGNORECASE)
2376
+ ]
2377
+ for match in matches:
2378
+ if match:
2379
+ tables.add(match.group(1))
2380
+
2381
+ # Figure extraction
2382
+ if text.lower().startswith('figure'):
2383
+ matches = [
2384
+ re.match(r'^figure\s+(\d{1,2}(?:-\d+)?)\b', text, re.IGNORECASE),
2385
+ re.match(r'^figure\s+(\d{1,2}(?:\.\d+)?)\b', text, re.IGNORECASE)
2386
+ ]
2387
+ for match in matches:
2388
+ if match:
2389
+ figures.add(match.group(1))
2390
+
2391
+ # Check references
2392
+ for para in doc.paragraphs:
2393
+ para_text = para.text.strip() if hasattr(para, 'text') else ''
2394
+ if not para_text or skip_regex.search(para_text):
2395
+ continue
2396
+
2397
+ # Table reference check
2398
+ table_refs = re.finditer(
2399
+ r'(?:see|in|refer to)?\s*(?:table|Table)\s+(\d{1,2}(?:[-\.]\d+)?)\b',
2400
+ para_text
2401
+ )
2402
+ for match in table_refs:
2403
+ ref = match.group(1)
2404
+ if ref not in tables:
2405
+ issues.append({
2406
+ 'type': 'Table',
2407
+ 'reference': ref,
2408
+ 'context': para_text,
2409
+ 'message': f"Referenced Table {ref} not found in document"
2410
+ })
2411
+
2412
+ # Figure reference check
2413
+ figure_refs = re.finditer(
2414
+ r'(?:see|in|refer to)?\s*(?:figure|Figure)\s+(\d{1,2}(?:[-\.]\d+)?)\b',
2415
+ para_text
2416
+ )
2417
+ for match in figure_refs:
2418
+ ref = match.group(1)
2419
+ if ref not in figures:
2420
+ issues.append({
2421
+ 'type': 'Figure',
2422
+ 'reference': ref,
2423
+ 'context': para_text,
2424
+ 'message': f"Referenced Figure {ref} not found in document"
2425
+ })
2426
+
2427
+ # Section/paragraph reference check
2428
+ section_refs = re.finditer(
2429
+ r'(?:paragraph|section|appendix)\s+([A-Z]?\.?\d+(?:\.\d+)*)',
2430
+ para_text,
2431
+ re.IGNORECASE
2432
+ )
2433
+
2434
+ for match in section_refs:
2435
+ ref = match.group(1).strip('.')
2436
+ if not skip_regex.search(para_text):
2437
+ if ref not in valid_sections:
2438
+ found = False
2439
+ for valid_section in valid_sections:
2440
+ if valid_section.strip('.') == ref.strip('.'):
2441
+ found = True
2442
+ break
2443
+
2444
+ if not found:
2445
+ issues.append({
2446
+ 'type': 'Paragraph',
2447
+ 'reference': ref,
2448
+ 'context': para_text,
2449
+ 'message': f"Confirm paragraph {ref} referenced in '{para_text}' exists in the document"
2450
+ })
2451
+
2452
+ except Exception as e:
2453
+ self.logger.error(f"Error processing cross references: {str(e)}")
2454
+ return DocumentCheckResult(
2455
+ success=False,
2456
+ issues=[{'type': 'error', 'message': f"Error processing cross references: {str(e)}"}],
2457
+ details={}
2458
+ )
2459
+
2460
+ return DocumentCheckResult(
2461
+ success=len(issues) == 0,
2462
+ issues=issues,
2463
+ details={
2464
+ 'total_tables': len(tables),
2465
+ 'total_figures': len(figures),
2466
+ 'found_tables': sorted(list(tables)),
2467
+ 'found_figures': sorted(list(figures)),
2468
+ 'heading_structure': heading_structure,
2469
+ 'valid_sections': sorted(list(valid_sections))
2470
+ }
2471
+ )
2472
+
2473
+ def _extract_paragraph_numbering(self, doc: Document, in_appendix: bool = False) -> List[Tuple[str, str]]:
2474
+ """
2475
+ Extract paragraph numbers from document headings.
2476
+ """
2477
+ numbered_paragraphs = []
2478
+
2479
+ try:
2480
+ # Track heading hierarchy (limit to 6 levels as per standard heading styles)
2481
+ current_numbers = {
2482
+ 1: 0, # Heading 1: 1, 2, 3, ...
2483
+ 2: 0, # Heading 2: 1.1, 1.2, 1.3, ...
2484
+ 3: 0, # Heading 3: 1.1.1, 1.1.2, ...
2485
+ 4: 0,
2486
+ 5: 0,
2487
+ 6: 0
2488
+ }
2489
+ current_parent = {
2490
+ 2: 0, # Parent number for level 2
2491
+ 3: 0, # Parent number for level 3
2492
+ 4: 0,
2493
+ 5: 0,
2494
+ 6: 0
2495
+ }
2496
+ last_level = {
2497
+ 1: 0, # Last number used at each level
2498
+ 2: 0,
2499
+ 3: 0,
2500
+ 4: 0,
2501
+ 5: 0,
2502
+ 6: 0
2503
+ }
2504
+
2505
+ for para in doc.paragraphs:
2506
+ style_name = para.style.name if hasattr(para, 'style') and hasattr(para.style, 'name') else ''
2507
+ text = para.text.strip() if hasattr(para, 'text') else ''
2508
+
2509
+ # Only process if it's a heading style
2510
+ if style_name.startswith('Heading'):
2511
+ try:
2512
+ heading_level = int(style_name.replace('Heading ', ''))
2513
+
2514
+ # Skip if heading level is beyond our supported range
2515
+ if heading_level > 6:
2516
+ continue
2517
+
2518
+ if heading_level == 1:
2519
+ # For Heading 1, simply increment
2520
+ current_numbers[1] += 1
2521
+ last_level[1] = current_numbers[1]
2522
+ # Reset all lower levels
2523
+ for level in range(2, 7): # Changed from 8 to 7
2524
+ current_numbers[level] = 0
2525
+ current_parent[level] = current_numbers[1]
2526
+ last_level[level] = 0
2527
+ else:
2528
+ # Check if we're still in the same parent section
2529
+ parent_changed = current_parent[heading_level] != current_numbers[heading_level - 1]
2530
+
2531
+ if parent_changed:
2532
+ # Parent section changed
2533
+ current_numbers[heading_level] = 1
2534
+ current_parent[heading_level] = current_numbers[heading_level - 1]
2535
+ else:
2536
+ # Same parent, increment this level
2537
+ current_numbers[heading_level] += 1
2538
+
2539
+ last_level[heading_level] = current_numbers[heading_level]
2540
+
2541
+ # Reset all lower levels
2542
+ for level in range(heading_level + 1, 7): # Changed from 8 to 7
2543
+ current_numbers[level] = 0
2544
+ current_parent[level] = 0
2545
+ last_level[level] = 0
2546
+
2547
+ # Build section number
2548
+ section_parts = []
2549
+ for level in range(1, heading_level + 1):
2550
+ if level == 1:
2551
+ section_parts.append(str(current_numbers[1]))
2552
+ else:
2553
+ if current_numbers[level] > 0:
2554
+ section_parts.append(str(current_numbers[level]))
2555
+
2556
+ section_number = '.'.join(section_parts)
2557
+
2558
+ if text:
2559
+ numbered_paragraphs.append((section_number, text))
2560
+
2561
+ except ValueError:
2562
+ continue
2563
+
2564
+ except Exception as e:
2565
+ self.logger.error(f"Error processing document structure: {str(e)}, Type: {type(e)}, Details: {repr(e)}")
2566
+ return []
2567
+
2568
+ return numbered_paragraphs
2569
+
2570
+ def _check_heading_sequence(self, current_level: int, previous_level: int) -> Optional[str]:
2571
+ """
2572
+ Check if heading sequence is valid.
2573
+ Returns error message if invalid, None if valid.
2574
+
2575
+ Rules:
2576
+ - Can go from any level to H1 or H2 (restart numbering)
2577
+ - When going deeper, can only go one level at a time (e.g., H1 to H2, H2 to H3)
2578
+ - Can freely go to any higher level (e.g., H3 to H1, H4 to H2)
2579
+ """
2580
+ # When going to a deeper level, only allow one level at a time
2581
+ if current_level > previous_level:
2582
+ if current_level != previous_level + 1:
2583
+ return f"Skipped heading level(s) {previous_level + 1} - Found H{current_level} after H{previous_level}. Add H{previous_level + 1} before this section."
2584
+
2585
+ # All other cases are valid:
2586
+ # - Going to H1 (restart numbering)
2587
+ # - Going to any higher level (e.g., H3 to H1)
2588
+ return None
2589
+
2590
+ def _check_heading_structure(self, doc: Document) -> List[Dict[str, str]]:
2591
+ """Check document heading structure."""
2592
+ issues = []
2593
+ previous_level = 0
2594
+ previous_heading = ""
2595
+
2596
+ for para in doc.paragraphs:
2597
+ if para.style.name.startswith('Heading'):
2598
+ try:
2599
+ current_level = int(para.style.name.replace('Heading ', ''))
2600
+
2601
+ # Check sequence
2602
+ error = self._check_heading_sequence(current_level, previous_level)
2603
+ if error:
2604
+ issues.append({
2605
+ 'category': '508_compliance_heading_structure',
2606
+ 'message': error,
2607
+ 'context': f"'{para.text}'",
2608
+ 'recommendation': f"Ensure heading levels follow a logical sequence."
2609
+ })
2610
+
2611
+ previous_level = current_level
2612
+ previous_heading = para.text
2613
+
2614
+ except ValueError:
2615
+ continue
2616
+
2617
+ return issues
2618
+
2619
+ @profile_performance
2620
+ def check_readability(self, doc: List[str]) -> DocumentCheckResult:
2621
+ """
2622
+ Check document readability using multiple metrics and plain language standards.
2623
+
2624
+ Args:
2625
+ doc (List[str]): List of document paragraphs
2626
+
2627
+ Returns:
2628
+ DocumentCheckResult: Results including readability scores and identified issues
2629
+ """
2630
+ if not self.validate_input(doc):
2631
+ return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
2632
+
2633
+ issues = []
2634
+ text_stats = {
2635
+ 'total_words': 0,
2636
+ 'total_syllables': 0,
2637
+ 'total_sentences': 0,
2638
+ 'complex_words': 0,
2639
+ 'passive_voice_count': 0
2640
+ }
2641
+
2642
+ # Patterns for identifying passive voice
2643
+ passive_patterns = [
2644
+ r'\b(?:am|is|are|was|were|be|been|being)\s+\w+ed\b',
2645
+ r'\b(?:am|is|are|was|were|be|been|being)\s+\w+en\b',
2646
+ r'\b(?:has|have|had)\s+been\s+\w+ed\b',
2647
+ r'\b(?:has|have|had)\s+been\s+\w+en\b'
2648
+ ]
2649
+ passive_regex = re.compile('|'.join(passive_patterns), re.IGNORECASE)
2650
+
2651
+ def count_syllables(word: str) -> int:
2652
+ """Count syllables in a word using basic rules."""
2653
+ word = word.lower()
2654
+ count = 0
2655
+ vowels = 'aeiouy'
2656
+ on_vowel = False
2657
+
2658
+ for char in word:
2659
+ is_vowel = char in vowels
2660
+ if is_vowel and not on_vowel:
2661
+ count += 1
2662
+ on_vowel = is_vowel
2663
+
2664
+ if word.endswith('e'):
2665
+ count -= 1
2666
+ if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
2667
+ count += 1
2668
+ if count == 0:
2669
+ count = 1
2670
+
2671
+ return count
2672
+
2673
+ # Process each paragraph
2674
+ for paragraph in doc:
2675
+ if not paragraph.strip():
2676
+ continue
2677
+
2678
+ # Split into sentences
2679
+ sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip())
2680
+ text_stats['total_sentences'] += len(sentences)
2681
+
2682
+ # Check each sentence
2683
+ for sentence in sentences:
2684
+ # Count passive voice instances
2685
+ if passive_regex.search(sentence):
2686
+ text_stats['passive_voice_count'] += 1
2687
+
2688
+ # Process words
2689
+ words = sentence.split()
2690
+ text_stats['total_words'] += len(words)
2691
+
2692
+ for word in words:
2693
+ word = re.sub(r'[^\w\s]', '', word.lower())
2694
+ if not word:
2695
+ continue
2696
+
2697
+ syllables = count_syllables(word)
2698
+ text_stats['total_syllables'] += syllables
2699
+
2700
+ if syllables >= 3:
2701
+ text_stats['complex_words'] += 1
2702
+
2703
+ # Calculate readability metrics
2704
+ try:
2705
+ # Flesch Reading Ease
2706
+ flesch_ease = 206.835 - 1.015 * (text_stats['total_words'] / text_stats['total_sentences']) - 84.6 * (text_stats['total_syllables'] / text_stats['total_words'])
2707
+
2708
+ # Flesch-Kincaid Grade Level
2709
+ flesch_grade = 0.39 * (text_stats['total_words'] / text_stats['total_sentences']) + 11.8 * (text_stats['total_syllables'] / text_stats['total_words']) - 15.59
2710
+
2711
+ # Gunning Fog Index
2712
+ fog_index = 0.4 * ((text_stats['total_words'] / text_stats['total_sentences']) + 100 * (text_stats['complex_words'] / text_stats['total_words']))
2713
+
2714
+ # Calculate passive voice percentage
2715
+ passive_percentage = (text_stats['passive_voice_count'] / text_stats['total_sentences']) * 100 if text_stats['total_sentences'] > 0 else 0
2716
+
2717
+ # Add readability summary with high-level guidance and specific issues
2718
+ issues = []
2719
+
2720
+ if flesch_ease < 50:
2721
+ issues.append({
2722
+ 'type': 'readability_score',
2723
+ 'metric': 'Flesch Reading Ease',
2724
+ 'score': round(flesch_ease, 1),
2725
+ 'message': 'Document may be too difficult for general audience. Consider simplifying language.'
2726
+ })
2727
+
2728
+ if flesch_grade > 12:
2729
+ issues.append({
2730
+ 'type': 'readability_score',
2731
+ 'metric': 'Flesch-Kincaid Grade Level',
2732
+ 'score': round(flesch_grade, 1),
2733
+ 'message': 'Reading level is above 12th grade. Consider simplifying for broader accessibility.'
2734
+ })
2735
+
2736
+ if fog_index > 12:
2737
+ issues.append({
2738
+ 'type': 'readability_score',
2739
+ 'metric': 'Gunning Fog Index',
2740
+ 'score': round(fog_index, 1),
2741
+ 'message': 'Text complexity may be too high. Consider using simpler words and shorter sentences.'
2742
+ })
2743
+
2744
+ if passive_percentage > 10:
2745
+ issues.append({
2746
+ 'type': 'passive_voice',
2747
+ 'percentage': round(passive_percentage, 1),
2748
+ 'message': f'Document uses {round(passive_percentage, 1)}% passive voice (target: less than 10%). Consider using more active voice.'
2749
+ })
2750
+
2751
+ details = {
2752
+ 'metrics': {
2753
+ 'flesch_reading_ease': round(flesch_ease, 1),
2754
+ 'flesch_kincaid_grade': round(flesch_grade, 1),
2755
+ 'gunning_fog_index': round(fog_index, 1),
2756
+ 'passive_voice_percentage': round(passive_percentage, 1)
2757
+ }
2758
+ }
2759
+
2760
+ return DocumentCheckResult(
2761
+ success=len(issues) == 0,
2762
+ issues=issues,
2763
+ details=details
2764
+ )
2765
+
2766
+ except Exception as e:
2767
+ self.logger.error(f"Error calculating readability metrics: {str(e)}")
2768
+ return DocumentCheckResult(
2769
+ success=False,
2770
+ issues=[{'error': f'Error calculating readability metrics: {str(e)}'}]
2771
+ )
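The three scores computed above are the standard Flesch Reading Ease, Flesch-Kincaid Grade Level, and Gunning Fog formulas. A quick standalone sketch with illustrative counts (100 words, 5 sentences, 150 syllables, 10 complex words) shows how they land against the thresholds this check uses:

# Worked example of the readability formulas with made-up counts.
total_words, total_sentences, total_syllables, complex_words = 100, 5, 150, 10

flesch_ease = 206.835 - 1.015 * (total_words / total_sentences) - 84.6 * (total_syllables / total_words)
flesch_grade = 0.39 * (total_words / total_sentences) + 11.8 * (total_syllables / total_words) - 15.59
fog_index = 0.4 * ((total_words / total_sentences) + 100 * (complex_words / total_words))

print(round(flesch_ease, 1))   # 59.6 -> above 50, so no Flesch Reading Ease issue is flagged
print(round(flesch_grade, 1))  # 9.9 -> under the 12th-grade threshold
print(round(fog_index, 1))     # 12.0 -> at the limit; only values above 12 are flagged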
2772
 
2773
  class DocumentCheckResultsFormatter:
2774
 
 
2911
  'before': 'See AC 25.1309-1B, System Design and Analysis, for information on X.',
2912
  'after': 'See AC 25.1309-1B, <i>System Design and Analysis</i>, for information on X.'
2913
  }
2914
+ },
2915
+ '508_compliance_check': {
2916
+ 'title': 'Section 508 Compliance Issues',
2917
+ 'description': 'Checks document accessibility features required by Section 508 standards: Image alt text for screen readers, heading structure issues (missing heading 1, skipped heading levels, and out of sequence headings), and hyperlink accessibility (ensuring links have meaningful descriptive text).',
2918
+ 'solution': 'Address each accessibility issue: add image alt text for screen readers, fix heading structure, and ensure hyperlinks have descriptive text that indicates their destination.',
2919
+ 'example_fix': {
2920
+ 'before': [
2921
+ 'Image without alt text',
2922
+ 'Heading sequence: H1 → H2 → H4 (skipped H3)',
2923
+ 'Link text: "click here" or "www.example.com"'
2924
+ ],
2925
+ 'after': [
2926
+ 'Image with descriptive alt text',
2927
+ 'Proper heading sequence: H1 → H2 → H3 → H4',
2928
+ 'Descriptive link text: "FAA Compliance Guidelines" or "Download the Safety Report"'
2929
+ ]
2930
+ }
2931
+ },
2932
+ 'hyperlink_check': {
2933
+ 'title': 'Hyperlink Issues',
2934
+ 'description': 'Checks for potentially broken or inaccessible URLs in the document. This includes checking response codes and connection issues.',
2935
+ 'solution': 'Verify each flagged URL is correct and accessible.',
2936
+ 'example_fix': {
2937
+ 'before': 'See https://broken-link.example.com for more details.',
2938
+ 'after': 'See https://www.faa.gov for more details.'
2939
+ }
2940
+ },
2941
+ 'cross_references_check': {
2942
+ 'title': 'Cross-Reference Issues',
2943
+ 'description': 'Checks for missing or invalid cross-references to paragraphs, tables, figures, and appendices within the document.',
2944
+ 'solution': 'Ensure that all referenced elements are present in the document and update or remove any incorrect references.',
2945
+ 'example_fix': {
2946
+ 'before': 'See table 5-2 for more information. (there is no table 5-2)',
2947
+ 'after': 'Either update the table reference or add table 5-2 if missing'
2948
+ }
2949
+ },
2950
+ 'readability_check': {
2951
+ 'title': 'Readability Issues',
2952
+ 'description': 'Analyzes document readability using multiple metrics including Flesch Reading Ease, Flesch-Kincaid Grade Level, and Gunning Fog Index. Also checks for passive voice usage and technical jargon.',
2953
+ 'solution': 'Simplify language, reduce passive voice, and replace technical jargon with plain language alternatives.',
2954
+ 'example_fix': {
2955
+ 'before': 'The implementation of the procedure was facilitated by technical personnel.',
2956
+ 'after': 'Technical staff helped start the procedure.'
2957
+ }
2958
  }
2959
  }
2960
 
 
 return formatted_issues

 def _format_reference_issues(self, result: DocumentCheckResult) -> List[str]:
+ """Format reference issues with clear, concise descriptions."""
+ formatted_issues = []
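+ # Each issue dict is expected to carry 'type', 'reference', and 'context'
+ # keys, e.g. {'type': 'table', 'reference': '5-2', 'context': 'See table 5-2 ...'}.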

+ for issue in result.issues:
+ ref_type = issue.get('type', '')
+ ref_num = issue.get('reference', '')
+ context = issue.get('context', '').strip()
+
+ if context: # Only include context if it exists
+ formatted_issues.append(
+ f" • Confirm {ref_type} {ref_num} referenced in '{context}' exists in the document"
+ )
+ else:
+ formatted_issues.append(
+ f" • Confirm {ref_type} {ref_num} exists in the document"
+ )

+ return formatted_issues

 def _format_standard_issue(self, issue: Dict[str, Any]) -> str:
 """Format standard issues consistently."""

 output.extend(self._format_section_symbol_issues(result))
 elif check_name == 'parentheses_check':
 output.extend(self._format_parentheses_issues(result))
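+ # The checks below use custom formatting because their issue dicts carry
+ # check-specific keys (e.g. 'category', 'status_code') rather than the
+ # standard shape handled by _format_standard_issue.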
+ elif check_name == '508_compliance_check':
+ if not result.success:
+ # Combine all 508 compliance issues into a single list
+ for issue in result.issues:
+ if issue.get('category') == '508_compliance_heading_structure':
+ output.append(f" • {issue['message']}")
+ if 'context' in issue:
+ output.append(f" Context: {issue['context']}")
+ if 'recommendation' in issue:
+ output.append(f" Recommendation: {issue['recommendation']}")
+ elif issue.get('category') == 'image_alt_text':
+ if 'context' in issue:
+ output.append(f" • {issue['context']}")
+ elif issue.get('category') == 'hyperlink_accessibility':
+ output.append(f" • {issue.get('user_message', issue.get('message', 'No description provided'))}")
+ elif check_name == 'hyperlink_check':
+ for issue in result.issues:
+ output.append(f" • {issue['message']}")
+ if 'status_code' in issue:
+ output.append(f" (HTTP Status: {issue['status_code']})")
+ elif 'error' in issue:
+ output.append(f" (Error: {issue['error']})")
+ elif check_name == 'cross_references_check':
+ for issue in result.issues:
+ output.append(f" • Confirm {issue['type']} {issue['reference']} referenced in '{issue['context']}' exists in the document")
+ elif check_name == 'readability_check':
+ output.extend(self._format_readability_issues(result))
 else:
 formatted_issues = [self._format_standard_issue(issue) for issue in result.issues[:15]]
 output.extend(formatted_issues)
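+ # Standard checks are capped at 15 displayed issues; the count of any
+ # remaining issues is reported below.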

+ if len(result.issues) > 15:
 output.append(f"\n ... and {len(result.issues) - 15} more similar issues.")

 return '\n'.join(output)

 except Exception as e:
 print(f"Error saving report: {e}")

+ def _format_readability_issues(self, result: DocumentCheckResult) -> List[str]:
+ """Format readability issues with clear, actionable feedback."""
+ formatted_issues = []
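+ # The score summary is shown whenever metrics are available, even if no
+ # issues were flagged, so authors can see where the document stands.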
 
 
+ if result.details and 'metrics' in result.details:
+ metrics = result.details['metrics']
+ formatted_issues.append("\n Readability Scores:")
+ formatted_issues.append(f" • Flesch Reading Ease: {metrics['flesch_reading_ease']} (Aim for 50+; higher is easier to read)")
+ formatted_issues.append(f" • Grade Level: {metrics['flesch_kincaid_grade']} (Aim for 10 or lower; 12 acceptable for technical/legal)")
+ formatted_issues.append(f" • Gunning Fog Index: {metrics['gunning_fog_index']} (Aim for 12 or lower)")
+ formatted_issues.append(f" • Passive Voice: {metrics['passive_voice_percentage']}% (Aim for less than 10%; use active voice for clarity)")

+ if result.issues:
+ formatted_issues.append("\n Identified Issues:")
+ for issue in result.issues:
+ if issue['type'] == 'jargon':
+ formatted_issues.append(
+ f" • Replace '{issue['word']}' with '{issue['suggestion']}' in: \"{issue['sentence']}\""
+ )
+ elif issue['type'] in ['readability_score', 'passive_voice']:
+ formatted_issues.append(f" • {issue['message']}")
+
+ return formatted_issues


 def format_markdown_results(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
 """Format check results into a Markdown string for Gradio display."""

 'acronym_check': {'title': '📝 Acronym Definitions', 'priority': 1},
 'acronym_usage_check': {'title': '📎 Acronym Usage', 'priority': 1},
 'section_symbol_usage_check': {'title': '§ Section Symbol Usage', 'priority': 2},
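+ # Display titles and priority tiers for the checks introduced in this update.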
+ '508_compliance_check': {'title': '🕵️‍♂️ 508 Compliance', 'priority': 2},
+ 'cross_references_check': {'title': '🔗 Cross References', 'priority': 2},
+ 'hyperlink_check': {'title': '🔗 Hyperlinks', 'priority': 2},
 'date_formats_check': {'title': '📅 Date Formats', 'priority': 2},
 'placeholders_check': {'title': '🚩 Placeholder Content', 'priority': 2},
 'document_title_check': {'title': '📑 Document Title Format', 'priority': 2},

 'double_period_check': {'title': '⚡ Double Periods', 'priority': 4},
 'spacing_check': {'title': '⌨️ Spacing Issues', 'priority': 4},
 'paragraph_length_check': {'title': '📏 Paragraph Length', 'priority': 5},
+ 'sentence_length_check': {'title': '📏 Sentence Length', 'priority': 5},
+
 }

 sorted_checks = sorted(

 title = parts[0].strip()
 content = parts[1].strip()
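+ # The readability check's plain-text output embeds a "Readability Scores:"
+ # block followed by "Identified Issues:", so it is parsed back out here and
+ # rendered as metric cards instead of a flat bullet list.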

+ # Special handling for readability metrics
+ if "Readability Issues" in title:
+ metrics_match = re.search(r'Readability Scores:(.*?)(?=Identified Issues:|$)', content, re.DOTALL)
+ issues_match = re.search(r'Identified Issues:(.*?)(?=\Z)', content, re.DOTALL)
+
+ metrics_html = ""
+ if metrics_match:
+ metrics = metrics_match.group(1).strip().split('\n')
+ metrics_html = """
+ <div class="bg-blue-50 rounded-lg p-4 mb-4">
+ <h3 class="font-medium text-blue-800 mb-2">📊 Readability Metrics</h3>
+ <div class="grid grid-cols-1 md:grid-cols-2 gap-4">
+ """
+ for metric in metrics:
+ if metric.strip():
+ label, value = metric.strip('• ').split(':', 1)
+ metrics_html += f"""
+ <div class="flex flex-col">
+ <span class="text-sm text-blue-600 font-medium">{label}</span>
+ <span class="text-lg text-blue-900">{value}</span>
+ </div>
+ """
+ metrics_html += "</div></div>"
+
+ issues_html_section = ""
+ if issues_match:
+ issues_list = issues_match.group(1).strip().split('\n')
+ if issues_list:
+ issues_html_section = """
+ <div class="mt-4">
+ <h3 class="font-medium text-gray-800 mb-2">📝 Identified Issues:</h3>
+ <ul class="list-none space-y-2">
+ """
+ for issue in issues_list:
+ if issue.strip():
+ issues_html_section += f"""
+ <li class="text-gray-600 ml-4">• {issue.strip('• ')}</li>
+ """
+ issues_html_section += "</ul></div>"
+
+ # Combine the readability section
+ issues_html += f"""
+ <div class="bg-white rounded-lg shadow-sm mb-6 overflow-hidden">
+ <div class="bg-gray-50 px-6 py-4 border-b">
+ <h2 class="text-lg font-semibold text-gray-800">{title}</h2>
+ </div>
+ <div class="px-6 py-4">
+ {metrics_html}
+ {issues_html_section}
+ </div>
+ </div>
+ """
+ continue
+
 # Extract description and solution
 description_parts = content.split('How to fix:', 1)
 description = description_parts[0].strip()

  <h3 class="font-medium text-gray-800 mb-2">Issues found in your document:</h3>
3644
  <ul class="list-none space-y-2">
3645
  """
3646
+ for issue in issues_match[:7]:
 
3647
  clean_issue = issue.strip().lstrip('β€’').strip()
3648
  issues_html_section += f"""
3649
  <li class="text-gray-600 ml-4">β€’ {clean_issue}</li>
3650
  """
3651
+ if len(issues_match) > 7:
3652
  issues_html_section += f"""
3653
  <li class="text-gray-500 italic ml-4">... and {len(issues_match) - 7} more similar issues.</li>
3654
  """
 
3679
 </div>
 """

+ # Add new CSS classes for readability metrics
+ additional_styles = """
+ .bg-blue-50 { background-color: #eff6ff; }
+ .text-blue-600 { color: #2563eb; }
+ .text-blue-800 { color: #1e40af; }
+ .text-blue-900 { color: #1e3a8a; }
+ .grid { display: grid; }
+ .grid-cols-1 { grid-template-columns: repeat(1, minmax(0, 1fr)); }
+ .gap-4 { gap: 1rem; }
+ @media (min-width: 768px) {
+ .md\\:grid-cols-2 { grid-template-columns: repeat(2, minmax(0, 1fr)); }
+ }
+ """
+
+ # Add summary section before the final return
 summary_html = f"""
 <div class="bg-white rounded-lg shadow-sm mb-6 overflow-hidden">
 <div class="bg-gray-50 px-6 py-4 border-b">

 </div>
 """

+ # Update the final HTML to include the summary section
 full_html = f"""
 <div class="mx-auto p-4" style="font-family: system-ui, -apple-system, sans-serif;">
 <style>
 .text-2xl {{ font-size: 1.5rem; line-height: 2rem; }}
 .text-lg {{ font-size: 1.125rem; }}
+ .text-sm {{ font-size: 0.875rem; }}
 .font-bold {{ font-weight: 700; }}
 .font-semibold {{ font-weight: 600; }}
 .font-medium {{ font-weight: 500; }}

 .overflow-hidden {{ overflow: hidden; }}
 .list-none {{ list-style-type: none; }}
 .space-y-4 > * + * {{ margin-top: 1rem; }}
+ {additional_styles}
 </style>
 {header_html}
 {issues_html}