Hoctar77 committed on
Commit
436beda
·
verified ·
1 Parent(s): c6ba992

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -88
app.py CHANGED
@@ -5,41 +5,26 @@ from docx import Document
5
  import io
6
  import traceback
7
 
8
- def heading_title_check(doc, required_headings):
9
- """
10
- Check if required headings are present in the document.
11
-
12
- Args:
13
- doc (list): List of paragraph texts from the document
14
- required_headings (list): List of required heading titles
15
-
16
- Returns:
17
- tuple: (bool, list) - (True if all headings present, list of found headings)
18
- """
19
  headings_found = []
20
-
21
- # Create a set of required headings for efficient lookup
22
  required_headings_set = set(required_headings)
23
 
24
- for para in doc:
25
  para_strip = para.strip()
26
- # Check if the paragraph is in the required headings list
27
  if para_strip in required_headings_set:
28
  headings_found.append(para_strip)
29
 
30
- # Check if all required headings are found
31
  all_headings_present = set(headings_found) == required_headings_set
32
-
33
  return all_headings_present, headings_found
34
 
35
- def acronym_check(doc):
36
  """Check if all acronyms are defined at first use and return undefined acronyms."""
37
  defined_acronyms = set() # Set to store defined acronyms
38
  undefined_acronyms = set() # Set to store undefined acronyms
39
  acronym_pattern = re.compile(r'(\b[A-Z]{2,}\b)') # Regex to find acronyms (2 or more uppercase letters)
40
  defined_pattern = re.compile(r'(\b\w+\b) \((\b[A-Z]{2,}\b)\)') # Regex to find definitions like "Federal Aviation Administration (FAA)"
41
 
42
- for paragraph in doc:
43
  # Check for defined acronyms
44
  defined_matches = defined_pattern.findall(paragraph)
45
  for full_term, acronym in defined_matches:
@@ -51,9 +36,10 @@ def acronym_check(doc):
51
  if acronym not in defined_acronyms:
52
  undefined_acronyms.add(acronym) # Add to undefined acronyms if not defined
53
 
54
- return len(undefined_acronyms) == 0, undefined_acronyms # Return True if all acronyms are defined, along with undefined acronyms
55
 
56
- def legal_check(doc):
 
57
  """Check for correct legal references in the document and suggest corrections.
58
 
59
  Args:
@@ -77,7 +63,7 @@ def legal_check(doc):
77
  # List to store tuples of incorrect terms and their correct versions
78
  incorrect_legal_references = []
79
 
80
- for paragraph in doc:
81
  # Special handling for "Title 14" / "title 14"
82
  title_14_pattern = r"(?P<prefix>^|[.!?\s])\s*(?P<title>title 14|Title 14)\b"
83
  matches = re.finditer(title_14_pattern, paragraph)
@@ -102,7 +88,7 @@ def legal_check(doc):
102
 
103
  return len(incorrect_legal_references) == 0, incorrect_legal_references
104
 
105
- def table_caption_check(doc, doc_type):
106
  """
107
  Check for correctly formatted table captions in the document.
108
  Supports both numeric (Table 1-2) and alphanumeric (Table C-1) formats.
@@ -116,7 +102,7 @@ def table_caption_check(doc, doc_type):
116
 
117
  incorrect_captions = []
118
 
119
- for paragraph in doc:
120
  paragraph_strip = paragraph.strip()
121
  if paragraph_strip.lower().startswith("table"):
122
  if not table_caption_pattern.match(paragraph_strip):
@@ -124,7 +110,7 @@ def table_caption_check(doc, doc_type):
124
 
125
  return len(incorrect_captions) == 0, incorrect_captions
126
 
127
- def figure_caption_check(doc, doc_type):
128
  """
129
  Check for correctly formatted figure captions in the document.
130
  Supports both numeric (Figure 1-2) and alphanumeric (Figure C-1) formats.
@@ -137,7 +123,7 @@ def figure_caption_check(doc, doc_type):
137
  figure_caption_pattern = re.compile(r'^Figure\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE)
138
 
139
  incorrect_fig_captions = []
140
- for paragraph in doc:
141
  paragraph_strip = paragraph.strip()
142
  if paragraph_strip.lower().startswith("figure"):
143
  if not figure_caption_pattern.match(paragraph_strip):
@@ -145,7 +131,7 @@ def figure_caption_check(doc, doc_type):
145
 
146
  return len(incorrect_fig_captions) == 0, incorrect_fig_captions
147
 
148
- def table_figure_reference_check(doc, doc_type):
149
  """Check for incorrect references to tables and figures in the document."""
150
  incorrect_table_figure_references = []
151
 
@@ -158,7 +144,7 @@ def table_figure_reference_check(doc, doc_type):
158
  incorrect_table_ref_pattern = re.compile(r'\bTable\s+\d+(-\d+)?\b', re.IGNORECASE)
159
  incorrect_figure_ref_pattern = re.compile(r'\bFigure\s+\d+(-\d+)?\b', re.IGNORECASE)
160
 
161
- for paragraph in doc:
162
  paragraph_strip = paragraph.strip()
163
  # Exclude captions
164
  starts_with_table_or_figure = paragraph_strip.lower().startswith('table') or paragraph_strip.lower().startswith('figure')
@@ -366,11 +352,11 @@ def get_document_checks(doc_type, template_type):
366
  logger.info(f"Retrieved checks: {checks}")
367
  return checks
368
 
369
- def double_period_check(doc):
370
  """Check for sentences that end with two periods."""
371
  incorrect_sentences = []
372
 
373
- for paragraph in doc:
374
  # Split the paragraph into sentences based on common sentence-ending punctuation
375
  sentences = re.split(r'(?<=[.!?]) +', paragraph)
376
  for sentence in sentences:
@@ -379,7 +365,7 @@ def double_period_check(doc):
379
 
380
  return len(incorrect_sentences) == 0, incorrect_sentences # Return True if no double periods are found, along with any incorrect sentences
381
 
382
- def spacing_check(doc):
383
  """
384
  Check for correct spacing in US federal regulatory documents.
385
  Checks for:
@@ -421,7 +407,7 @@ def spacing_check(doc):
421
 
422
  return len(incorrect_spacing) == 0, incorrect_spacing
423
 
424
- def check_prohibited_phrases(doc):
425
  """Check for prohibited words or phrases."""
426
  prohibited_phrases = [
427
  r'\babove\b',
@@ -430,17 +416,17 @@ def check_prohibited_phrases(doc):
430
  r'\bthere are\b'
431
  ]
432
  issues = []
433
- for paragraph in doc:
434
  for phrase in prohibited_phrases:
435
  if re.search(phrase, paragraph, re.IGNORECASE):
436
  issues.append((phrase.strip(r'\b'), paragraph.strip()))
437
  return issues
438
 
439
- def check_abbreviation_usage(doc):
440
  """Check for abbreviation consistency after first definition."""
441
  abbreviations = {}
442
  issues = []
443
- for paragraph in doc:
444
  # Find definitions like "Federal Aviation Administration (FAA)"
445
  defined_matches = re.findall(r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)', paragraph)
446
  for full_term, acronym in defined_matches:
@@ -460,12 +446,12 @@ def check_abbreviation_usage(doc):
460
 
461
  return issues
462
 
463
- def check_date_formats(doc):
464
  """Check for inconsistent date formats."""
465
  date_issues = []
466
  correct_date_pattern = re.compile(r'\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b')
467
  date_pattern = re.compile(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b') # MM/DD/YYYY
468
- for paragraph in doc:
469
  if date_pattern.search(paragraph):
470
  dates = date_pattern.findall(paragraph)
471
  for date in dates:
@@ -473,7 +459,7 @@ def check_date_formats(doc):
473
  date_issues.append((date, paragraph.strip()))
474
  return date_issues
475
 
476
- def check_placeholders(doc):
477
  """Check for placeholders that should be removed."""
478
  placeholder_phrases = [
479
  r'\bTBD\b',
@@ -481,48 +467,12 @@ def check_placeholders(doc):
481
  r'\bTo be added\b'
482
  ]
483
  issues = []
484
- for paragraph in doc:
485
  for phrase in placeholder_phrases:
486
  if re.search(phrase, paragraph, re.IGNORECASE):
487
  issues.append((phrase.strip(r'\b'), paragraph.strip()))
488
  return issues
489
 
490
- def process_file(file_obj, doc_type, template_type):
491
- """
492
- Process the uploaded file and return results with error handling
493
- """
494
- if file_obj is None:
495
- return "Please upload a document first."
496
-
497
- try:
498
- # Convert bytes to BytesIO object that Document can read
499
- if isinstance(file_obj, bytes):
500
- doc_bytes = io.BytesIO(file_obj)
501
- else:
502
- doc_bytes = io.BytesIO(file_obj.read())
503
-
504
- # Process the document and get results
505
- results = process_document(doc_bytes, doc_type, template_type)
506
- return results
507
-
508
- except Exception as e:
509
- error_trace = traceback.format_exc()
510
- print(f"Error processing file: {str(e)}")
511
- print(f"Full traceback: {error_trace}")
512
-
513
- error_message = f"""An error occurred while processing the document:
514
-
515
- Error: {str(e)}
516
-
517
- Please ensure:
518
- 1. The file is a valid Word document (.docx)
519
- 2. The file is not corrupted
520
- 3. The file is not password protected
521
-
522
- Technical details: {str(e)}"""
523
-
524
- return error_message
525
-
526
  def process_document(file_obj, doc_type, template_type):
527
  """Process the document and perform checks."""
528
  try:
@@ -530,22 +480,25 @@ def process_document(file_obj, doc_type, template_type):
530
  doc = Document(file_obj)
531
  print("Document read successfully.")
532
 
 
 
 
533
  # Get required headings based on document type
534
  required_headings = get_document_checks(doc_type, template_type).get("required_headings", [])
535
 
536
  # Perform checks
537
- heading_valid, headings_found = heading_title_check(doc, required_headings)
538
- acronyms_valid, undefined_acronyms = acronym_check(doc)
539
- legal_valid, incorrect_legal_references = legal_check(doc) # Replace placeholder
540
- table_valid, incorrect_captions = table_caption_check(doc, doc_type) # Replace placeholder
541
- figure_valid, incorrect_fig_captions = figure_caption_check(doc, doc_type) # Replace placeholder
542
- references_valid, incorrect_table_figure_references = table_figure_reference_check(doc, doc_type) # Replace placeholder
543
- title_style_valid, incorrect_titles = document_title_check(doc, doc_type) # Replace placeholder
544
- double_period_valid, incorrect_sentences = double_period_check(doc) # Replace placeholder
545
- spacing_valid, incorrect_spacing = spacing_check(doc) # Replace placeholder
546
- abbreviation_issues = check_abbreviation_usage(doc) # Replace placeholder
547
- date_issues = check_date_formats(doc) # Replace placeholder
548
- placeholder_issues = check_placeholders(doc) # Replace placeholder
549
 
550
  # Format results
551
  results = format_results_for_gradio(
@@ -761,8 +714,11 @@ demo = gr.Blocks(theme='JohnSmith9982/small_and_pretty')
761
  with demo:
762
  gr.Markdown("# Document Checker Tool")
763
  gr.Markdown("Upload a Word (docx) document to check for compliance with U.S. federal documentation standards.")
764
- gr.Markdown("### This tool is still in development")
765
  gr.Markdown("Contact Eric Putnam if you have questions and comments.")
 
 
 
766
 
767
  document_types = [
768
  "Advisory Circular", "Airworthiness Criteria", "Deviation Memo", "Exemption",
 
5
  import io
6
  import traceback
7
 
8
+ def heading_title_check(paragraphs, required_headings):
 
 
 
 
 
 
 
 
 
 
9
  headings_found = []
 
 
10
  required_headings_set = set(required_headings)
11
 
12
+ for para in paragraphs:
13
  para_strip = para.strip()
 
14
  if para_strip in required_headings_set:
15
  headings_found.append(para_strip)
16
 
 
17
  all_headings_present = set(headings_found) == required_headings_set
 
18
  return all_headings_present, headings_found
19
 
20
+ def acronym_check(paragraphs):
21
  """Check if all acronyms are defined at first use and return undefined acronyms."""
22
  defined_acronyms = set() # Set to store defined acronyms
23
  undefined_acronyms = set() # Set to store undefined acronyms
24
  acronym_pattern = re.compile(r'(\b[A-Z]{2,}\b)') # Regex to find acronyms (2 or more uppercase letters)
25
  defined_pattern = re.compile(r'(\b\w+\b) \((\b[A-Z]{2,}\b)\)') # Regex to find definitions like "Federal Aviation Administration (FAA)"
26
 
27
+ for paragraph in paragraphs: # Use paragraphs here
28
  # Check for defined acronyms
29
  defined_matches = defined_pattern.findall(paragraph)
30
  for full_term, acronym in defined_matches:
 
36
  if acronym not in defined_acronyms:
37
  undefined_acronyms.add(acronym) # Add to undefined acronyms if not defined
38
 
39
+ return len(undefined_acronyms) == 0, undefined_acronyms
40
 
41
+
42
+ def legal_check(paragraphs):
43
  """Check for correct legal references in the document and suggest corrections.
44
 
45
  Args:
 
63
  # List to store tuples of incorrect terms and their correct versions
64
  incorrect_legal_references = []
65
 
66
+ for paragraph in paragraphs:
67
  # Special handling for "Title 14" / "title 14"
68
  title_14_pattern = r"(?P<prefix>^|[.!?\s])\s*(?P<title>title 14|Title 14)\b"
69
  matches = re.finditer(title_14_pattern, paragraph)
 
88
 
89
  return len(incorrect_legal_references) == 0, incorrect_legal_references
90
 
91
+ def table_caption_check(paragraphs, doc_type):
92
  """
93
  Check for correctly formatted table captions in the document.
94
  Supports both numeric (Table 1-2) and alphanumeric (Table C-1) formats.
 
102
 
103
  incorrect_captions = []
104
 
105
+ for paragraph in paragraphs:
106
  paragraph_strip = paragraph.strip()
107
  if paragraph_strip.lower().startswith("table"):
108
  if not table_caption_pattern.match(paragraph_strip):
 
110
 
111
  return len(incorrect_captions) == 0, incorrect_captions
112
 
113
+ def figure_caption_check(paragraphs, doc_type):
114
  """
115
  Check for correctly formatted figure captions in the document.
116
  Supports both numeric (Figure 1-2) and alphanumeric (Figure C-1) formats.
 
123
  figure_caption_pattern = re.compile(r'^Figure\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE)
124
 
125
  incorrect_fig_captions = []
126
+ for paragraph in paragraphs:
127
  paragraph_strip = paragraph.strip()
128
  if paragraph_strip.lower().startswith("figure"):
129
  if not figure_caption_pattern.match(paragraph_strip):
 
131
 
132
  return len(incorrect_fig_captions) == 0, incorrect_fig_captions
133
 
134
+ def table_figure_reference_check(paragraphs, doc_type):
135
  """Check for incorrect references to tables and figures in the document."""
136
  incorrect_table_figure_references = []
137
 
 
144
  incorrect_table_ref_pattern = re.compile(r'\bTable\s+\d+(-\d+)?\b', re.IGNORECASE)
145
  incorrect_figure_ref_pattern = re.compile(r'\bFigure\s+\d+(-\d+)?\b', re.IGNORECASE)
146
 
147
+ for paragraph in paragraphs:
148
  paragraph_strip = paragraph.strip()
149
  # Exclude captions
150
  starts_with_table_or_figure = paragraph_strip.lower().startswith('table') or paragraph_strip.lower().startswith('figure')
 
352
  logger.info(f"Retrieved checks: {checks}")
353
  return checks
354
 
355
+ def double_period_check(paragraphs):
356
  """Check for sentences that end with two periods."""
357
  incorrect_sentences = []
358
 
359
+ for paragraph in paragraphs:
360
  # Split the paragraph into sentences based on common sentence-ending punctuation
361
  sentences = re.split(r'(?<=[.!?]) +', paragraph)
362
  for sentence in sentences:
 
365
 
366
  return len(incorrect_sentences) == 0, incorrect_sentences # Return True if no double periods are found, along with any incorrect sentences
367
 
368
+ def spacing_check(paragraphs):
369
  """
370
  Check for correct spacing in US federal regulatory documents.
371
  Checks for:
 
407
 
408
  return len(incorrect_spacing) == 0, incorrect_spacing
409
 
410
+ def check_prohibited_phrases(paragraphs):
411
  """Check for prohibited words or phrases."""
412
  prohibited_phrases = [
413
  r'\babove\b',
 
416
  r'\bthere are\b'
417
  ]
418
  issues = []
419
+ for paragraph in paragraphs:
420
  for phrase in prohibited_phrases:
421
  if re.search(phrase, paragraph, re.IGNORECASE):
422
  issues.append((phrase.strip(r'\b'), paragraph.strip()))
423
  return issues
424
 
425
+ def check_abbreviation_usage(paragraphs):
426
  """Check for abbreviation consistency after first definition."""
427
  abbreviations = {}
428
  issues = []
429
+ for paragraph in paragraphs:
430
  # Find definitions like "Federal Aviation Administration (FAA)"
431
  defined_matches = re.findall(r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)', paragraph)
432
  for full_term, acronym in defined_matches:
 
446
 
447
  return issues
448
 
449
+ def check_date_formats(paragraphs):
450
  """Check for inconsistent date formats."""
451
  date_issues = []
452
  correct_date_pattern = re.compile(r'\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b')
453
  date_pattern = re.compile(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b') # MM/DD/YYYY
454
+ for paragraph in paragraphs:
455
  if date_pattern.search(paragraph):
456
  dates = date_pattern.findall(paragraph)
457
  for date in dates:
 
459
  date_issues.append((date, paragraph.strip()))
460
  return date_issues
461
 
462
+ def check_placeholders(paragraphs):
463
  """Check for placeholders that should be removed."""
464
  placeholder_phrases = [
465
  r'\bTBD\b',
 
467
  r'\bTo be added\b'
468
  ]
469
  issues = []
470
+ for paragraph in paragraphs:
471
  for phrase in placeholder_phrases:
472
  if re.search(phrase, paragraph, re.IGNORECASE):
473
  issues.append((phrase.strip(r'\b'), paragraph.strip()))
474
  return issues
475
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
476
  def process_document(file_obj, doc_type, template_type):
477
  """Process the document and perform checks."""
478
  try:
 
480
  doc = Document(file_obj)
481
  print("Document read successfully.")
482
 
483
+ # Extract text from each paragraph to make it iterable
484
+ paragraphs = [para.text for para in doc.paragraphs]
485
+
486
  # Get required headings based on document type
487
  required_headings = get_document_checks(doc_type, template_type).get("required_headings", [])
488
 
489
  # Perform checks
490
+ heading_valid, headings_found = heading_title_check(paragraphs, required_headings)
491
+ acronyms_valid, undefined_acronyms = acronym_check(paragraphs)
492
+ legal_valid, incorrect_legal_references = legal_check(paragraphs)
493
+ table_valid, incorrect_captions = table_caption_check(paragraphs, doc_type)
494
+ figure_valid, incorrect_fig_captions = figure_caption_check(paragraphs, doc_type)
495
+ references_valid, incorrect_table_figure_references = table_figure_reference_check(paragraphs, doc_type)
496
+ title_style_valid, incorrect_titles = document_title_check(file_obj, doc_type)
497
+ double_period_valid, incorrect_sentences = double_period_check(paragraphs)
498
+ spacing_valid, incorrect_spacing = spacing_check(paragraphs)
499
+ abbreviation_issues = check_abbreviation_usage(paragraphs)
500
+ date_issues = check_date_formats(paragraphs)
501
+ placeholder_issues = check_placeholders(paragraphs)
502
 
503
  # Format results
504
  results = format_results_for_gradio(
 
714
  with demo:
715
  gr.Markdown("# Document Checker Tool")
716
  gr.Markdown("Upload a Word (docx) document to check for compliance with U.S. federal documentation standards.")
717
+ gr.Markdown("*This tool is still in development and you might get false positives in your results*")
718
  gr.Markdown("Contact Eric Putnam if you have questions and comments.")
719
+ gr.Markdown("""
720
+ 1. Upload a clean (no track changes or comments) Word file.
721
+ 2. Choose **Check Document**.""")
722
 
723
  document_types = [
724
  "Advisory Circular", "Airworthiness Criteria", "Deviation Memo", "Exemption",