Hoctar77 committed on
Commit
5e842ab
·
verified ·
1 Parent(s): 92e51bf

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +736 -0
app.py ADDED
@@ -0,0 +1,736 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import logging
3
+ import re
4
+ from docx import Document
5
+ import tempfile
6
+ import os
7
+
8
def setup_logging():
    """Configure root logging at INFO level with a single stream handler."""
    stream_handler = logging.StreamHandler()
    logging.basicConfig(
        format='%(asctime)s %(levelname)s %(name)s - %(message)s',
        handlers=[stream_handler],
        level=logging.INFO,
    )
15
+
16
def read_word_document(doc_path):
    """Open the Word file at ``doc_path`` and return the text of every non-blank paragraph."""
    paragraph_texts = []
    for para in Document(doc_path).paragraphs:
        # Paragraphs that are empty or whitespace-only are skipped.
        if para.text.strip():
            paragraph_texts.append(para.text)
    return paragraph_texts
20
+
21
def heading_title_check(doc, required_headings):
    """
    Report which required headings occur as whole paragraphs in the document.

    Args:
        doc (list): Paragraph texts from the document.
        required_headings (list): Heading titles that must be present.

    Returns:
        tuple: (bool, list) - True when every required heading was seen, plus
        the matching headings in document order (repeats included).
    """
    wanted = set(required_headings)

    # A paragraph counts as a heading only when its stripped text equals a
    # required heading exactly (case and punctuation included).
    stripped_paragraphs = (para.strip() for para in doc)
    headings_found = [text for text in stripped_paragraphs if text in wanted]

    return set(headings_found) == wanted, headings_found
47
+
48
def acronym_check(doc):
    """Check that every acronym is defined no later than its first use.

    An acronym is any run of two or more uppercase letters. It counts as
    defined when it appears in parentheses directly after a word, e.g.
    "Federal Aviation Administration (FAA)".

    Args:
        doc (list): Paragraph texts from the document, in reading order.

    Returns:
        tuple: (bool, set) - True when no undefined acronym was found, plus
        the set of acronyms that were used before being defined.
    """
    defined_acronyms = set()
    undefined_acronyms = set()
    acronym_pattern = re.compile(r'\b[A-Z]{2,}\b')
    defined_pattern = re.compile(r'\b\w+\b \((\b[A-Z]{2,}\b)\)')

    for paragraph in doc:
        # Offsets at which an acronym occurrence is itself a definition.
        definition_starts = {
            match.start(1) for match in defined_pattern.finditer(paragraph)
        }

        # Walk occurrences left to right so that a use appearing before the
        # definition in the same paragraph is still reported.
        for match in acronym_pattern.finditer(paragraph):
            acronym = match.group()
            if match.start() in definition_starts:
                defined_acronyms.add(acronym)
            elif acronym not in defined_acronyms:
                undefined_acronyms.add(acronym)

    return not undefined_acronyms, undefined_acronyms
68
+
69
def legal_check(doc):
    """Check for incorrect legal terminology and suggest corrections.

    Args:
        doc (list): List of paragraphs/strings to check.

    Returns:
        tuple: (bool, list) - True if no errors were found, plus a list of
        (incorrect, correct) term pairs, one per occurrence.
    """
    # (pattern, replacement) pairs, applied in this order to each paragraph.
    incorrect_variations = (
        (re.compile(r"\bUSC\b"), "U.S.C."),
        (re.compile(r"\bCFR Part\b"), "CFR part"),
        # No trailing \b: after the final "." a word boundary would require a
        # word character to follow, so "C.F.R. part" would never match.
        (re.compile(r"\bC\.F\.R\."), "CFR"),
        (re.compile(r"\bWe\b"), "The FAA"),
        (re.compile(r"\bwe\b"), "the FAA"),
        (re.compile(r"\bcancelled\b"), "canceled"),
        (re.compile(r"\bshall\b"), "must or will"),
        # "&" is not a word character, so \b&\b only matched forms like
        # "R&D" and missed the usual " & ".
        (re.compile(r"&"), "and"),
    )

    # "Title 14" is capitalized at the start of a paragraph or sentence and
    # lowercase elsewhere; the prefix group records which case applies.
    title_14_pattern = re.compile(
        r"(?P<prefix>^|[.!?\s])\s*(?P<title>title 14|Title 14)\b"
    )

    incorrect_legal_references = []

    for paragraph in doc:
        for match in title_14_pattern.finditer(paragraph):
            prefix = match.group('prefix')
            current_title = match.group('title')

            if prefix in ('.', '!', '?', ''):
                # Start of paragraph or directly after sentence punctuation.
                if current_title != "Title 14":
                    incorrect_legal_references.append((current_title, "Title 14"))
            elif prefix.isspace() and current_title != "title 14":
                # Mid-sentence occurrence.
                incorrect_legal_references.append((current_title, "title 14"))

        for pattern, correct_term in incorrect_variations:
            for match in pattern.finditer(paragraph):
                incorrect_legal_references.append((match.group(), correct_term))

    return not incorrect_legal_references, incorrect_legal_references
117
+
118
def table_caption_check(doc, doc_type):
    """
    Find paragraphs that start with "table" but are not well-formed captions.

    Advisory Circulars and Orders use hyphenated numbering ("Table 1-2",
    "Table C-1"); every other document type uses a single identifier
    ("Table 3"). The identifier must be followed by a period or whitespace.

    Returns:
        tuple: (bool, list) - True when no malformed caption was found, plus
        the stripped text of each malformed caption.
    """
    uses_hyphenated_numbers = doc_type in ["Advisory Circular", "Order"]
    if uses_hyphenated_numbers:
        caption_regex = r'^Table\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]'
    else:
        caption_regex = r'^Table\s+([A-Z0-9]+)[\.\s]'
    table_caption_pattern = re.compile(caption_regex, re.IGNORECASE)

    candidates = (paragraph.strip() for paragraph in doc)
    incorrect_captions = [
        text
        for text in candidates
        if text.lower().startswith("table") and not table_caption_pattern.match(text)
    ]

    return not incorrect_captions, incorrect_captions
139
+
140
def figure_caption_check(doc, doc_type):
    """
    Find paragraphs that start with "figure" but are not well-formed captions.

    Advisory Circulars and Orders use hyphenated numbering ("Figure 1-2",
    "Figure C-1"); every other document type uses a single identifier
    ("Figure 3"). The identifier must be followed by a period or whitespace.

    Returns:
        tuple: (bool, list) - True when no malformed caption was found, plus
        the stripped text of each malformed caption.
    """
    uses_hyphenated_numbers = doc_type in ["Advisory Circular", "Order"]
    if uses_hyphenated_numbers:
        caption_regex = r'^Figure\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]'
    else:
        caption_regex = r'^Figure\s+([A-Z0-9]+)[\.\s]'
    figure_caption_pattern = re.compile(caption_regex, re.IGNORECASE)

    candidates = (paragraph.strip() for paragraph in doc)
    incorrect_fig_captions = [
        text
        for text in candidates
        if text.lower().startswith("figure") and not figure_caption_pattern.match(text)
    ]

    return not incorrect_fig_captions, incorrect_fig_captions
160
+
161
def table_figure_reference_check(doc, doc_type):
    """Check for incorrectly numbered in-text references to tables and figures.

    Advisory Circulars and Orders must reference "Table X-Y" / "Figure X-Y",
    so a bare "Table X" is reported. All other document types must reference
    "Table X" / "Figure X", so a hyphenated "Table X-Y" is reported.
    Paragraphs that start with "table" or "figure" are treated as captions
    and skipped.

    Args:
        doc (list): Paragraph texts from the document.
        doc_type (str): Document type selected by the user.

    Returns:
        tuple: (bool, list) - True when no incorrect reference was found, plus
        the matched reference texts (tables first, then figures, for each
        paragraph).
    """
    if doc_type in ["Advisory Circular", "Order"]:
        # A number that is not followed by "-<digits>" is missing its suffix.
        incorrect_table_ref_pattern = re.compile(r'\bTable\s+\d+(?!-\d+)\b', re.IGNORECASE)
        incorrect_figure_ref_pattern = re.compile(r'\bFigure\s+\d+(?!-\d+)\b', re.IGNORECASE)
    else:
        # Only the hyphenated form is wrong here. The patterns have no
        # capturing group so findall() returns the full reference text.
        incorrect_table_ref_pattern = re.compile(r'\bTable\s+\d+-\d+\b', re.IGNORECASE)
        incorrect_figure_ref_pattern = re.compile(r'\bFigure\s+\d+-\d+\b', re.IGNORECASE)

    incorrect_table_figure_references = []

    for paragraph in doc:
        # Captions are validated by the caption checks, not here.
        if paragraph.strip().lower().startswith(('table', 'figure')):
            continue
        incorrect_table_figure_references.extend(
            incorrect_table_ref_pattern.findall(paragraph)
        )
        incorrect_table_figure_references.extend(
            incorrect_figure_ref_pattern.findall(paragraph)
        )

    return not incorrect_table_figure_references, incorrect_table_figure_references
190
+
191
def _run_is_italic_at(paragraph, position):
    """Return True when the run covering character ``position`` is italic."""
    offset = 0
    for run in paragraph.runs:
        run_end = offset + len(run.text)
        if offset <= position < run_end:
            # NOTE(review): python-docx documents run.italic as tri-state
            # (True/False/None for "inherited"); None is treated as not italic.
            return bool(run.italic)
        offset = run_end
    return False


def document_title_check(doc_path, doc_type):
    """Check that titles following an "AC <number>" reference are styled correctly.

    Each document type requires referenced titles to be italicized, quoted,
    or neither. The title is the text after "AC <number>" up to the next
    period, comma, or end of paragraph. Italics are judged from the run that
    contains the first character of the title.

    Args:
        doc_path (str): Path to the .docx file to inspect.
        doc_type (str): Document type; selects the formatting rule.

    Returns:
        tuple: (bool, list) - True when no styling issue was found, plus a
        list of {'text': <matched reference>, 'issue': <description>} dicts.

    Raises:
        ValueError: If ``doc_type`` has no formatting rule.
    """
    incorrect_titles = []
    doc = Document(doc_path)

    ac_pattern = re.compile(r'AC\s+\d+(?:-\d+)?(?:,|\s)+(.+?)(?=\.|,|$)')

    # Straight quotes plus typographic quotes, written as escapes so the
    # characters cannot be flattened into straight quotes by tooling.
    quote_chars = ('"', "'", '\u201c', '\u201d', '\u2018', '\u2019')

    formatting_rules = {
        "Advisory Circular": {"italics": True, "quotes": False},
        "Airworthiness Criteria": {"italics": False, "quotes": True},
        "Deviation Memo": {"italics": False, "quotes": True},
        "Exemption": {"italics": False, "quotes": True},
        "Federal Register Notice": {"italics": False, "quotes": True},
        "Handbook/Manual": {"italics": False, "quotes": False},
        "Order": {"italics": False, "quotes": True},
        "Policy Statement": {"italics": False, "quotes": False},
        "Rule": {"italics": False, "quotes": True},
        "Special Condition": {"italics": False, "quotes": True},
        "Technical Standard Order": {"italics": False, "quotes": True},
        "Other": {"italics": False, "quotes": False},
    }

    if doc_type not in formatting_rules:
        raise ValueError(f"Unsupported document type: {doc_type}")

    required_format = formatting_rules[doc_type]

    for paragraph in doc.paragraphs:
        for match in ac_pattern.finditer(paragraph.text):
            title_text = match.group(1).strip()
            title_in_quotes = any(q in title_text for q in quote_chars)
            title_is_italicized = _run_is_italic_at(paragraph, match.start(1))

            issue_message = []

            if required_format["italics"] and not title_is_italicized:
                issue_message.append("should be italicized")
            elif not required_format["italics"] and title_is_italicized:
                issue_message.append("should not be italicized")

            if required_format["quotes"] and not title_in_quotes:
                issue_message.append("should be in quotes")
            elif not required_format["quotes"] and title_in_quotes:
                issue_message.append("should not be in quotes")

            if issue_message:
                incorrect_titles.append({
                    'text': match.group(0),
                    'issue': ', '.join(issue_message)
                })

    return not incorrect_titles, incorrect_titles
272
+
273
def get_document_checks(doc_type, template_type):
    """Look up the required headings for a document type.

    Advisory Circulars are keyed a second time by ``template_type``; every
    other document type ignores it. An unknown document type, or an unknown
    template for an Advisory Circular, yields an empty dict.

    Returns:
        dict: ``{"required_headings": [...]}`` or ``{}``.
    """
    log = logging.getLogger(__name__)

    # Marker used for document types whose headings are not yet known.
    pending = "TBD - Need to research"

    advisory_circular_templates = {
        "Short AC template AC": {
            "required_headings": [
                "PURPOSE.",
                "APPLICABILITY.",
                "CANCELLATION.",
                "RELATED MATERIAL.",
                "DEFINITION OF KEY TERMS.",
            ]
        },
        "Long AC template AC": {
            "required_headings": [
                "Purpose.",
                "Applicability.",
                "Cancellation.",
                "Related Material.",
                "Definition of Key Terms.",
            ]
        },
    }

    catalogue = {
        "Advisory Circular": advisory_circular_templates,
        "Airworthiness Criteria": {"required_headings": [pending]},
        "Deviation Memo": {"required_headings": [pending]},
        "Exemption": {"required_headings": [pending]},
        "Federal Register Notice": {
            "required_headings": [
                "Purpose of This Notice",
                "Audience",
                "Where can I Find This Notice",
            ]
        },
        "Handbook/Manual": {"required_headings": [pending]},
        "Order": {
            "required_headings": [
                "Purpose of This Order.",
                "Audience.",
                "Where to Find This Order.",
            ]
        },
        "Policy Statement": {
            "required_headings": [
                "SUMMARY",
                "CURRENT REGULATORY AND ADVISORY MATERIAL",
                "RELEVANT PAST PRACTICE",
                "POLICY",
                "EFFECT OF POLICY",
                "CONCLUSION",
            ]
        },
        "Rule": {"required_headings": [pending]},
        "Special Condition": {"required_headings": [pending]},
        "Technical Standard Order": {
            "required_headings": [
                "PURPOSE.",
                "APPLICABILITY.",
                "REQUIREMENTS.",
                "MARKING.",
                "APPLICATION DATA REQUIREMENTS.",
                "MANUFACTURER DATA REQUIREMENTS.",
                "FURNISHED DATA REQUIREMENTS.",
                "HOW TO GET REFERENCED DOCUMENTS.",
            ]
        },
        "Other": {"required_headings": ["N/A"]},
    }

    log.info(f"Requested document type: {doc_type}")
    log.info(f"Requested template type: {template_type}")

    checks = catalogue.get(doc_type, {})
    if doc_type == "Advisory Circular":
        # Advisory Circular entries are nested one level deeper, by template.
        checks = checks.get(template_type, {})

    log.info(f"Retrieved checks: {checks}")
    return checks
381
+
382
def double_period_check(doc):
    """Find sentences whose text ends in two periods.

    Each paragraph is split after '.', '!' or '?' when followed by one or
    more spaces; any resulting piece ending in '..' is reported.

    Returns:
        tuple: (bool, list) - True when nothing was found, plus the stripped
        text of each offending sentence.
    """
    sentence_break = re.compile(r'(?<=[.!?]) +')

    incorrect_sentences = [
        piece.strip()
        for paragraph in doc
        for piece in sentence_break.split(paragraph)
        if piece.endswith('..')
    ]

    return not incorrect_sentences, incorrect_sentences
394
+
395
def spacing_check(doc):
    """
    Check for spacing problems in US federal regulatory documents.

    A paragraph is reported when it contains any of:
    - a document type run together with its number (e.g. "AC20-114")
    - a section symbol run together with its number (e.g. "\u00a725.301")
    - "Part" run together with its number (e.g. "Part25")
    - an opening "(x" that is not closed straight away and is not preceded
      by whitespace
    - two or more consecutive whitespace characters

    Args:
        doc (list): Paragraph texts from the document.

    Returns:
        tuple: (bool, list) - True when no problem was found, plus the
        offending paragraphs, each listed once in document order.
    """
    spacing_patterns = (
        # A word boundary, not "no whitespace before", anchors the token, so
        # the usual case of the token following a space is detected.
        re.compile(r'\b(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)', re.IGNORECASE),
        # \u00a7 is the section sign, escaped to keep the source ASCII-safe.
        re.compile(r'(\u00a7{1,2})(\d+\.\d+)'),
        re.compile(r'\bPart(\d+)', re.IGNORECASE),
        # NOTE(review): kept unchanged; it flags "(x" not followed by ")" and
        # not preceded by whitespace. Confirm the intended rule.
        re.compile(r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))', re.IGNORECASE),
        re.compile(r'\s{2,}'),
    )

    # Report each paragraph once even when several rules fire on it.
    incorrect_spacing = [
        paragraph
        for paragraph in doc
        if any(pattern.search(paragraph) for pattern in spacing_patterns)
    ]

    return not incorrect_spacing, incorrect_spacing
436
+
437
def check_prohibited_phrases(doc):
    """Find paragraphs that contain prohibited words or phrases.

    Matching is case-insensitive and on whole words.

    Args:
        doc (list): Paragraph texts from the document.

    Returns:
        list: (phrase, stripped paragraph) tuples, one per phrase found in
        each paragraph, in document order.
    """
    prohibited_phrases = ['above', 'below', 'there is', 'there are']

    # The plain phrase is kept next to its compiled pattern so the reported
    # label never has to be recovered from the regex text.
    compiled = [
        (phrase, re.compile(r'\b' + re.escape(phrase) + r'\b', re.IGNORECASE))
        for phrase in prohibited_phrases
    ]

    issues = []
    for paragraph in doc:
        for phrase, pattern in compiled:
            if pattern.search(paragraph):
                issues.append((phrase, paragraph.strip()))
    return issues
451
+
452
def _expansion_for(raw_term, acronym):
    """Trim leading words from ``raw_term`` when its last words spell ``acronym``."""
    term = raw_term.strip()
    words = list(re.finditer(r'\S+', term))
    if len(words) > len(acronym):
        tail = words[-len(acronym):]
        initials = ''.join(word.group()[0] for word in tail).upper()
        if initials == acronym:
            # Slice the original text so internal spacing is preserved and
            # the result is still a substring of the paragraph.
            return term[tail[0].start():]
    return term


def check_abbreviation_usage(doc):
    """Find places where a full term is spelled out again after being abbreviated.

    A definition looks like "Federal Aviation Administration (FAA)". The
    definition pattern captures all words before the parenthesis, so when
    the last words' initials spell the acronym, only those words are kept as
    the full term. Otherwise the whole captured text is used.

    Args:
        doc (list): Paragraph texts from the document, in reading order.

    Returns:
        list: (full_term, acronym, stripped paragraph) tuples for each later
        paragraph that contains the full term.
    """
    definition_pattern = re.compile(r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)')
    abbreviations = {}
    issues = []

    for paragraph in doc:
        for raw_term, acronym in definition_pattern.findall(paragraph):
            if acronym not in abbreviations:
                abbreviations[acronym] = {
                    "full_term": _expansion_for(raw_term, acronym),
                    "defined": True,
                }

        for acronym, data in abbreviations.items():
            full_term = data["full_term"]
            if full_term not in paragraph:
                continue
            if data["defined"]:
                # First paragraph containing the term is the definition itself.
                data["defined"] = False
            else:
                issues.append((full_term, acronym, paragraph.strip()))

    return issues
475
+
476
def check_date_formats(doc):
    """Find numeric slash dates (e.g. "1/2/2020").

    Every slash date is reported. The previous "is it already in the
    'Month D, YYYY' format" test could never succeed on a string that
    starts with a digit, so it has been removed.

    Args:
        doc (list): Paragraph texts from the document.

    Returns:
        list: (date, stripped paragraph) tuples, one per slash date found,
        in document order.
    """
    slash_date_pattern = re.compile(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b')

    return [
        (date, paragraph.strip())
        for paragraph in doc
        for date in slash_date_pattern.findall(paragraph)
    ]
488
+
489
def check_placeholders(doc):
    """Find paragraphs that still contain placeholder text.

    Matching is case-insensitive and on whole words.

    Args:
        doc (list): Paragraph texts from the document.

    Returns:
        list: (placeholder, stripped paragraph) tuples, one per placeholder
        found in each paragraph, in document order.
    """
    placeholder_phrases = ['TBD', 'To be determined', 'To be added']

    # The label is stored separately from the regex. Recovering it with
    # str.strip(r'\b') strips a character set, which would eat a leading or
    # trailing 'b' of any future phrase.
    compiled = [
        (phrase, re.compile(r'\b' + re.escape(phrase) + r'\b', re.IGNORECASE))
        for phrase in placeholder_phrases
    ]

    issues = []
    for paragraph in doc:
        for phrase, pattern in compiled:
            if pattern.search(paragraph):
                issues.append((phrase, paragraph.strip()))
    return issues
502
+
503
def _append_check_section(results, heading, passed, ok_text, fail_header, fail_lines, blank_after=True):
    """Append one "## heading" section with a pass line or a failure list."""
    results.append(heading)
    if passed:
        results.append(f"\u2705 {ok_text}\n")
        return
    results.append(f"\u274c {fail_header}")
    results.extend(fail_lines)
    if blank_after:
        results.append("")


def format_results_for_gradio(heading_valid, headings_found, acronyms_valid, undefined_acronyms,
                              legal_valid, incorrect_legal_references, table_valid, incorrect_captions,
                              figure_valid, incorrect_fig_captions, references_valid, incorrect_table_figure_references,
                              title_style_valid, incorrect_titles, required_headings, doc_type, double_period_valid,
                              incorrect_sentences, spacing_valid, incorrect_spacing, abbreviation_issues, date_issues, placeholder_issues):
    """Render the outcome of every check as one Markdown string.

    Each check gets a "## ..." section holding a pass line (check mark) or a
    failure header (cross mark) followed by one bullet per finding.

    Args:
        heading_valid, acronyms_valid, legal_valid, table_valid, figure_valid,
        references_valid, title_style_valid, double_period_valid,
        spacing_valid (bool): Pass/fail flag of the matching check.
        headings_found (list): Required headings found in the document.
        undefined_acronyms (iterable): Acronyms used without a definition.
        incorrect_legal_references (list): (incorrect, correct) term pairs.
        incorrect_captions, incorrect_fig_captions (list): Malformed captions.
        incorrect_table_figure_references (list): Malformed references.
        incorrect_titles (list): Dicts with 'text' and 'issue' keys.
        required_headings (list): Headings the document type requires.
        doc_type (str): Document type; selects the title-style note.
        incorrect_sentences (list): Sentences ending in a double period.
        incorrect_spacing (list): Paragraphs with spacing problems.
        abbreviation_issues (list): (full_term, acronym, paragraph) tuples.
        date_issues (list): (date, paragraph) tuples.
        placeholder_issues (list): (phrase, paragraph) tuples.

    Returns:
        str: The Markdown report.
    """
    results = ["# Document Check Results\n"]

    # Missing headings are listed in required order (not set order) so the
    # report is stable between runs.
    found = set(headings_found)
    missing_headings = [h for h in dict.fromkeys(required_headings) if h not in found]
    _append_check_section(
        results, "## Required Headings Check", heading_valid,
        "All required headings are present.",
        "Missing Required Headings:",
        [f"- {heading}" for heading in missing_headings],
    )

    # The acronym failure is a single line, so it is built by hand. The
    # acronyms are sorted because the check returns an unordered set.
    results.append("## Acronym Check")
    if acronyms_valid:
        results.append("\u2705 All acronyms are properly defined.\n")
    else:
        results.append(
            "\u274c The following acronyms need to be defined at first use: "
            f"{', '.join(sorted(undefined_acronyms))}\n"
        )

    _append_check_section(
        results, "## Legal Terminology Check", legal_valid,
        "All legal references are properly formatted.",
        "Incorrect Legal Terminology:",
        [
            f"- Use '{correct_term}' instead of '{incorrect_term}'"
            for incorrect_term, correct_term in incorrect_legal_references
        ],
    )

    _append_check_section(
        results, "## Table Caption Check", table_valid,
        "All table captions are correctly formatted.",
        "Incorrect Table Captions:",
        [f"- {caption}" for caption in incorrect_captions],
    )

    _append_check_section(
        results, "## Figure Caption Check", figure_valid,
        "All figure captions are correctly formatted.",
        "Incorrect Figure Captions:",
        [f"- {caption}" for caption in incorrect_fig_captions],
    )

    _append_check_section(
        results, "## Table and Figure References Check", references_valid,
        "All table and figure references are correctly formatted.",
        "Incorrect Table/Figure References:",
        [f"- {ref}" for ref in incorrect_table_figure_references],
    )

    # Keys must match the document type names offered in the UI.
    formatting_notes = {
        "Advisory Circular": "Document titles should be italicized, not in quotation marks.",
        "Order": "Document titles should be in quotation marks, not italicized.",
        "Federal Register Notice": "Document titles should be in quotation marks, not italicized.",
        "Policy Statement": "Document titles should not have any special formatting (no italics, no quotation marks).",
    }
    title_lines = []
    for title in incorrect_titles:
        title_lines.append(f"- {title['text']}")
        title_lines.append(f" - Issue: {title['issue']}")
    if doc_type in formatting_notes:
        title_lines.append(f"\nNote: {formatting_notes[doc_type]}")
    else:
        title_lines.append("\nNote: Please verify the correct formatting style for this document type.")
    _append_check_section(
        results, "## Document Title Style Check", title_style_valid,
        "All document title references are properly styled.",
        "Incorrect Document Title Styling:",
        title_lines,
    )

    _append_check_section(
        results, "## Double Period Check", double_period_valid,
        "No double periods found.",
        "Sentences found with double periods:",
        [f"- {sentence}" for sentence in incorrect_sentences],
    )

    _append_check_section(
        results, "## Spacing Check", spacing_valid,
        "All spacing is correct.",
        "Incorrect spacing found in:",
        [f"- {spacing}" for spacing in incorrect_spacing],
    )

    _append_check_section(
        results, "## Abbreviation Consistency", not abbreviation_issues,
        "All abbreviations are used consistently after definition.",
        "Abbreviation Issues:",
        [
            f"- Use '{acronym}' instead of '{full_term}' in: {paragraph}"
            for full_term, acronym, paragraph in abbreviation_issues
        ],
    )

    _append_check_section(
        results, "## Date Format Consistency", not date_issues,
        "All dates are in the correct format.",
        "Date Format Issues:",
        [
            f"- Incorrect date format '{date}' in: {paragraph}"
            for date, paragraph in date_issues
        ],
    )

    # Last section: no trailing blank line after the findings.
    _append_check_section(
        results, "## Placeholder Check", not placeholder_issues,
        "No placeholders found.",
        "Placeholders Found:",
        [
            f"- Placeholder '{phrase}' in: {paragraph}"
            for phrase, paragraph in placeholder_issues
        ],
        blank_after=False,
    )

    return "\n".join(results)
644
+
645
+ # Modify the process_document function to return formatted results instead of writing to file
646
def process_document(file_obj, doc_type, template_type):
    """Run every document check on an uploaded .docx and return a Markdown report.

    Args:
        file_obj: The uploaded document as raw bytes, an object with a
            ``read()`` method, or a filesystem path. ``None`` means nothing
            was uploaded.
        doc_type (str): Document type selected by the user.
        template_type (str): Template name; only used for Advisory Circulars.

    Returns:
        str: The Markdown report, or a short prompt when no file was given.
    """
    logger = logging.getLogger(__name__)

    if file_obj is None:
        return "Please upload a Word document (.docx) to check."

    # gr.File(type="binary") passes the upload as bytes, which have no
    # read() method; file-like objects and paths are accepted as well.
    if isinstance(file_obj, (bytes, bytearray)):
        payload = bytes(file_obj)
    elif hasattr(file_obj, "read"):
        payload = file_obj.read()
    else:
        with open(file_obj, "rb") as source:
            payload = source.read()

    # Document() is given a path, so the upload is written to a temp file.
    # The write happens inside the try so a failed write cannot leak it.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.docx')
    tmp_path = tmp_file.name

    try:
        with tmp_file:
            tmp_file.write(payload)

        # Step 1: Read the Word document
        doc = read_word_document(tmp_path)
        logger.info("Document read successfully.")

        # Get required headings for document type and template type
        checks = get_document_checks(doc_type, template_type)
        required_headings = checks.get("required_headings", [])

        # Step 2: Perform all checks
        heading_valid, headings_found = heading_title_check(doc, required_headings)
        acronyms_valid, undefined_acronyms = acronym_check(doc)
        legal_valid, incorrect_legal_references = legal_check(doc)
        table_valid, incorrect_captions = table_caption_check(doc, doc_type)
        figure_valid, incorrect_fig_captions = figure_caption_check(doc, doc_type)
        references_valid, incorrect_table_figure_references = table_figure_reference_check(doc, doc_type)
        # The title check needs run-level formatting, so it reopens the file.
        title_style_valid, incorrect_titles = document_title_check(tmp_path, doc_type)
        double_period_valid, incorrect_sentences = double_period_check(doc)
        spacing_valid, incorrect_spacing = spacing_check(doc)
        abbreviation_issues = check_abbreviation_usage(doc)
        date_issues = check_date_formats(doc)
        placeholder_issues = check_placeholders(doc)

        # Format results for Gradio
        return format_results_for_gradio(
            heading_valid, headings_found,
            acronyms_valid, undefined_acronyms,
            legal_valid, incorrect_legal_references,
            table_valid, incorrect_captions,
            figure_valid, incorrect_fig_captions,
            references_valid, incorrect_table_figure_references,
            title_style_valid, incorrect_titles,
            required_headings, doc_type,
            double_period_valid, incorrect_sentences,
            spacing_valid, incorrect_spacing,
            abbreviation_issues, date_issues,
            placeholder_issues
        )

    finally:
        # Clean up the temporary file
        os.unlink(tmp_path)
699
+
700
+ # Create the Gradio interface
701
def create_gradio_interface():
    """Build the Gradio interface: file upload, type selectors, Markdown output."""
    template_types = ["Short AC template AC", "Long AC template AC"]
    document_types = [
        "Advisory Circular", "Airworthiness Criteria", "Deviation Memo", "Exemption",
        "Federal Register Notice", "Handbook/Manual", "Order", "Policy Statement",
        "Rule", "Special Condition", "Technical Standard Order", "Other"
    ]

    def process_file(file_obj, doc_type, template_type):
        # Templates only apply to Advisory Circulars; any other type is
        # passed "N/A" whatever the radio button shows.
        effective_template = template_type if doc_type == "Advisory Circular" else "N/A"
        return process_document(file_obj, doc_type, effective_template)

    upload_input = gr.File(label="Upload Word Document (.docx)", type="binary")
    doc_type_input = gr.Dropdown(choices=document_types, label="Document Type")
    template_input = gr.Radio(
        choices=template_types,
        label="Template Type (Only for Advisory Circular)",
        visible=True,
    )

    return gr.Interface(
        fn=process_file,
        inputs=[upload_input, doc_type_input, template_input],
        outputs=gr.Markdown(label="Check Results"),
        title="FAA Document Checker",
        description="Upload a Word document to check for compliance with FAA documentation standards.",
        article="This tool checks document formatting, headings, acronyms, legal references, and more.",
        theme="default",
    )
731
+
732
+ # Launch the Gradio interface
733
if __name__ == "__main__":
    # Configure logging, then build and launch the Gradio app.
    setup_logging()
    create_gradio_interface().launch()