Hoctar77 committed on
Commit
8c95735
·
verified ·
1 Parent(s): f4450e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +197 -397
app.py CHANGED
@@ -3,19 +3,7 @@ import logging
3
  import re
4
  from docx import Document
5
  import io
6
-
7
- def setup_logging():
8
- """Initialize logging configuration."""
9
- logging.basicConfig(
10
- level=logging.INFO,
11
- format='%(asctime)s %(levelname)s %(name)s - %(message)s',
12
- handlers=[logging.StreamHandler()]
13
- )
14
-
15
- def read_word_document(doc_path):
16
- """Read a Word document and return its content as a list of paragraphs."""
17
- doc = Document(doc_path)
18
- return [para.text for para in doc.paragraphs if para.text.strip() != ""]
19
 
20
  def heading_title_check(doc, required_headings):
21
  """Check if all required headings are present."""
@@ -64,63 +52,47 @@ def acronym_check(doc):
64
  return len(undefined_acronyms) == 0, list(undefined_acronyms)
65
 
66
  def legal_check(doc):
67
- """Check for correct legal references in the document and suggest corrections.
68
-
69
- Args:
70
- doc (list): List of paragraphs/strings to check
71
-
72
- Returns:
73
- tuple: (bool, list) - (True if no errors found, list of (incorrect, correct) terms)
74
- """
75
- # Mapping of incorrect terms to their correct versions
76
- incorrect_variations = {
77
- r"\bUSC\b": "U.S.C.",
78
- r"\bCFR Part\b": "CFR part",
79
- r"\bC\.F\.R\.\b": "CFR",
80
- r"\bWe\b": "The FAA",
81
- r"\bwe\b": "the FAA",
82
- r"\bcancelled\b": "canceled",
83
- r"\bshall\b": "must or will",
84
- r"\b&\b": "and"
85
- }
86
-
87
- # List to store tuples of incorrect terms and their correct versions
88
  incorrect_legal_references = []
89
 
90
- for paragraph in doc:
91
- # Special handling for "Title 14" / "title 14"
92
- title_14_pattern = r"(?P<prefix>^|[.!?\s])\s*(?P<title>title 14|Title 14)\b"
93
- matches = re.finditer(title_14_pattern, paragraph)
94
-
95
- for match in matches:
96
- prefix = match.group('prefix')
97
- current_title = match.group('title')
98
-
99
- # If it follows a sentence-ending punctuation or is at start, it should be "Title 14"
100
- if prefix in ('.', '!', '?', '') and current_title.lower() == "title 14":
101
- if current_title != "Title 14":
102
- incorrect_legal_references.append((current_title, "Title 14"))
103
- # If it's within a sentence, it should be "title 14"
104
- elif prefix.isspace() and current_title != "title 14":
105
- incorrect_legal_references.append((current_title, "title 14"))
106
 
107
- # Check other variations
108
- for incorrect_pattern, correct_term in incorrect_variations.items():
109
- matches = re.finditer(incorrect_pattern, paragraph)
110
- for match in matches:
111
- incorrect_legal_references.append((match.group(), correct_term))
 
 
 
 
112
 
113
  return len(incorrect_legal_references) == 0, incorrect_legal_references
114
 
115
  def table_caption_check(doc, doc_type):
116
  """Check if table captions are formatted correctly."""
117
  incorrect_captions = []
 
118
  try:
119
  # Check table captions
120
  for table in doc.tables:
121
  # Get the paragraph before the table
122
- table._element.getprevious()
123
- # Add your caption checking logic here
 
 
 
 
 
 
 
 
124
  except Exception as e:
125
  print(f"Error in table caption check: {str(e)}")
126
  return False, []
@@ -128,373 +100,198 @@ def table_caption_check(doc, doc_type):
128
  return len(incorrect_captions) == 0, incorrect_captions
129
 
130
  def figure_caption_check(doc, doc_type):
131
- """
132
- Check for correctly formatted figure captions in the document.
133
- Supports both numeric (Figure 1-2) and alphanumeric (Figure C-1) formats.
134
- """
135
- if doc_type in ["Advisory Circular", "Order"]:
136
- # Pattern for "Figure X-Y" where X and Y can be either letters or numbers
137
- figure_caption_pattern = re.compile(r'^Figure\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]', re.IGNORECASE)
138
- else:
139
- # Pattern for "Figure X" where X can be either letters or numbers
140
- figure_caption_pattern = re.compile(r'^Figure\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE)
141
-
142
  incorrect_fig_captions = []
143
- for paragraph in doc:
144
- paragraph_strip = paragraph.strip()
145
- if paragraph_strip.lower().startswith("figure"):
146
- if not figure_caption_pattern.match(paragraph_strip):
147
- incorrect_fig_captions.append(paragraph_strip)
148
-
 
 
 
 
 
 
 
 
 
 
 
149
  return len(incorrect_fig_captions) == 0, incorrect_fig_captions
150
 
151
  def table_figure_reference_check(doc, doc_type):
152
- """Check for incorrect references to tables and figures in the document."""
153
  incorrect_table_figure_references = []
154
 
155
- if doc_type in ["Advisory Circular", "Order"]:
156
- # For Advisory Circulars and Orders, correct references are "Table X-Y" or "Figure X-Y"
157
- incorrect_table_ref_pattern = re.compile(r'\bTable\s+\d+(?!-\d+)\b', re.IGNORECASE)
158
- incorrect_figure_ref_pattern = re.compile(r'\bFigure\s+\d+(?!-\d+)\b', re.IGNORECASE)
159
- else:
160
- # For other document types, correct references are "Table X" or "Figure X"
161
- incorrect_table_ref_pattern = re.compile(r'\bTable\s+\d+(-\d+)?\b', re.IGNORECASE)
162
- incorrect_figure_ref_pattern = re.compile(r'\bFigure\s+\d+(-\d+)?\b', re.IGNORECASE)
163
-
164
- for paragraph in doc:
165
- paragraph_strip = paragraph.strip()
166
- # Exclude captions
167
- starts_with_table_or_figure = paragraph_strip.lower().startswith('table') or paragraph_strip.lower().startswith('figure')
168
- if not starts_with_table_or_figure:
169
- # Find incorrect table references
170
- incorrect_tables = incorrect_table_ref_pattern.findall(paragraph)
171
- if incorrect_tables:
172
- incorrect_table_figure_references.extend(incorrect_tables)
173
- # Find incorrect figure references
174
- incorrect_figures = incorrect_figure_ref_pattern.findall(paragraph)
175
- if incorrect_figures:
176
- incorrect_table_figure_references.extend(incorrect_figures)
177
-
178
- # Return False if any incorrect references are found
179
  return len(incorrect_table_figure_references) == 0, incorrect_table_figure_references
180
 
181
- def document_title_check(doc_path, doc_type):
 
182
  incorrect_titles = []
183
- doc = Document(doc_path)
184
-
185
- # Updated pattern to capture titles correctly
186
- ac_pattern = re.compile(r'AC\s+\d+(?:-\d+)?(?:,|\s)+(.+?)(?=\.|,|$)')
187
-
188
- # Define formatting rules for different document types
189
- formatting_rules = {
190
- "Advisory Circular": {"italics": True, "quotes": False},
191
- "Airworthiness Criteria": {"italics": False, "quotes": True},
192
- "Deviation Memo": {"italics": False, "quotes": True},
193
- "Exemption": {"italics": False, "quotes": True},
194
- "Federal Register Notice": {"italics": False, "quotes": True},
195
- "Handbook/Manual": {"italics": False, "quotes": False},
196
- "Order": {"italics": False, "quotes": True},
197
- "Policy Statement": {"italics": False, "quotes": False},
198
- "Rule": {"italics": False, "quotes": True},
199
- "Special Condition": {"italics": False, "quotes": True},
200
- "Technical Standard Order": {"italics": False, "quotes": True},
201
- "Other": {"italics": False, "quotes": False}
202
- }
203
-
204
- # Get the rules for the current document type
205
- if doc_type not in formatting_rules:
206
- raise ValueError(f"Unsupported document type: {doc_type}")
207
-
208
- required_format = formatting_rules[doc_type]
209
-
210
- for paragraph in doc.paragraphs:
211
- text = paragraph.text
212
- matches = ac_pattern.finditer(text)
213
-
214
- for match in matches:
215
- full_match = match.group(0)
216
- title_text = match.group(1).strip()
217
-
218
- # Get the position where the title starts
219
- title_start = match.start(1)
220
-
221
- # Check for any type of quotation marks, including smart quotes
222
- title_in_quotes = any(q in title_text for q in ['"', "'", '"', '"', ''', '''])
223
-
224
- # Check the formatting of the title
225
- title_is_italicized = False
226
- current_pos = 0
227
- for run in paragraph.runs:
228
- run_length = len(run.text)
229
- if current_pos <= title_start < current_pos + run_length:
230
- relative_pos = title_start - current_pos
231
- title_is_italicized = run.italic
232
- break
233
- current_pos += run_length
234
-
235
- # Check if formatting matches the required format
236
- formatting_incorrect = False
237
- issue_message = []
238
-
239
- # Check italics requirement
240
- if required_format["italics"] and not title_is_italicized:
241
- formatting_incorrect = True
242
- issue_message.append("should be italicized")
243
- elif not required_format["italics"] and title_is_italicized:
244
- formatting_incorrect = True
245
- issue_message.append("should not be italicized")
246
-
247
- # Check quotes requirement
248
- if required_format["quotes"] and not title_in_quotes:
249
- formatting_incorrect = True
250
- issue_message.append("should be in quotes")
251
- elif not required_format["quotes"] and title_in_quotes:
252
- formatting_incorrect = True
253
- issue_message.append("should not be in quotes")
254
 
255
- if formatting_incorrect:
256
- incorrect_titles.append({
257
- 'text': full_match,
258
- 'issue': ', '.join(issue_message)
259
- })
 
 
 
 
 
 
 
 
 
 
 
260
 
261
  return len(incorrect_titles) == 0, incorrect_titles
262
 
263
- def get_document_checks(doc_type, template_type):
264
- """Return expected outline and required headings based on document type and template type."""
265
- document_checks = {
266
- "Advisory Circular": {
267
- "Short AC template AC": {
268
- "required_headings": [
269
- "PURPOSE.",
270
- "APPLICABILITY.",
271
- "CANCELLATION.",
272
- "RELATED MATERIAL.",
273
- "DEFINITION OF KEY TERMS."
274
- ]
275
- },
276
- "Long AC template AC": {
277
- "required_headings": [
278
- "Purpose.",
279
- "Applicability.",
280
- "Cancellation.",
281
- "Related Material.",
282
- "Definition of Key Terms."
283
- ]
284
- }
285
- },
286
- "Airworthiness Criteria": {
287
- "required_headings": [
288
- "TBD - Need to research"
289
- ]
290
- },
291
- "Deviation Memo": {
292
- "required_headings": [
293
- "TBD - Need to research"
294
- ]
295
- },
296
- "Exemption": {
297
- "required_headings": [
298
- "TBD - Need to research"
299
- ]
300
- },
301
- "Federal Register Notice": {
302
- "required_headings": [
303
- "Purpose of This Notice",
304
- "Audience",
305
- "Where can I Find This Notice"
306
- ]
307
- },
308
- "Handbook/Manual": {
309
- "required_headings": [
310
- "TBD - Need to research"
311
- ]
312
- },
313
- "Order": {
314
- "required_headings": [
315
- "Purpose of This Order.",
316
- "Audience.",
317
- "Where to Find This Order."
318
- ]
319
- },
320
- "Policy Statement": {
321
- "required_headings": [
322
- "SUMMARY",
323
- "CURRENT REGULATORY AND ADVISORY MATERIAL",
324
- "RELEVANT PAST PRACTICE",
325
- "POLICY",
326
- "EFFECT OF POLICY",
327
- "CONCLUSION"
328
- ]
329
- },
330
- "Rule": {
331
- "required_headings": [
332
- "TBD - Need to research"
333
- ]
334
- },
335
- "Special Condition": {
336
- "required_headings": [
337
- "TBD - Need to research"
338
- ]
339
- },
340
- "Technical Standard Order": {
341
- "required_headings": [
342
- "PURPOSE.",
343
- "APPLICABILITY.",
344
- "REQUIREMENTS.",
345
- "MARKING.",
346
- "APPLICATION DATA REQUIREMENTS.",
347
- "MANUFACTURER DATA REQUIREMENTS.",
348
- "FURNISHED DATA REQUIREMENTS.",
349
- "HOW TO GET REFERENCED DOCUMENTS."
350
- ]
351
- },
352
- "Other": {
353
- "required_headings": [
354
- "N/A"
355
- ]
356
- }
357
- }
358
-
359
- # Add debugging logs
360
- logger = logging.getLogger(__name__)
361
- logger.info(f"Requested document type: {doc_type}")
362
- logger.info(f"Requested template type: {template_type}")
363
-
364
- if doc_type == "Advisory Circular":
365
- checks = document_checks.get(doc_type, {}).get(template_type, {})
366
- else:
367
- checks = document_checks.get(doc_type, {})
368
-
369
- logger.info(f"Retrieved checks: {checks}")
370
- return checks
371
-
372
  def double_period_check(doc):
373
- """Check for sentences that end with two periods."""
374
  incorrect_sentences = []
375
-
376
- for paragraph in doc:
377
- # Split the paragraph into sentences based on common sentence-ending punctuation
378
- sentences = re.split(r'(?<=[.!?]) +', paragraph)
379
- for sentence in sentences:
380
- if sentence.endswith('..'):
381
- incorrect_sentences.append(sentence.strip()) # Log the incorrectly formatted sentence
382
-
383
- return len(incorrect_sentences) == 0, incorrect_sentences # Return True if no double periods are found, along with any incorrect sentences
 
 
384
 
385
  def spacing_check(doc):
386
- """
387
- Check for correct spacing in US federal regulatory documents.
388
- Checks for:
389
- - Spacing between document type and number (e.g., "AC 20-114")
390
- - Spacing around section symbols (e.g., "§ 25.301")
391
- - Spacing around part numbers (e.g., "Part 25")
392
- - Spacing around paragraph indications (e.g., "(a)", "(1)")
393
- - Double spaces between words
394
- """
395
  incorrect_spacing = []
396
-
397
- # Regex patterns to find incorrect spacing
398
- doc_type_pattern = re.compile(r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)', re.IGNORECASE)
399
- section_symbol_pattern = re.compile(r'(?<!\s)(§|§§)(\d+\.\d+)', re.IGNORECASE)
400
- part_number_pattern = re.compile(r'(?<!\s)Part(\d+)', re.IGNORECASE)
401
- paragraph_pattern = re.compile(r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))', re.IGNORECASE)
402
- double_space_pattern = re.compile(r'\s{2,}')
403
-
404
- for paragraph in doc:
405
- # Check for incorrect document type spacing
406
- if doc_type_pattern.search(paragraph):
407
- incorrect_spacing.append(paragraph)
408
-
409
- # Check for incorrect section symbol spacing
410
- if section_symbol_pattern.search(paragraph):
411
- incorrect_spacing.append(paragraph)
412
-
413
- # Check for incorrect part number spacing
414
- if part_number_pattern.search(paragraph):
415
- incorrect_spacing.append(paragraph)
416
-
417
- # Check for incorrect paragraph indication spacing
418
- if paragraph_pattern.search(paragraph):
419
- incorrect_spacing.append(paragraph)
420
-
421
- # Check for double spaces
422
- if double_space_pattern.search(paragraph):
423
- incorrect_spacing.append(paragraph)
424
-
425
  return len(incorrect_spacing) == 0, incorrect_spacing
426
 
427
- def check_prohibited_phrases(doc):
428
- """Check for prohibited words or phrases."""
429
- prohibited_phrases = [
430
- r'\babove\b',
431
- r'\bbelow\b',
432
- r'\bthere is\b',
433
- r'\bthere are\b'
434
- ]
435
- issues = []
436
- for paragraph in doc:
437
- for phrase in prohibited_phrases:
438
- if re.search(phrase, paragraph, re.IGNORECASE):
439
- issues.append((phrase.strip(r'\b'), paragraph.strip()))
440
- return issues
441
-
442
  def check_abbreviation_usage(doc):
443
- """Check for abbreviation consistency after first definition."""
444
- abbreviations = {}
445
- issues = []
446
- for paragraph in doc:
447
- # Find definitions like "Federal Aviation Administration (FAA)"
448
- defined_matches = re.findall(r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)', paragraph)
449
- for full_term, acronym in defined_matches:
450
- if acronym not in abbreviations:
451
- abbreviations[acronym] = {"full_term": full_term.strip(), "defined": True}
452
 
453
- # Check for full term usage after definition
454
- for acronym, data in abbreviations.items():
455
- full_term = data["full_term"]
456
- if full_term in paragraph:
457
- # Ignore first usage where it's defined
458
- if data["defined"]:
459
- data["defined"] = False # Mark it as now defined
460
- else:
461
- # Only flag subsequent occurrences
462
- issues.append((full_term, acronym, paragraph.strip()))
463
-
464
- return issues
 
 
 
 
 
 
 
 
 
465
 
466
  def check_date_formats(doc):
467
- """Check for inconsistent date formats."""
468
  date_issues = []
469
- correct_date_pattern = re.compile(r'\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b')
470
- date_pattern = re.compile(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b') # MM/DD/YYYY
471
- for paragraph in doc:
472
- if date_pattern.search(paragraph):
473
- dates = date_pattern.findall(paragraph)
474
- for date in dates:
475
- if not correct_date_pattern.match(date):
476
- date_issues.append((date, paragraph.strip()))
 
 
 
477
  return date_issues
478
 
479
  def check_placeholders(doc):
480
- """Check for placeholders that should be removed."""
481
- placeholder_phrases = [
482
- r'\bTBD\b',
483
- r'\bTo be determined\b',
484
- r'\bTo be added\b'
485
- ]
486
- issues = []
487
- for paragraph in doc:
488
- for phrase in placeholder_phrases:
489
- if re.search(phrase, paragraph, re.IGNORECASE):
490
- issues.append((phrase.strip(r'\b'), paragraph.strip()))
491
- return issues
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
 
493
- def format_results_for_gradio(heading_valid, headings_found, acronyms_valid, undefined_acronyms,
494
- legal_valid, incorrect_legal_references, table_valid, incorrect_captions,
495
- figure_valid, incorrect_fig_captions, references_valid, incorrect_table_figure_references,
496
- title_style_valid, incorrect_titles, required_headings, doc_type, double_period_valid,
497
- incorrect_sentences, spacing_valid, incorrect_spacing, abbreviation_issues, date_issues, placeholder_issues):
 
 
 
 
 
 
 
498
  """Format the results for Gradio display."""
499
  results = []
500
  results.append("# Document Check Results\n")
@@ -515,7 +312,10 @@ def format_results_for_gradio(heading_valid, headings_found, acronyms_valid, und
515
  if acronyms_valid:
516
  results.append("✅ All acronyms are properly defined.\n")
517
  else:
518
- results.append(f"❌ The following acronyms need to be defined at first use: {', '.join(undefined_acronyms)}\n")
 
 
 
519
 
520
  # Legal Check
521
  results.append("## Legal Terminology Check")
@@ -571,7 +371,7 @@ def format_results_for_gradio(heading_valid, headings_found, acronyms_valid, und
571
  formatting_notes = {
572
  "Advisory Circular": "Document titles should be italicized, not in quotation marks.",
573
  "Order": "Document titles should be in quotation marks, not italicized.",
574
- "Federal Notice": "Document titles should be in quotation marks, not italicized.",
575
  "Policy Statement": "Document titles should not have any special formatting (no italics, no quotation marks)."
576
  }
577
 
 
3
  import re
4
  from docx import Document
5
  import io
6
+ import traceback
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  def heading_title_check(doc, required_headings):
9
  """Check if all required headings are present."""
 
52
  return len(undefined_acronyms) == 0, list(undefined_acronyms)
53
 
54
  def legal_check(doc):
55
+ """Check if legal terminology is used correctly."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  incorrect_legal_references = []
57
 
58
+ try:
59
+ # Define legal terminology mapping
60
+ legal_terms = {
61
+ "C.F.R.": "Code of Federal Regulations",
62
+ "F.R.": "Federal Register",
63
+ "U.S.C.": "United States Code"
64
+ }
 
 
 
 
 
 
 
 
 
65
 
66
+ # Check each paragraph
67
+ for paragraph in doc.paragraprams:
68
+ text = paragraph.text
69
+ for incorrect_term, correct_term in legal_terms.items():
70
+ if incorrect_term in text and correct_term not in text:
71
+ incorrect_legal_references.append((incorrect_term, correct_term))
72
+ except Exception as e:
73
+ print(f"Error in legal check: {str(e)}")
74
+ return False, []
75
 
76
  return len(incorrect_legal_references) == 0, incorrect_legal_references
77
 
78
  def table_caption_check(doc, doc_type):
79
  """Check if table captions are formatted correctly."""
80
  incorrect_captions = []
81
+
82
  try:
83
  # Check table captions
84
  for table in doc.tables:
85
  # Get the paragraph before the table
86
+ prev_paragraph = table._element.getprevious()
87
+ if prev_paragraph is not None and prev_paragraph.text.startswith("Table"):
88
+ # Check if the caption is formatted correctly
89
+ if doc_type == "Advisory Circular":
90
+ # AC captions should be "Table X. Caption text"
91
+ if not prev_paragraph.text.startswith("Table ") or ". " not in prev_paragraph.text:
92
+ incorrect_captions.append(prev_paragraph.text)
93
+ else:
94
+ # Other doc types may have different caption formats
95
+ pass
96
  except Exception as e:
97
  print(f"Error in table caption check: {str(e)}")
98
  return False, []
 
100
  return len(incorrect_captions) == 0, incorrect_captions
101
 
102
  def figure_caption_check(doc, doc_type):
103
+ """Check if figure captions are formatted correctly."""
 
 
 
 
 
 
 
 
 
 
104
  incorrect_fig_captions = []
105
+
106
+ try:
107
+ # Check figure captions
108
+ for paragraph in doc.paragraphs:
109
+ if paragraph.text.startswith("Figure"):
110
+ # Check if the caption is formatted correctly
111
+ if doc_type == "Advisory Circular":
112
+ # AC captions should be "Figure X. Caption text"
113
+ if ". " not in paragraph.text:
114
+ incorrect_fig_captions.append(paragraph.text)
115
+ else:
116
+ # Other doc types may have different caption formats
117
+ pass
118
+ except Exception as e:
119
+ print(f"Error in figure caption check: {str(e)}")
120
+ return False, []
121
+
122
  return len(incorrect_fig_captions) == 0, incorrect_fig_captions
123
 
124
  def table_figure_reference_check(doc, doc_type):
125
+ """Check if table and figure references are formatted correctly."""
126
  incorrect_table_figure_references = []
127
 
128
+ try:
129
+ # Check table and figure references
130
+ for paragraph in doc.paragraphs:
131
+ text = paragraph.text
132
+ if "Table" in text or "Figure" in text:
133
+ # Check if the reference is formatted correctly
134
+ if doc_type == "Advisory Circular":
135
+ # AC references should be "Table X" or "Figure X"
136
+ if not any(text.startswith(f"{item} ") for item in ["Table", "Figure"]):
137
+ incorrect_table_figure_references.append(text)
138
+ else:
139
+ # Other doc types may have different reference formats
140
+ pass
141
+ except Exception as e:
142
+ print(f"Error in table/figure reference check: {str(e)}")
143
+ return False, []
144
+
 
 
 
 
 
 
 
145
  return len(incorrect_table_figure_references) == 0, incorrect_table_figure_references
146
 
147
def document_title_check(doc, doc_type):
    """Check that the document title is formatted correctly for its type.

    The title is the first paragraph, and only when its style is named
    'Title'; otherwise nothing is checked.

    Rules applied:
      * Advisory Circular: must start with 'ADVISORY CIRCULAR ' and end
        with ' AC'.
      * Order / Federal Register Notice: must be enclosed in quotation marks.
      * Policy Statement: must not start or end with a quotation mark.
    Straight and curly double quotes are both recognised.

    Args:
        doc: Object exposing ``paragraphs``; the first item must provide
            ``text`` and ``style.name``.
        doc_type: Document type name.

    Returns:
        tuple: ``(ok, issues)`` where ``issues`` is a list of dicts with the
        keys ``"text"`` and ``"issue"``. On an unexpected error the error is
        printed and ``(False, [])`` is returned.
    """
    incorrect_titles = []
    opening_quotes = ('"', "\u201c")
    closing_quotes = ('"', "\u201d")

    try:
        paragraphs = doc.paragraphs
        if len(paragraphs) > 0 and paragraphs[0].style.name == 'Title':
            title_text = paragraphs[0].text
            is_quoted = title_text.startswith(opening_quotes) and title_text.endswith(closing_quotes)

            if doc_type == "Advisory Circular":
                # Both conditions must hold; the title is wrong when either fails.
                well_formed = title_text.startswith("ADVISORY CIRCULAR ") and title_text.endswith(" AC")
                if not well_formed:
                    incorrect_titles.append({"text": title_text, "issue": "Advisory Circular titles should start with 'ADVISORY CIRCULAR ' and end with ' AC'"})
            elif doc_type == "Order":
                if not is_quoted:
                    incorrect_titles.append({"text": title_text, "issue": "Order titles should be enclosed in quotation marks"})
            elif doc_type == "Federal Register Notice":
                if not is_quoted:
                    incorrect_titles.append({"text": title_text, "issue": "Federal Register Notice titles should be enclosed in quotation marks"})
            elif doc_type == "Policy Statement":
                if title_text.startswith(opening_quotes) or title_text.endswith(closing_quotes):
                    incorrect_titles.append({"text": title_text, "issue": "Policy Statement titles should not have quotation marks"})
    except Exception as e:
        print(f"Error in document title check: {str(e)}")
        return False, []

    return len(incorrect_titles) == 0, incorrect_titles
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  def double_period_check(doc):
176
+ """Check for sentences with double periods."""
177
  incorrect_sentences = []
178
+
179
+ try:
180
+ # Check each paragraph for double periods
181
+ for paragraph in doc.paragraphs:
182
+ if ".." in paragraph.text:
183
+ incorrect_sentences.append(paragraph.text)
184
+ except Exception as e:
185
+ print(f"Error in double period check: {str(e)}")
186
+ return False, []
187
+
188
+ return len(incorrect_sentences) == 0, incorrect_sentences
189
 
190
  def spacing_check(doc):
191
+ """Check for incorrect spacing."""
 
 
 
 
 
 
 
 
192
  incorrect_spacing = []
193
+
194
+ try:
195
+ # Check each paragraph for spacing issues
196
+ for paragraph in doc.paragraphs:
197
+ if " " in paragraph.text:
198
+ incorrect_spacing.append(paragraph.text)
199
+ except Exception as e:
200
+ print(f"Error in spacing check: {str(e)}")
201
+ return False, []
202
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  return len(incorrect_spacing) == 0, incorrect_spacing
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  def check_abbreviation_usage(doc):
206
+ """Check for consistent usage of abbreviations."""
207
+ abbreviation_issues = []
208
+
209
+ try:
210
+ # Regular expression to find abbreviations (2-5 capital letters)
211
+ abbreviation_pattern = r'\b[A-Z]{2,5}\b'
 
 
 
212
 
213
+ # Check each paragraph
214
+ for paragraph in doc.paragraphs:
215
+ text = paragraph.text
216
+
217
+ # Find all abbreviations in this paragraph
218
+ abbreviations = re.findall(abbreviation_pattern, text)
219
+
220
+ for abbr in abbreviations:
221
+ # Look for the full term definition
222
+ definition_pattern = rf'.+\({abbr}\)'
223
+ if any(re.search(definition_pattern, p.text) for p in doc.paragraphs):
224
+ # Check if the abbreviation is used consistently after definition
225
+ for other_paragraph in doc.paragraphs:
226
+ if abbr in other_paragraph.text and definition_pattern not in other_paragraph.text:
227
+ abbreviation_issues.append((definition_pattern.split('(')[0].strip(), abbr, paragraph.text))
228
+ break
229
+ except Exception as e:
230
+ print(f"Error in abbreviation check: {str(e)}")
231
+ return []
232
+
233
+ return abbreviation_issues
234
 
235
  def check_date_formats(doc):
236
+ """Check for consistent date formatting."""
237
  date_issues = []
238
+
239
+ try:
240
+ # Look for date patterns in each paragraph
241
+ for paragraph in doc.paragraphs:
242
+ text = paragraph.text
243
+ if re.search(r'\b\d{1,2}/\d{1,2}/\d{4}\b', text):
244
+ date_issues.append((text, paragraph.text))
245
+ except Exception as e:
246
+ print(f"Error in date format check: {str(e)}")
247
+ return []
248
+
249
  return date_issues
250
 
251
  def check_placeholders(doc):
252
+ """Check for the presence of placeholders."""
253
+ placeholder_issues = []
254
+
255
+ try:
256
+ # Look for placeholder text in each paragraph
257
+ for paragraph in doc.paragraprams:
258
+ text = paragraph.text
259
+ if '[ENTER TEXT]' in text or '[ENTER DATE]' in text:
260
+ placeholder_issues.append((text, paragraph.text))
261
+ except Exception as e:
262
+ print(f"Error in placeholder check: {str(e)}")
263
+ return []
264
+
265
+ return placeholder_issues
266
+
267
def get_document_checks(doc_type, template_type):
    """Look up the headings a document of the given type must contain.

    Only Advisory Circulars have required headings. The short template is
    selected by the exact name "Short AC template AC"; every other template
    value gets the long template's headings. All other document types get an
    empty heading list.

    Returns:
        dict: ``{"required_headings": [...]}``.
    """
    if doc_type != "Advisory Circular":
        # Add other document types as needed
        return {"required_headings": []}

    uses_long_template = template_type != "Short AC template AC"

    headings = ["Purpose", "Applicability"]
    if uses_long_template:
        headings.append("Audience")
    headings.extend(["Related Reading Material", "Background", "Discussion"])
    if uses_long_template:
        headings.append("Conclusion")

    return {"required_headings": headings}
282
 
283
+ def format_results_for_gradio(heading_valid, headings_found,
284
+ acronyms_valid, undefined_acronyms,
285
+ legal_valid, incorrect_legal_references,
286
+ table_valid, incorrect_captions,
287
+ figure_valid, incorrect_fig_captions,
288
+ references_valid, incorrect_table_figure_references,
289
+ title_style_valid, incorrect_titles,
290
+ required_headings, doc_type,
291
+ double_period_valid, incorrect_sentences,
292
+ spacing_valid, incorrect_spacing,
293
+ abbreviation_issues, date_issues,
294
+ placeholder_issues):
295
  """Format the results for Gradio display."""
296
  results = []
297
  results.append("# Document Check Results\n")
 
312
  if acronyms_valid:
313
  results.append("✅ All acronyms are properly defined.\n")
314
  else:
315
+ results.append("❌ The following acronyms need to be defined at first use:")
316
+ for acronym in undefined_acronyms:
317
+ results.append(f"- {acronym}")
318
+ results.append("")
319
 
320
  # Legal Check
321
  results.append("## Legal Terminology Check")
 
371
  formatting_notes = {
372
  "Advisory Circular": "Document titles should be italicized, not in quotation marks.",
373
  "Order": "Document titles should be in quotation marks, not italicized.",
374
+ "Federal Register Notice": "Document titles should be in quotation marks, not italicized.",
375
  "Policy Statement": "Document titles should not have any special formatting (no italics, no quotation marks)."
376
  }
377