Hoctar77 committed on
Commit
d1342e7
·
verified ·
1 Parent(s): 436beda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -313
app.py CHANGED
@@ -18,37 +18,24 @@ def heading_title_check(paragraphs, required_headings):
18
  return all_headings_present, headings_found
19
 
20
  def acronym_check(paragraphs):
21
- """Check if all acronyms are defined at first use and return undefined acronyms."""
22
- defined_acronyms = set() # Set to store defined acronyms
23
- undefined_acronyms = set() # Set to store undefined acronyms
24
- acronym_pattern = re.compile(r'(\b[A-Z]{2,}\b)') # Regex to find acronyms (2 or more uppercase letters)
25
- defined_pattern = re.compile(r'(\b\w+\b) \((\b[A-Z]{2,}\b)\)') # Regex to find definitions like "Federal Aviation Administration (FAA)"
26
-
27
- for paragraph in paragraphs: # Use paragraphs here
28
- # Check for defined acronyms
29
  defined_matches = defined_pattern.findall(paragraph)
30
  for full_term, acronym in defined_matches:
31
- defined_acronyms.add(acronym) # Add the acronym to the defined set
32
 
33
- # Check for usage of acronyms
34
  usage_matches = acronym_pattern.findall(paragraph)
35
  for acronym in usage_matches:
36
  if acronym not in defined_acronyms:
37
- undefined_acronyms.add(acronym) # Add to undefined acronyms if not defined
38
 
39
  return len(undefined_acronyms) == 0, undefined_acronyms
40
 
41
-
42
  def legal_check(paragraphs):
43
- """Check for correct legal references in the document and suggest corrections.
44
-
45
- Args:
46
- doc (list): List of paragraphs/strings to check
47
-
48
- Returns:
49
- tuple: (bool, list) - (True if no errors found, list of (incorrect, correct) terms)
50
- """
51
- # Mapping of incorrect terms to their correct versions
52
  incorrect_variations = {
53
  r"\bUSC\b": "U.S.C.",
54
  r"\bCFR Part\b": "CFR part",
@@ -59,28 +46,21 @@ def legal_check(paragraphs):
59
  r"\bshall\b": "must or will",
60
  r"\b&\b": "and"
61
  }
62
-
63
- # List to store tuples of incorrect terms and their correct versions
64
  incorrect_legal_references = []
65
 
66
  for paragraph in paragraphs:
67
- # Special handling for "Title 14" / "title 14"
68
  title_14_pattern = r"(?P<prefix>^|[.!?\s])\s*(?P<title>title 14|Title 14)\b"
69
  matches = re.finditer(title_14_pattern, paragraph)
70
 
71
  for match in matches:
72
  prefix = match.group('prefix')
73
  current_title = match.group('title')
74
-
75
- # If it follows a sentence-ending punctuation or is at start, it should be "Title 14"
76
  if prefix in ('.', '!', '?', '') and current_title.lower() == "title 14":
77
  if current_title != "Title 14":
78
  incorrect_legal_references.append((current_title, "Title 14"))
79
- # If it's within a sentence, it should be "title 14"
80
  elif prefix.isspace() and current_title != "title 14":
81
  incorrect_legal_references.append((current_title, "title 14"))
82
 
83
- # Check other variations
84
  for incorrect_pattern, correct_term in incorrect_variations.items():
85
  matches = re.finditer(incorrect_pattern, paragraph)
86
  for match in matches:
@@ -89,15 +69,9 @@ def legal_check(paragraphs):
89
  return len(incorrect_legal_references) == 0, incorrect_legal_references
90
 
91
  def table_caption_check(paragraphs, doc_type):
92
- """
93
- Check for correctly formatted table captions in the document.
94
- Supports both numeric (Table 1-2) and alphanumeric (Table C-1) formats.
95
- """
96
  if doc_type in ["Advisory Circular", "Order"]:
97
- # Pattern for "Table X-Y" where X and Y can be either letters or numbers
98
  table_caption_pattern = re.compile(r'^Table\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]', re.IGNORECASE)
99
  else:
100
- # Pattern for "Table X" where X can be either letters or numbers
101
  table_caption_pattern = re.compile(r'^Table\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE)
102
 
103
  incorrect_captions = []
@@ -111,15 +85,9 @@ def table_caption_check(paragraphs, doc_type):
111
  return len(incorrect_captions) == 0, incorrect_captions
112
 
113
  def figure_caption_check(paragraphs, doc_type):
114
- """
115
- Check for correctly formatted figure captions in the document.
116
- Supports both numeric (Figure 1-2) and alphanumeric (Figure C-1) formats.
117
- """
118
  if doc_type in ["Advisory Circular", "Order"]:
119
- # Pattern for "Figure X-Y" where X and Y can be either letters or numbers
120
  figure_caption_pattern = re.compile(r'^Figure\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]', re.IGNORECASE)
121
  else:
122
- # Pattern for "Figure X" where X can be either letters or numbers
123
  figure_caption_pattern = re.compile(r'^Figure\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE)
124
 
125
  incorrect_fig_captions = []
@@ -132,283 +100,58 @@ def figure_caption_check(paragraphs, doc_type):
132
  return len(incorrect_fig_captions) == 0, incorrect_fig_captions
133
 
134
  def table_figure_reference_check(paragraphs, doc_type):
135
- """Check for incorrect references to tables and figures in the document."""
136
  incorrect_table_figure_references = []
137
 
138
  if doc_type in ["Advisory Circular", "Order"]:
139
- # For Advisory Circulars and Orders, correct references are "Table X-Y" or "Figure X-Y"
140
  incorrect_table_ref_pattern = re.compile(r'\bTable\s+\d+(?!-\d+)\b', re.IGNORECASE)
141
  incorrect_figure_ref_pattern = re.compile(r'\bFigure\s+\d+(?!-\d+)\b', re.IGNORECASE)
142
  else:
143
- # For other document types, correct references are "Table X" or "Figure X"
144
  incorrect_table_ref_pattern = re.compile(r'\bTable\s+\d+(-\d+)?\b', re.IGNORECASE)
145
  incorrect_figure_ref_pattern = re.compile(r'\bFigure\s+\d+(-\d+)?\b', re.IGNORECASE)
146
 
147
  for paragraph in paragraphs:
148
  paragraph_strip = paragraph.strip()
149
- # Exclude captions
150
  starts_with_table_or_figure = paragraph_strip.lower().startswith('table') or paragraph_strip.lower().startswith('figure')
151
  if not starts_with_table_or_figure:
152
- # Find incorrect table references
153
  incorrect_tables = incorrect_table_ref_pattern.findall(paragraph)
154
  if incorrect_tables:
155
  incorrect_table_figure_references.extend(incorrect_tables)
156
- # Find incorrect figure references
157
  incorrect_figures = incorrect_figure_ref_pattern.findall(paragraph)
158
  if incorrect_figures:
159
  incorrect_table_figure_references.extend(incorrect_figures)
160
 
161
- # Return False if any incorrect references are found
162
  return len(incorrect_table_figure_references) == 0, incorrect_table_figure_references
163
 
164
- def document_title_check(doc_path, doc_type):
165
- incorrect_titles = []
166
- doc = Document(doc_path)
167
-
168
- # Updated pattern to capture titles correctly
169
- ac_pattern = re.compile(r'AC\s+\d+(?:-\d+)?(?:,|\s)+(.+?)(?=\.|,|$)')
170
-
171
- # Define formatting rules for different document types
172
- formatting_rules = {
173
- "Advisory Circular": {"italics": True, "quotes": False},
174
- "Airworthiness Criteria": {"italics": False, "quotes": True},
175
- "Deviation Memo": {"italics": False, "quotes": True},
176
- "Exemption": {"italics": False, "quotes": True},
177
- "Federal Register Notice": {"italics": False, "quotes": True},
178
- "Handbook/Manual": {"italics": False, "quotes": False},
179
- "Order": {"italics": False, "quotes": True},
180
- "Policy Statement": {"italics": False, "quotes": False},
181
- "Rule": {"italics": False, "quotes": True},
182
- "Special Condition": {"italics": False, "quotes": True},
183
- "Technical Standard Order": {"italics": False, "quotes": True},
184
- "Other": {"italics": False, "quotes": False}
185
- }
186
-
187
- # Get the rules for the current document type
188
- if doc_type not in formatting_rules:
189
- raise ValueError(f"Unsupported document type: {doc_type}")
190
-
191
- required_format = formatting_rules[doc_type]
192
-
193
- for paragraph in doc.paragraphs:
194
- text = paragraph.text
195
- matches = ac_pattern.finditer(text)
196
-
197
- for match in matches:
198
- full_match = match.group(0)
199
- title_text = match.group(1).strip()
200
-
201
- # Get the position where the title starts
202
- title_start = match.start(1)
203
-
204
- # Check for any type of quotation marks, including smart quotes
205
- title_in_quotes = any(q in title_text for q in ['"', "'", '"', '"', ''', '''])
206
-
207
- # Check the formatting of the title
208
- title_is_italicized = False
209
- current_pos = 0
210
- for run in paragraph.runs:
211
- run_length = len(run.text)
212
- if current_pos <= title_start < current_pos + run_length:
213
- relative_pos = title_start - current_pos
214
- title_is_italicized = run.italic
215
- break
216
- current_pos += run_length
217
-
218
- # Check if formatting matches the required format
219
- formatting_incorrect = False
220
- issue_message = []
221
-
222
- # Check italics requirement
223
- if required_format["italics"] and not title_is_italicized:
224
- formatting_incorrect = True
225
- issue_message.append("should be italicized")
226
- elif not required_format["italics"] and title_is_italicized:
227
- formatting_incorrect = True
228
- issue_message.append("should not be italicized")
229
-
230
- # Check quotes requirement
231
- if required_format["quotes"] and not title_in_quotes:
232
- formatting_incorrect = True
233
- issue_message.append("should be in quotes")
234
- elif not required_format["quotes"] and title_in_quotes:
235
- formatting_incorrect = True
236
- issue_message.append("should not be in quotes")
237
-
238
- if formatting_incorrect:
239
- incorrect_titles.append({
240
- 'text': full_match,
241
- 'issue': ', '.join(issue_message)
242
- })
243
-
244
- return len(incorrect_titles) == 0, incorrect_titles
245
-
246
- def get_document_checks(doc_type, template_type):
247
- """Return expected outline and required headings based on document type and template type."""
248
- document_checks = {
249
- "Advisory Circular": {
250
- "Short AC template AC": {
251
- "required_headings": [
252
- "PURPOSE.",
253
- "APPLICABILITY.",
254
- "CANCELLATION.",
255
- "RELATED MATERIAL.",
256
- "DEFINITION OF KEY TERMS."
257
- ]
258
- },
259
- "Long AC template AC": {
260
- "required_headings": [
261
- "Purpose.",
262
- "Applicability.",
263
- "Cancellation.",
264
- "Related Material.",
265
- "Definition of Key Terms."
266
- ]
267
- }
268
- },
269
- "Airworthiness Criteria": {
270
- "required_headings": [
271
- "TBD - Need to research"
272
- ]
273
- },
274
- "Deviation Memo": {
275
- "required_headings": [
276
- "TBD - Need to research"
277
- ]
278
- },
279
- "Exemption": {
280
- "required_headings": [
281
- "TBD - Need to research"
282
- ]
283
- },
284
- "Federal Register Notice": {
285
- "required_headings": [
286
- "Purpose of This Notice",
287
- "Audience",
288
- "Where can I Find This Notice"
289
- ]
290
- },
291
- "Handbook/Manual": {
292
- "required_headings": [
293
- "TBD - Need to research"
294
- ]
295
- },
296
- "Order": {
297
- "required_headings": [
298
- "Purpose of This Order.",
299
- "Audience.",
300
- "Where to Find This Order."
301
- ]
302
- },
303
- "Policy Statement": {
304
- "required_headings": [
305
- "SUMMARY",
306
- "CURRENT REGULATORY AND ADVISORY MATERIAL",
307
- "RELEVANT PAST PRACTICE",
308
- "POLICY",
309
- "EFFECT OF POLICY",
310
- "CONCLUSION"
311
- ]
312
- },
313
- "Rule": {
314
- "required_headings": [
315
- "TBD - Need to research"
316
- ]
317
- },
318
- "Special Condition": {
319
- "required_headings": [
320
- "TBD - Need to research"
321
- ]
322
- },
323
- "Technical Standard Order": {
324
- "required_headings": [
325
- "PURPOSE.",
326
- "APPLICABILITY.",
327
- "REQUIREMENTS.",
328
- "MARKING.",
329
- "APPLICATION DATA REQUIREMENTS.",
330
- "MANUFACTURER DATA REQUIREMENTS.",
331
- "FURNISHED DATA REQUIREMENTS.",
332
- "HOW TO GET REFERENCED DOCUMENTS."
333
- ]
334
- },
335
- "Other": {
336
- "required_headings": [
337
- "N/A"
338
- ]
339
- }
340
- }
341
-
342
- # Add debugging logs
343
- logger = logging.getLogger(__name__)
344
- logger.info(f"Requested document type: {doc_type}")
345
- logger.info(f"Requested template type: {template_type}")
346
-
347
- if doc_type == "Advisory Circular":
348
- checks = document_checks.get(doc_type, {}).get(template_type, {})
349
- else:
350
- checks = document_checks.get(doc_type, {})
351
-
352
- logger.info(f"Retrieved checks: {checks}")
353
- return checks
354
-
355
  def double_period_check(paragraphs):
356
- """Check for sentences that end with two periods."""
357
  incorrect_sentences = []
358
 
359
  for paragraph in paragraphs:
360
- # Split the paragraph into sentences based on common sentence-ending punctuation
361
  sentences = re.split(r'(?<=[.!?]) +', paragraph)
362
  for sentence in sentences:
363
  if sentence.endswith('..'):
364
- incorrect_sentences.append(sentence.strip()) # Log the incorrectly formatted sentence
365
 
366
- return len(incorrect_sentences) == 0, incorrect_sentences # Return True if no double periods are found, along with any incorrect sentences
367
 
368
  def spacing_check(paragraphs):
369
- """
370
- Check for correct spacing in US federal regulatory documents.
371
- Checks for:
372
- - Spacing between document type and number (e.g., "AC 20-114")
373
- - Spacing around section symbols (e.g., "§ 25.301")
374
- - Spacing around part numbers (e.g., "Part 25")
375
- - Spacing around paragraph indications (e.g., "(a)", "(1)")
376
- - Double spaces between words
377
- """
378
  incorrect_spacing = []
379
-
380
- # Regex patterns to find incorrect spacing
381
  doc_type_pattern = re.compile(r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)', re.IGNORECASE)
382
  section_symbol_pattern = re.compile(r'(?<!\s)(§|§§)(\d+\.\d+)', re.IGNORECASE)
383
  part_number_pattern = re.compile(r'(?<!\s)Part(\d+)', re.IGNORECASE)
384
  paragraph_pattern = re.compile(r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))', re.IGNORECASE)
385
  double_space_pattern = re.compile(r'\s{2,}')
386
 
387
- for paragraph in doc:
388
- # Check for incorrect document type spacing
389
- if doc_type_pattern.search(paragraph):
390
- incorrect_spacing.append(paragraph)
391
-
392
- # Check for incorrect section symbol spacing
393
- if section_symbol_pattern.search(paragraph):
394
- incorrect_spacing.append(paragraph)
395
-
396
- # Check for incorrect part number spacing
397
- if part_number_pattern.search(paragraph):
398
- incorrect_spacing.append(paragraph)
399
-
400
- # Check for incorrect paragraph indication spacing
401
- if paragraph_pattern.search(paragraph):
402
- incorrect_spacing.append(paragraph)
403
-
404
- # Check for double spaces
405
- if double_space_pattern.search(paragraph):
406
  incorrect_spacing.append(paragraph)
407
 
408
  return len(incorrect_spacing) == 0, incorrect_spacing
409
 
410
  def check_prohibited_phrases(paragraphs):
411
- """Check for prohibited words or phrases."""
412
  prohibited_phrases = [
413
  r'\babove\b',
414
  r'\bbelow\b',
@@ -474,64 +217,39 @@ def check_placeholders(paragraphs):
474
  return issues
475
 
476
  def process_document(file_obj, doc_type, template_type):
477
- """Process the document and perform checks."""
478
  try:
479
- # Read the Word document
480
  doc = Document(file_obj)
481
- print("Document read successfully.")
482
-
483
- # Extract text from each paragraph to make it iterable
484
  paragraphs = [para.text for para in doc.paragraphs]
485
-
486
- # Get required headings based on document type
487
  required_headings = get_document_checks(doc_type, template_type).get("required_headings", [])
488
 
489
- # Perform checks
490
  heading_valid, headings_found = heading_title_check(paragraphs, required_headings)
491
  acronyms_valid, undefined_acronyms = acronym_check(paragraphs)
492
  legal_valid, incorrect_legal_references = legal_check(paragraphs)
493
  table_valid, incorrect_captions = table_caption_check(paragraphs, doc_type)
494
  figure_valid, incorrect_fig_captions = figure_caption_check(paragraphs, doc_type)
495
  references_valid, incorrect_table_figure_references = table_figure_reference_check(paragraphs, doc_type)
496
- title_style_valid, incorrect_titles = document_title_check(file_obj, doc_type)
497
  double_period_valid, incorrect_sentences = double_period_check(paragraphs)
498
  spacing_valid, incorrect_spacing = spacing_check(paragraphs)
499
- abbreviation_issues = check_abbreviation_usage(paragraphs)
500
- date_issues = check_date_formats(paragraphs)
501
- placeholder_issues = check_placeholders(paragraphs)
502
-
503
- # Format results
504
  results = format_results_for_gradio(
505
- heading_valid=heading_valid,
506
- headings_found=headings_found,
507
- acronyms_valid=acronyms_valid,
508
- undefined_acronyms=undefined_acronyms,
509
- legal_valid=legal_valid,
510
- incorrect_legal_references=incorrect_legal_references,
511
- table_valid=table_valid,
512
- incorrect_captions=incorrect_captions,
513
- figure_valid=figure_valid,
514
- incorrect_fig_captions=incorrect_fig_captions,
515
- references_valid=references_valid,
516
- incorrect_table_figure_references=incorrect_table_figure_references,
517
- title_style_valid=title_style_valid,
518
- incorrect_titles=incorrect_titles,
519
- required_headings=required_headings,
520
- doc_type=doc_type,
521
- double_period_valid=double_period_valid,
522
- incorrect_sentences=incorrect_sentences,
523
- spacing_valid=spacing_valid,
524
- incorrect_spacing=incorrect_spacing,
525
- abbreviation_issues=abbreviation_issues,
526
- date_issues=date_issues,
527
- placeholder_issues=placeholder_issues
528
  )
529
-
530
  return results
531
-
532
  except Exception as e:
533
  print(f"Error in process_document: {str(e)}")
534
- raise
535
 
536
  def get_document_checks(doc_type, template_type):
537
  """Return the required headings and other checks based on document type."""
 
18
  return all_headings_present, headings_found
19
 
20
  def acronym_check(paragraphs):
21
+ defined_acronyms = set()
22
+ undefined_acronyms = set()
23
+ acronym_pattern = re.compile(r'(\b[A-Z]{2,}\b)')
24
+ defined_pattern = re.compile(r'(\b\w+\b) \((\b[A-Z]{2,}\b)\)')
25
+
26
+ for paragraph in paragraphs:
 
 
27
  defined_matches = defined_pattern.findall(paragraph)
28
  for full_term, acronym in defined_matches:
29
+ defined_acronyms.add(acronym)
30
 
 
31
  usage_matches = acronym_pattern.findall(paragraph)
32
  for acronym in usage_matches:
33
  if acronym not in defined_acronyms:
34
+ undefined_acronyms.add(acronym)
35
 
36
  return len(undefined_acronyms) == 0, undefined_acronyms
37
 
 
38
  def legal_check(paragraphs):
 
 
 
 
 
 
 
 
 
39
  incorrect_variations = {
40
  r"\bUSC\b": "U.S.C.",
41
  r"\bCFR Part\b": "CFR part",
 
46
  r"\bshall\b": "must or will",
47
  r"\b&\b": "and"
48
  }
 
 
49
  incorrect_legal_references = []
50
 
51
  for paragraph in paragraphs:
 
52
  title_14_pattern = r"(?P<prefix>^|[.!?\s])\s*(?P<title>title 14|Title 14)\b"
53
  matches = re.finditer(title_14_pattern, paragraph)
54
 
55
  for match in matches:
56
  prefix = match.group('prefix')
57
  current_title = match.group('title')
 
 
58
  if prefix in ('.', '!', '?', '') and current_title.lower() == "title 14":
59
  if current_title != "Title 14":
60
  incorrect_legal_references.append((current_title, "Title 14"))
 
61
  elif prefix.isspace() and current_title != "title 14":
62
  incorrect_legal_references.append((current_title, "title 14"))
63
 
 
64
  for incorrect_pattern, correct_term in incorrect_variations.items():
65
  matches = re.finditer(incorrect_pattern, paragraph)
66
  for match in matches:
 
69
  return len(incorrect_legal_references) == 0, incorrect_legal_references
70
 
71
  def table_caption_check(paragraphs, doc_type):
 
 
 
 
72
  if doc_type in ["Advisory Circular", "Order"]:
 
73
  table_caption_pattern = re.compile(r'^Table\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]', re.IGNORECASE)
74
  else:
 
75
  table_caption_pattern = re.compile(r'^Table\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE)
76
 
77
  incorrect_captions = []
 
85
  return len(incorrect_captions) == 0, incorrect_captions
86
 
87
  def figure_caption_check(paragraphs, doc_type):
 
 
 
 
88
  if doc_type in ["Advisory Circular", "Order"]:
 
89
  figure_caption_pattern = re.compile(r'^Figure\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]', re.IGNORECASE)
90
  else:
 
91
  figure_caption_pattern = re.compile(r'^Figure\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE)
92
 
93
  incorrect_fig_captions = []
 
100
  return len(incorrect_fig_captions) == 0, incorrect_fig_captions
101
 
102
  def table_figure_reference_check(paragraphs, doc_type):
 
103
  incorrect_table_figure_references = []
104
 
105
  if doc_type in ["Advisory Circular", "Order"]:
 
106
  incorrect_table_ref_pattern = re.compile(r'\bTable\s+\d+(?!-\d+)\b', re.IGNORECASE)
107
  incorrect_figure_ref_pattern = re.compile(r'\bFigure\s+\d+(?!-\d+)\b', re.IGNORECASE)
108
  else:
 
109
  incorrect_table_ref_pattern = re.compile(r'\bTable\s+\d+(-\d+)?\b', re.IGNORECASE)
110
  incorrect_figure_ref_pattern = re.compile(r'\bFigure\s+\d+(-\d+)?\b', re.IGNORECASE)
111
 
112
  for paragraph in paragraphs:
113
  paragraph_strip = paragraph.strip()
 
114
  starts_with_table_or_figure = paragraph_strip.lower().startswith('table') or paragraph_strip.lower().startswith('figure')
115
  if not starts_with_table_or_figure:
 
116
  incorrect_tables = incorrect_table_ref_pattern.findall(paragraph)
117
  if incorrect_tables:
118
  incorrect_table_figure_references.extend(incorrect_tables)
 
119
  incorrect_figures = incorrect_figure_ref_pattern.findall(paragraph)
120
  if incorrect_figures:
121
  incorrect_table_figure_references.extend(incorrect_figures)
122
 
 
123
  return len(incorrect_table_figure_references) == 0, incorrect_table_figure_references
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def double_period_check(paragraphs):
 
126
  incorrect_sentences = []
127
 
128
  for paragraph in paragraphs:
 
129
  sentences = re.split(r'(?<=[.!?]) +', paragraph)
130
  for sentence in sentences:
131
  if sentence.endswith('..'):
132
+ incorrect_sentences.append(sentence.strip())
133
 
134
+ return len(incorrect_sentences) == 0, incorrect_sentences
135
 
136
  def spacing_check(paragraphs):
 
 
 
 
 
 
 
 
 
137
  incorrect_spacing = []
 
 
138
  doc_type_pattern = re.compile(r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)', re.IGNORECASE)
139
  section_symbol_pattern = re.compile(r'(?<!\s)(§|§§)(\d+\.\d+)', re.IGNORECASE)
140
  part_number_pattern = re.compile(r'(?<!\s)Part(\d+)', re.IGNORECASE)
141
  paragraph_pattern = re.compile(r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))', re.IGNORECASE)
142
  double_space_pattern = re.compile(r'\s{2,}')
143
 
144
+ for paragraph in paragraphs:
145
+ if doc_type_pattern.search(paragraph) or \
146
+ section_symbol_pattern.search(paragraph) or \
147
+ part_number_pattern.search(paragraph) or \
148
+ paragraph_pattern.search(paragraph) or \
149
+ double_space_pattern.search(paragraph):
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  incorrect_spacing.append(paragraph)
151
 
152
  return len(incorrect_spacing) == 0, incorrect_spacing
153
 
154
  def check_prohibited_phrases(paragraphs):
 
155
  prohibited_phrases = [
156
  r'\babove\b',
157
  r'\bbelow\b',
 
217
  return issues
218
 
219
  def process_document(file_obj, doc_type, template_type):
 
220
  try:
 
221
  doc = Document(file_obj)
 
 
 
222
  paragraphs = [para.text for para in doc.paragraphs]
 
 
223
  required_headings = get_document_checks(doc_type, template_type).get("required_headings", [])
224
 
225
+ # Calls to each function with `paragraphs` as input
226
  heading_valid, headings_found = heading_title_check(paragraphs, required_headings)
227
  acronyms_valid, undefined_acronyms = acronym_check(paragraphs)
228
  legal_valid, incorrect_legal_references = legal_check(paragraphs)
229
  table_valid, incorrect_captions = table_caption_check(paragraphs, doc_type)
230
  figure_valid, incorrect_fig_captions = figure_caption_check(paragraphs, doc_type)
231
  references_valid, incorrect_table_figure_references = table_figure_reference_check(paragraphs, doc_type)
 
232
  double_period_valid, incorrect_sentences = double_period_check(paragraphs)
233
  spacing_valid, incorrect_spacing = spacing_check(paragraphs)
234
+ placeholder_issues = check_prohibited_phrases(paragraphs)
235
+
236
+ # Return all results
 
 
237
  results = format_results_for_gradio(
238
+ heading_valid=heading_valid, headings_found=headings_found,
239
+ acronyms_valid=acronyms_valid, undefined_acronyms=undefined_acronyms,
240
+ legal_valid=legal_valid, incorrect_legal_references=incorrect_legal_references,
241
+ table_valid=table_valid, incorrect_captions=incorrect_captions,
242
+ figure_valid=figure_valid, incorrect_fig_captions=incorrect_fig_captions,
243
+ references_valid=references_valid, incorrect_table_figure_references=incorrect_table_figure_references,
244
+ double_period_valid=double_period_valid, incorrect_sentences=incorrect_sentences,
245
+ spacing_valid=spacing_valid, incorrect_spacing=incorrect_spacing,
246
+ placeholder_issues=placeholder_issues,
247
+ required_headings=required_headings, doc_type=doc_type
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  )
 
249
  return results
 
250
  except Exception as e:
251
  print(f"Error in process_document: {str(e)}")
252
+ return f"An error occurred while processing the document: {str(e)}"
253
 
254
  def get_document_checks(doc_type, template_type):
255
  """Return the required headings and other checks based on document type."""