Hoctar77 committed on
Commit
c6ba992
·
verified ·
1 Parent(s): ee7296b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +438 -231
app.py CHANGED
@@ -6,279 +6,486 @@ import io
6
  import traceback
7
 
8
  def heading_title_check(doc, required_headings):
9
- """Check if all required headings are present."""
10
- headings_found = []
11
- try:
12
- # Iterate through paragraphs to find headings
13
- for paragraph in doc.paragraphs:
14
- if paragraph.style.name.startswith('Heading'):
15
- headings_found.append(paragraph.text.strip())
16
- except Exception as e:
17
- print(f"Error in heading check: {str(e)}")
18
- return False, []
19
-
20
- # Check if all required headings are present
21
- all_present = all(heading in headings_found for heading in required_headings)
22
- return all_present, headings_found
23
-
24
- def acronym_check(doc):
25
- """Check if all acronyms are properly defined."""
26
- undefined_acronyms = set()
27
- defined_acronyms = set()
28
 
29
- try:
30
- # Regular expression for finding acronyms (2-5 capital letters)
31
- acronym_pattern = r'\b[A-Z]{2,5}\b'
32
 
33
- # Check each paragraph
34
- for paragraph in doc.paragraphs:
35
- text = paragraph.text
36
-
37
- # Find all acronyms in this paragraph
38
- acronyms = re.findall(acronym_pattern, text)
39
-
40
- for acronym in acronyms:
41
- if acronym not in defined_acronyms:
42
- # Look for definition pattern: "full term (ACRONYM)"
43
- definition_pattern = rf'.+\({acronym}\)'
44
- if not any(re.search(definition_pattern, p.text) for p in doc.paragraphs):
45
- undefined_acronyms.add(acronym)
46
- else:
47
- defined_acronyms.add(acronym)
48
- except Exception as e:
49
- print(f"Error in acronym check: {str(e)}")
50
- return False, []
51
 
52
- return len(undefined_acronyms) == 0, list(undefined_acronyms)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  def legal_check(doc):
55
- """Check if legal terminology is used correctly."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  incorrect_legal_references = []
57
 
58
- try:
59
- # Define legal terminology mapping
60
- legal_terms = {
61
- "C.F.R.": "Code of Federal Regulations",
62
- "F.R.": "Federal Register",
63
- "U.S.C.": "United States Code"
64
- }
65
 
66
- # Check each paragraph
67
- for paragraph in doc.paragraprams:
68
- text = paragraph.text
69
- for incorrect_term, correct_term in legal_terms.items():
70
- if incorrect_term in text and correct_term not in text:
71
- incorrect_legal_references.append((incorrect_term, correct_term))
72
- except Exception as e:
73
- print(f"Error in legal check: {str(e)}")
74
- return False, []
 
 
 
 
 
 
 
 
75
 
76
  return len(incorrect_legal_references) == 0, incorrect_legal_references
77
 
78
  def table_caption_check(doc, doc_type):
79
- """Check if table captions are formatted correctly."""
80
- incorrect_captions = []
81
-
82
- try:
83
- # Check table captions
84
- for table in doc.tables:
85
- # Get the paragraph before the table
86
- prev_paragraph = table._element.getprevious()
87
- if prev_paragraph is not None and prev_paragraph.text.startswith("Table"):
88
- # Check if the caption is formatted correctly
89
- if doc_type == "Advisory Circular":
90
- # AC captions should be "Table X. Caption text"
91
- if not prev_paragraph.text.startswith("Table ") or ". " not in prev_paragraph.text:
92
- incorrect_captions.append(prev_paragraph.text)
93
- else:
94
- # Other doc types may have different caption formats
95
- pass
96
- except Exception as e:
97
- print(f"Error in table caption check: {str(e)}")
98
- return False, []
99
 
 
 
 
 
 
 
 
 
100
  return len(incorrect_captions) == 0, incorrect_captions
101
 
102
  def figure_caption_check(doc, doc_type):
103
- """Check if figure captions are formatted correctly."""
104
- incorrect_fig_captions = []
105
-
106
- try:
107
- # Check figure captions
108
- for paragraph in doc.paragraphs:
109
- if paragraph.text.startswith("Figure"):
110
- # Check if the caption is formatted correctly
111
- if doc_type == "Advisory Circular":
112
- # AC captions should be "Figure X. Caption text"
113
- if ". " not in paragraph.text:
114
- incorrect_fig_captions.append(paragraph.text)
115
- else:
116
- # Other doc types may have different caption formats
117
- pass
118
- except Exception as e:
119
- print(f"Error in figure caption check: {str(e)}")
120
- return False, []
121
 
 
 
 
 
 
 
 
122
  return len(incorrect_fig_captions) == 0, incorrect_fig_captions
123
 
124
  def table_figure_reference_check(doc, doc_type):
125
- """Check if table and figure references are formatted correctly."""
126
  incorrect_table_figure_references = []
127
 
128
- try:
129
- # Check table and figure references
130
- for paragraph in doc.paragraphs:
131
- text = paragraph.text
132
- if "Table" in text or "Figure" in text:
133
- # Check if the reference is formatted correctly
134
- if doc_type == "Advisory Circular":
135
- # AC references should be "Table X" or "Figure X"
136
- if not any(text.startswith(f"{item} ") for item in ["Table", "Figure"]):
137
- incorrect_table_figure_references.append(text)
138
- else:
139
- # Other doc types may have different reference formats
140
- pass
141
- except Exception as e:
142
- print(f"Error in table/figure reference check: {str(e)}")
143
- return False, []
144
-
 
 
 
 
 
 
 
145
  return len(incorrect_table_figure_references) == 0, incorrect_table_figure_references
146
 
147
- def document_title_check(doc, doc_type):
148
- """Check if the document title is formatted correctly."""
149
  incorrect_titles = []
150
-
151
- try:
152
- # Check the document title
153
- if len(doc.paragraphs) > 0 and doc.paragraphs[0].style.name == 'Title':
154
- title_text = doc.paragraphs[0].text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
- # Check the formatting based on document type
157
- if doc_type == "Advisory Circular":
158
- if not title_text.startswith("ADVISORY CIRCULAR ") or title_text.endswith(" AC"):
159
- incorrect_titles.append({"text": title_text, "issue": "Advisory Circular titles should start with 'ADVISORY CIRCULAR ' and end with ' AC'"})
160
- elif doc_type == "Order":
161
- if not title_text.startswith('"') or not title_text.endswith('"'):
162
- incorrect_titles.append({"text": title_text, "issue": "Order titles should be enclosed in quotation marks"})
163
- elif doc_type == "Federal Register Notice":
164
- if not title_text.startswith('"') or not title_text.endswith('"'):
165
- incorrect_titles.append({"text": title_text, "issue": "Federal Register Notice titles should be enclosed in quotation marks"})
166
- elif doc_type == "Policy Statement":
167
- if title_text.startswith('"') or title_text.endswith('"'):
168
- incorrect_titles.append({"text": title_text, "issue": "Policy Statement titles should not have quotation marks"})
169
- except Exception as e:
170
- print(f"Error in document title check: {str(e)}")
171
- return False, []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  return len(incorrect_titles) == 0, incorrect_titles
174
 
175
- def double_period_check(doc):
176
- """Check for sentences with double periods."""
177
- incorrect_sentences = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
- try:
180
- # Check each paragraph for double periods
181
- for paragraph in doc.paragraphs:
182
- if ".." in paragraph.text:
183
- incorrect_sentences.append(paragraph.text)
184
- except Exception as e:
185
- print(f"Error in double period check: {str(e)}")
186
- return False, []
 
187
 
188
- return len(incorrect_sentences) == 0, incorrect_sentences
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  def spacing_check(doc):
191
- """Check for incorrect spacing."""
 
 
 
 
 
 
 
 
192
  incorrect_spacing = []
193
-
194
- try:
195
- # Check each paragraph for spacing issues
196
- for paragraph in doc.paragraphs:
197
- if " " in paragraph.text:
198
- incorrect_spacing.append(paragraph.text)
199
- except Exception as e:
200
- print(f"Error in spacing check: {str(e)}")
201
- return False, []
202
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  return len(incorrect_spacing) == 0, incorrect_spacing
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  def check_abbreviation_usage(doc):
206
- """Check for consistent usage of abbreviations."""
207
- abbreviation_issues = []
208
-
209
- try:
210
- # Regular expression to find abbreviations (2-5 capital letters)
211
- abbreviation_pattern = r'\b[A-Z]{2,5}\b'
 
 
 
212
 
213
- # Check each paragraph
214
- for paragraph in doc.paragraphs:
215
- text = paragraph.text
216
-
217
- # Find all abbreviations in this paragraph
218
- abbreviations = re.findall(abbreviation_pattern, text)
219
-
220
- for abbr in abbreviations:
221
- # Look for the full term definition
222
- definition_pattern = rf'.+\({abbr}\)'
223
- if any(re.search(definition_pattern, p.text) for p in doc.paragraphs):
224
- # Check if the abbreviation is used consistently after definition
225
- for other_paragraph in doc.paragraphs:
226
- if abbr in other_paragraph.text and definition_pattern not in other_paragraph.text:
227
- abbreviation_issues.append((definition_pattern.split('(')[0].strip(), abbr, paragraph.text))
228
- break
229
- except Exception as e:
230
- print(f"Error in abbreviation check: {str(e)}")
231
- return []
232
-
233
- return abbreviation_issues
234
 
235
  def check_date_formats(doc):
236
- """Check for consistent date formatting."""
237
  date_issues = []
238
-
239
- try:
240
- # Look for date patterns in each paragraph
241
- for paragraph in doc.paragraphs:
242
- text = paragraph.text
243
- if re.search(r'\b\d{1,2}/\d{1,2}/\d{4}\b', text):
244
- date_issues.append((text, paragraph.text))
245
- except Exception as e:
246
- print(f"Error in date format check: {str(e)}")
247
- return []
248
-
249
  return date_issues
250
 
251
  def check_placeholders(doc):
252
- """Check for the presence of placeholders."""
253
- placeholder_issues = []
254
-
255
- try:
256
- # Look for placeholder text in each paragraph
257
- for paragraph in doc.paragraprams:
258
- text = paragraph.text
259
- if '[ENTER TEXT]' in text or '[ENTER DATE]' in text:
260
- placeholder_issues.append((text, paragraph.text))
261
- except Exception as e:
262
- print(f"Error in placeholder check: {str(e)}")
263
- return []
264
-
265
- return placeholder_issues
266
-
267
- def get_document_checks(doc_type, template_type):
268
- """Return the required headings and other checks based on document type."""
269
- if doc_type == "Advisory Circular":
270
- if template_type == "Short AC template AC":
271
- return {
272
- "required_headings": ["Purpose", "Applicability", "Related Reading Material",
273
- "Background", "Discussion"]
274
- }
275
- else: # Long AC template
276
- return {
277
- "required_headings": ["Purpose", "Applicability", "Audience", "Related Reading Material",
278
- "Background", "Discussion", "Conclusion"]
279
- }
280
- # Add other document types as needed
281
- return {"required_headings": []}
282
 
283
  def process_file(file_obj, doc_type, template_type):
284
  """
@@ -513,7 +720,7 @@ def format_results_for_gradio(**kwargs):
513
  # Placeholder Check
514
  results.append("## Placeholder Check")
515
  if not kwargs['placeholder_issues']:
516
- results.append("✅ No placeholders found.\n")
517
  else:
518
  results.append("❌ Placeholders Found:")
519
  for phrase, paragraph in kwargs['placeholder_issues']:
@@ -554,7 +761,7 @@ demo = gr.Blocks(theme='JohnSmith9982/small_and_pretty')
554
  with demo:
555
  gr.Markdown("# Document Checker Tool")
556
  gr.Markdown("Upload a Word (docx) document to check for compliance with U.S. federal documentation standards.")
557
- gr.Markdown("* This tool is still in development *")
558
  gr.Markdown("Contact Eric Putnam if you have questions and comments.")
559
 
560
  document_types = [
 
6
  import traceback
7
 
8
def heading_title_check(doc, required_headings):
    """
    Report which of the required headings occur in the document.

    A paragraph counts as a heading when its text, with surrounding
    whitespace removed, is exactly equal to one of the required titles
    (the comparison is case-sensitive).

    Args:
        doc (list): List of paragraph texts from the document
        required_headings (list): List of required heading titles

    Returns:
        tuple: (bool, list) - (True if every required heading was found,
        the matching headings in document order, repeats included)
    """
    wanted = set(required_headings)

    # Normalise lazily, then keep only the paragraphs that are required titles.
    normalised = (text.strip() for text in doc)
    matched = [heading for heading in normalised if heading in wanted]

    # `matched` only ever holds members of `wanted`, so set equality means
    # that nothing required is missing.
    return set(matched) == wanted, matched
34
+
35
def acronym_check(doc):
    """Check that every acronym is defined no later than the paragraph of its first use.

    An acronym is any run of two or more capital letters standing as a whole
    word.  It counts as defined once a paragraph contains the form
    "<word> (ACRONYM)", e.g. "Federal Aviation Administration (FAA)".
    Definitions are collected for the whole paragraph before usages are
    examined, so a definition anywhere in a paragraph covers every use in
    that same paragraph and in all later ones.

    Args:
        doc (list): List of paragraph texts from the document

    Returns:
        tuple: (bool, set) - (True if no undefined acronym was found,
        set of acronyms that were used before being defined)
    """
    acronym_pattern = re.compile(r'\b[A-Z]{2,}\b')
    # `\s+` instead of a single literal space: a non-breaking space, tab or
    # doubled space between the term and "(" is still a definition.  This
    # matches the definition pattern used by check_abbreviation_usage.
    definition_pattern = re.compile(r'\b\w+\s+\(([A-Z]{2,})\)')

    defined_acronyms = set()
    undefined_acronyms = set()

    for paragraph in doc:
        defined_acronyms.update(definition_pattern.findall(paragraph))
        undefined_acronyms.update(
            acronym
            for acronym in acronym_pattern.findall(paragraph)
            if acronym not in defined_acronyms
        )

    return not undefined_acronyms, undefined_acronyms
55
 
56
def legal_check(doc):
    """Check for incorrect legal references and wording, and suggest corrections.

    Two kinds of findings are produced for each paragraph, in this order:

    1. Capitalisation of "title 14": it must be "Title 14" at the start of
       the paragraph or directly after sentence-ending punctuation
       (".", "!", "?"), and "title 14" when preceded by whitespace.
    2. Every occurrence of a term listed in ``incorrect_variations``.

    Args:
        doc (list): List of paragraphs/strings to check

    Returns:
        tuple: (bool, list) - (True if no errors found, list of
        (incorrect text as found, suggested replacement) tuples)
    """
    # Mapping of incorrect terms (regular expressions) to their corrections.
    incorrect_variations = {
        r"\bUSC\b": "U.S.C.",
        r"\bCFR Part\b": "CFR part",
        # No trailing \b: after the final "." a word boundary only exists
        # when a letter or digit follows immediately, so "C.F.R. part 25"
        # would never match.
        r"\bC\.F\.R\.": "CFR",
        r"\bWe\b": "The FAA",
        r"\bwe\b": "the FAA",
        r"\bcancelled\b": "canceled",
        r"\bshall\b": "must or will",
        # No \b around "&": a boundary needs a word character on each side,
        # which excludes the usual spaced form "A & B".
        r"&": "and",
    }

    # Compile once instead of on every paragraph.
    variation_patterns = [
        (re.compile(pattern), correct_term)
        for pattern, correct_term in incorrect_variations.items()
    ]
    title_14_pattern = re.compile(
        r"(?P<prefix>^|[.!?\s])\s*(?P<title>title 14|Title 14)\b"
    )

    # List to store tuples of incorrect terms and their correct versions
    incorrect_legal_references = []

    for paragraph in doc:
        # Special handling for "Title 14" / "title 14"
        for match in title_14_pattern.finditer(paragraph):
            prefix = match.group('prefix')
            current_title = match.group('title')

            if prefix in ('.', '!', '?', ''):
                # Start of paragraph or of a sentence: must be capitalised.
                if current_title != "Title 14":
                    incorrect_legal_references.append((current_title, "Title 14"))
            elif prefix.isspace() and current_title != "title 14":
                # Within a sentence: must be lower case.
                incorrect_legal_references.append((current_title, "title 14"))

        # Check other variations
        for pattern, correct_term in variation_patterns:
            for match in pattern.finditer(paragraph):
                incorrect_legal_references.append((match.group(), correct_term))

    return len(incorrect_legal_references) == 0, incorrect_legal_references
104
 
105
def table_caption_check(doc, doc_type):
    """
    Check for correctly formatted table captions in the document.

    A paragraph is treated as a caption when, after stripping, it begins
    with the word "table" (any letter case) that is not continued by another
    letter.  Body text such as "Tables 1-1 and 1-2 list ..." is therefore
    not mistaken for a caption, while a malformed caption such as
    "Table1-1. Limits" still is.

    Expected numbering:
      * "Advisory Circular" and "Order": "Table X-Y" (e.g. Table 1-2, Table C-1)
      * every other document type: "Table X"
    In both cases the number must be followed by "." or whitespace.

    Args:
        doc (list): List of paragraph texts from the document
        doc_type (str): Document type selecting the numbering scheme

    Returns:
        tuple: (bool, list) - (True if no bad caption was found,
        stripped text of every incorrectly formatted caption)
    """
    if doc_type in ("Advisory Circular", "Order"):
        # "X-Y" where X and Y can be either letters or numbers
        number_format = r'([A-Z0-9]+)-([A-Z0-9]+)'
    else:
        # "X" where X can be either letters or numbers
        number_format = r'([A-Z0-9]+)'
    table_caption_pattern = re.compile(
        rf'^Table\s+{number_format}[\.\s]', re.IGNORECASE
    )
    # "table" as a word of its own: excludes "Tables", "Tabletop", ...
    caption_start_pattern = re.compile(r'^table(?![a-z])', re.IGNORECASE)

    incorrect_captions = []
    for paragraph in doc:
        paragraph_strip = paragraph.strip()
        if not caption_start_pattern.match(paragraph_strip):
            continue
        if not table_caption_pattern.match(paragraph_strip):
            incorrect_captions.append(paragraph_strip)

    return len(incorrect_captions) == 0, incorrect_captions
126
 
127
def figure_caption_check(doc, doc_type):
    """
    Check for correctly formatted figure captions in the document.

    A paragraph is treated as a caption when, after stripping, it begins
    with the word "figure" (any letter case) that is not continued by
    another letter.  Body text such as "Figures 1-1 and 1-2 show ..." is
    therefore not mistaken for a caption, while a malformed caption such as
    "Figure1-1. Layout" still is.

    Expected numbering:
      * "Advisory Circular" and "Order": "Figure X-Y" (e.g. Figure 1-2, Figure C-1)
      * every other document type: "Figure X"
    In both cases the number must be followed by "." or whitespace.

    Args:
        doc (list): List of paragraph texts from the document
        doc_type (str): Document type selecting the numbering scheme

    Returns:
        tuple: (bool, list) - (True if no bad caption was found,
        stripped text of every incorrectly formatted caption)
    """
    if doc_type in ("Advisory Circular", "Order"):
        # "X-Y" where X and Y can be either letters or numbers
        number_format = r'([A-Z0-9]+)-([A-Z0-9]+)'
    else:
        # "X" where X can be either letters or numbers
        number_format = r'([A-Z0-9]+)'
    figure_caption_pattern = re.compile(
        rf'^Figure\s+{number_format}[\.\s]', re.IGNORECASE
    )
    # "figure" as a word of its own: excludes "Figures", "Figured", ...
    caption_start_pattern = re.compile(r'^figure(?![a-z])', re.IGNORECASE)

    incorrect_fig_captions = []
    for paragraph in doc:
        paragraph_strip = paragraph.strip()
        if not caption_start_pattern.match(paragraph_strip):
            continue
        if not figure_caption_pattern.match(paragraph_strip):
            incorrect_fig_captions.append(paragraph_strip)

    return len(incorrect_fig_captions) == 0, incorrect_fig_captions
147
 
148
def table_figure_reference_check(doc, doc_type):
    """Check for incorrect references to tables and figures in the document.

    Paragraphs that start with "table" or "figure" (any letter case) are
    taken to be captions and are skipped.  In every other paragraph:

      * for "Advisory Circular" and "Order" the correct form is
        "Table X-Y" / "Figure X-Y", so a reference with a plain number
        ("Table 5") is reported;
      * for every other document type the correct form is "Table X" /
        "Figure X", so a hyphenated reference ("Table 5-1") is reported.

    Only purely numeric identifiers are examined.

    Args:
        doc (list): List of paragraph texts from the document
        doc_type (str): Document type selecting the numbering scheme

    Returns:
        tuple: (bool, list) - (True if no incorrect reference was found,
        the text of each incorrect reference; per paragraph, table
        references come before figure references)
    """
    if doc_type in ("Advisory Circular", "Order"):
        # Incorrect: a number that is not followed by "-<digits>".
        number_format = r'\d+(?!-\d+)'
    else:
        # Incorrect: the hyphenated form.  There is deliberately no capture
        # group, so findall() returns the full reference text.
        number_format = r'\d+-\d+'
    incorrect_table_ref_pattern = re.compile(
        rf'\bTable\s+{number_format}\b', re.IGNORECASE
    )
    incorrect_figure_ref_pattern = re.compile(
        rf'\bFigure\s+{number_format}\b', re.IGNORECASE
    )

    incorrect_table_figure_references = []
    for paragraph in doc:
        # Exclude captions
        if paragraph.strip().lower().startswith(('table', 'figure')):
            continue
        incorrect_table_figure_references.extend(
            incorrect_table_ref_pattern.findall(paragraph)
        )
        incorrect_table_figure_references.extend(
            incorrect_figure_ref_pattern.findall(paragraph)
        )

    return len(incorrect_table_figure_references) == 0, incorrect_table_figure_references
177
 
178
def document_title_check(doc_path, doc_type):
    """Check the formatting of titles that follow an AC number reference.

    The document is opened from ``doc_path`` and every paragraph is searched
    for text of the form "AC <number>[-<number>], <title>", where the title
    runs up to the next period, comma or end of the paragraph.  Each title
    is compared with the italics/quotation rules for ``doc_type``.

    Args:
        doc_path: Path or file-like object accepted by ``Document``
        doc_type (str): One of the keys of ``formatting_rules`` below

    Returns:
        tuple: (bool, list) - (True if every title is formatted correctly,
        list of {'text': matched text, 'issue': description} dicts)

    Raises:
        ValueError: If ``doc_type`` has no formatting rules.
    """
    incorrect_titles = []
    doc = Document(doc_path)

    # Captures the title that follows "AC 20-114, " / "AC 20-114 "
    ac_pattern = re.compile(r'AC\s+\d+(?:-\d+)?(?:,|\s)+(.+?)(?=\.|,|$)')

    # Straight and typographic quotation marks.  The typographic ones are
    # written as escapes so the source file's encoding cannot corrupt them.
    # NOTE(review): the single-quote characters also match apostrophes
    # inside a title (e.g. "Operator's"); confirm that this is intended.
    quote_chars = ('"', "'", '\u201c', '\u201d', '\u2018', '\u2019')

    # Define formatting rules for different document types
    formatting_rules = {
        "Advisory Circular": {"italics": True, "quotes": False},
        "Airworthiness Criteria": {"italics": False, "quotes": True},
        "Deviation Memo": {"italics": False, "quotes": True},
        "Exemption": {"italics": False, "quotes": True},
        "Federal Register Notice": {"italics": False, "quotes": True},
        "Handbook/Manual": {"italics": False, "quotes": False},
        "Order": {"italics": False, "quotes": True},
        "Policy Statement": {"italics": False, "quotes": False},
        "Rule": {"italics": False, "quotes": True},
        "Special Condition": {"italics": False, "quotes": True},
        "Technical Standard Order": {"italics": False, "quotes": True},
        "Other": {"italics": False, "quotes": False}
    }

    # Get the rules for the current document type
    if doc_type not in formatting_rules:
        raise ValueError(f"Unsupported document type: {doc_type}")

    required_format = formatting_rules[doc_type]

    for paragraph in doc.paragraphs:
        text = paragraph.text

        for match in ac_pattern.finditer(text):
            full_match = match.group(0)
            title_text = match.group(1).strip()

            # Offset of the first title character within the paragraph text
            title_start = match.start(1)

            # Check for any type of quotation marks, including smart quotes
            title_in_quotes = any(q in title_text for q in quote_chars)

            # The title's formatting is taken from the run that holds its
            # first character; run offsets are accumulated to locate it.
            title_is_italicized = False
            current_pos = 0
            for run in paragraph.runs:
                run_length = len(run.text)
                if current_pos <= title_start < current_pos + run_length:
                    # run.italic may be None (no explicit setting on the
                    # run); that is treated as "not italic" here.
                    title_is_italicized = bool(run.italic)
                    break
                current_pos += run_length

            issue_message = []

            # Check italics requirement
            if required_format["italics"] and not title_is_italicized:
                issue_message.append("should be italicized")
            elif not required_format["italics"] and title_is_italicized:
                issue_message.append("should not be italicized")

            # Check quotes requirement
            if required_format["quotes"] and not title_in_quotes:
                issue_message.append("should be in quotes")
            elif not required_format["quotes"] and title_in_quotes:
                issue_message.append("should not be in quotes")

            if issue_message:
                incorrect_titles.append({
                    'text': full_match,
                    'issue': ', '.join(issue_message)
                })

    return len(incorrect_titles) == 0, incorrect_titles
259
 
260
def get_document_checks(doc_type, template_type):
    """Look up the required headings for a document type.

    "Advisory Circular" is the only type that is further divided by
    template; for every other type ``template_type`` is ignored.

    Args:
        doc_type (str): Document type, e.g. "Order" or "Advisory Circular"
        template_type (str): Template name, used for Advisory Circulars only

    Returns:
        dict: {"required_headings": [...]} for a known combination, or an
        empty dict when the document type (or the Advisory Circular
        template) is not recognised.
    """
    def headings(*titles):
        # Each call builds a fresh dict and list, so entries never share state.
        return {"required_headings": list(titles)}

    needs_research = "TBD - Need to research"

    document_checks = {
        "Advisory Circular": {
            "Short AC template AC": headings(
                "PURPOSE.",
                "APPLICABILITY.",
                "CANCELLATION.",
                "RELATED MATERIAL.",
                "DEFINITION OF KEY TERMS.",
            ),
            "Long AC template AC": headings(
                "Purpose.",
                "Applicability.",
                "Cancellation.",
                "Related Material.",
                "Definition of Key Terms.",
            ),
        },
        "Airworthiness Criteria": headings(needs_research),
        "Deviation Memo": headings(needs_research),
        "Exemption": headings(needs_research),
        "Federal Register Notice": headings(
            "Purpose of This Notice",
            "Audience",
            "Where can I Find This Notice",
        ),
        "Handbook/Manual": headings(needs_research),
        "Order": headings(
            "Purpose of This Order.",
            "Audience.",
            "Where to Find This Order.",
        ),
        "Policy Statement": headings(
            "SUMMARY",
            "CURRENT REGULATORY AND ADVISORY MATERIAL",
            "RELEVANT PAST PRACTICE",
            "POLICY",
            "EFFECT OF POLICY",
            "CONCLUSION",
        ),
        "Rule": headings(needs_research),
        "Special Condition": headings(needs_research),
        "Technical Standard Order": headings(
            "PURPOSE.",
            "APPLICABILITY.",
            "REQUIREMENTS.",
            "MARKING.",
            "APPLICATION DATA REQUIREMENTS.",
            "MANUFACTURER DATA REQUIREMENTS.",
            "FURNISHED DATA REQUIREMENTS.",
            "HOW TO GET REFERENCED DOCUMENTS.",
        ),
        "Other": headings("N/A"),
    }

    # Debugging logs
    logger = logging.getLogger(__name__)
    logger.info(f"Requested document type: {doc_type}")
    logger.info(f"Requested template type: {template_type}")

    checks = document_checks.get(doc_type, {})
    if doc_type == "Advisory Circular":
        # One level deeper: pick the entry for the requested template.
        checks = checks.get(template_type, {})

    logger.info(f"Retrieved checks: {checks}")
    return checks
368
+
369
def double_period_check(doc):
    """Check for sentences that end with two periods.

    Each paragraph is split into sentences at runs of spaces that follow
    ".", "!" or "?".  A sentence is reported when, after surrounding
    whitespace is removed, it ends in exactly two periods; three or more
    periods are treated as an ellipsis and are not reported.

    Args:
        doc (list): List of paragraph texts from the document

    Returns:
        tuple: (bool, list) - (True if no double period was found,
        stripped text of each offending sentence)
    """
    sentence_boundary = re.compile(r'(?<=[.!?]) +')
    # Exactly two periods at the very end: the lookbehind rejects "...".
    double_period_ending = re.compile(r'(?<!\.)\.\.\Z')

    incorrect_sentences = []
    for paragraph in doc:
        for sentence in sentence_boundary.split(paragraph):
            # Strip first so a trailing tab or newline cannot hide the "..".
            sentence = sentence.strip()
            if double_period_ending.search(sentence):
                incorrect_sentences.append(sentence)

    return not incorrect_sentences, incorrect_sentences
381
 
382
def spacing_check(doc):
    """
    Check for correct spacing in US federal regulatory documents.

    A paragraph is reported (once, however many rules it breaks) when it
    contains any of:
    - a document type joined to its number (e.g. "AC20-114" instead of
      "AC 20-114"); the type must start at a word boundary
    - a section symbol joined to its number (e.g. "§25.301" instead of
      "§ 25.301")
    - "Part" joined to its number (e.g. "Part25" instead of "Part 25")
    - an opening parenthesis attached to the preceding text and followed by
      a letter or a digit 1-9, unless the parentheses enclose a paragraph
      designator: a single letter "(a)", a roman numeral "(ii)" or a
      number "(10)"
    - two or more consecutive whitespace characters

    Args:
        doc (list): List of paragraph texts from the document

    Returns:
        tuple: (bool, list) - (True if no spacing problem was found,
        the offending paragraphs in document order)
    """
    spacing_patterns = (
        # \b rather than "not preceded by whitespace": the usual case is a
        # reference that follows a space, which must be detected too.
        re.compile(r'\b(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)', re.IGNORECASE),
        re.compile(r'(§{1,2})(\d+\.\d+)'),
        re.compile(r'\bPart(\d+)', re.IGNORECASE),
        # (?<=\S) requires real preceding text, so a paragraph that merely
        # opens with a parenthesis is not reported.
        re.compile(
            r'(?<=\S)\((?!(?:[a-z]|[ivxlc]+|\d+)\))[a-z1-9]', re.IGNORECASE
        ),
        re.compile(r'\s{2,}'),
    )

    incorrect_spacing = [
        paragraph
        for paragraph in doc
        if any(pattern.search(paragraph) for pattern in spacing_patterns)
    ]

    return len(incorrect_spacing) == 0, incorrect_spacing
423
 
424
def check_prohibited_phrases(doc):
    """Check for prohibited words or phrases.

    Each phrase is matched as a whole word or phrase, ignoring letter case.

    Args:
        doc (list): List of paragraph texts from the document

    Returns:
        list: (phrase, stripped paragraph) tuples, one for every prohibited
        phrase found in every paragraph, in document order and then in the
        order of the phrase list.
    """
    prohibited_phrases = ('above', 'below', 'there is', 'there are')
    # The plain phrase is kept next to its pattern.  Deriving it with
    # str.strip(r'\b') removes *characters*, not a prefix, and turned
    # "below" into "elow".
    phrase_patterns = [
        (phrase, re.compile(rf'\b{re.escape(phrase)}\b', re.IGNORECASE))
        for phrase in prohibited_phrases
    ]

    issues = []
    for paragraph in doc:
        for phrase, pattern in phrase_patterns:
            if pattern.search(paragraph):
                issues.append((phrase, paragraph.strip()))
    return issues
438
+
439
def _trim_full_term(full_term, acronym):
    """Cut the sentence lead-in off the text captured before "(ACRONYM)".

    Scanning backwards, the term is taken to start at the last word that
    begins with the acronym's first letter (ignoring case) and leaves at
    least as many words as the acronym has letters.  When no word
    qualifies, the whole captured text is returned stripped.
    """
    first_letter = acronym[0].lower()
    words = list(re.finditer(r'\S+', full_term))
    for index in range(len(words) - 1, -1, -1):
        word = words[index]
        if (word.group()[0].lower() == first_letter
                and len(words) - index >= len(acronym)):
            # Slice the original text so its internal spacing is preserved.
            return full_term[word.start():].strip()
    return full_term.strip()


def check_abbreviation_usage(doc):
    """Check for abbreviation consistency after first definition.

    A definition has the form "Full Term (ABC)".  Once an abbreviation has
    been defined, every later paragraph that still contains the full term
    is reported.  The defining paragraph itself is never reported.

    Args:
        doc (list): List of paragraph texts from the document

    Returns:
        list: (full term, abbreviation, stripped paragraph) tuples
    """
    # Find definitions like "Federal Aviation Administration (FAA)"
    definition_pattern = re.compile(r'\b([A-Za-z &]+)\s+\(([A-Z]{2,})\)')

    abbreviations = {}
    issues = []
    for paragraph in doc:
        for captured_text, acronym in definition_pattern.findall(paragraph):
            if acronym in abbreviations:
                continue
            # The capture runs back to the previous punctuation mark, so it
            # usually includes words such as "The" or "issued by the".
            full_term = _trim_full_term(captured_text, acronym)
            if not full_term:
                # An empty term is contained in every paragraph and would
                # flag the rest of the document.
                continue
            abbreviations[acronym] = {"full_term": full_term, "defined": True}

        # Check for full term usage after definition
        for acronym, data in abbreviations.items():
            full_term = data["full_term"]
            if full_term not in paragraph:
                continue
            if data["defined"]:
                # First sighting is the definition itself; only later
                # occurrences are reported.
                data["defined"] = False
            else:
                issues.append((full_term, acronym, paragraph.strip()))

    return issues
 
 
 
 
 
 
 
 
 
462
 
463
def check_date_formats(doc):
    """Check for dates written in numeric slash format.

    Every date of the form M/D/YY ... MM/DD/YYYY is reported; the expected
    style is the spelled-out form, e.g. "January 5, 2024".

    Args:
        doc (list): List of paragraph texts from the document

    Returns:
        list: (date text, stripped paragraph) tuples, one per numeric date
    """
    # A string that matched this pattern can never also be a spelled-out
    # date, so no second validation pass is needed.
    date_pattern = re.compile(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b')  # MM/DD/YYYY

    date_issues = []
    for paragraph in doc:
        stripped = paragraph.strip()
        date_issues.extend(
            (date, stripped) for date in date_pattern.findall(paragraph)
        )
    return date_issues
475
 
476
def check_placeholders(doc):
    """Check for placeholders that should be removed.

    Each placeholder is matched as a whole word or phrase, ignoring letter
    case.

    Args:
        doc (list): List of paragraph texts from the document

    Returns:
        list: (placeholder, stripped paragraph) tuples, one for every
        placeholder found in every paragraph, in document order and then in
        the order of the placeholder list.
    """
    placeholder_phrases = ('TBD', 'To be determined', 'To be added')
    # The plain phrase is kept next to its pattern instead of being derived
    # with str.strip(r'\b'), which removes characters rather than a prefix
    # and would mangle any phrase that starts or ends with "b".
    phrase_patterns = [
        (phrase, re.compile(rf'\b{re.escape(phrase)}\b', re.IGNORECASE))
        for phrase in placeholder_phrases
    ]

    issues = []
    for paragraph in doc:
        for phrase, pattern in phrase_patterns:
            if pattern.search(paragraph):
                issues.append((phrase, paragraph.strip()))
    return issues
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
 
490
  def process_file(file_obj, doc_type, template_type):
491
  """
 
720
  # Placeholder Check
721
  results.append("## Placeholder Check")
722
  if not kwargs['placeholder_issues']:
723
+ results.append("✅ No future references or placeholders found.\n")
724
  else:
725
  results.append("❌ Placeholders Found:")
726
  for phrase, paragraph in kwargs['placeholder_issues']:
 
761
  with demo:
762
  gr.Markdown("# Document Checker Tool")
763
  gr.Markdown("Upload a Word (docx) document to check for compliance with U.S. federal documentation standards.")
764
+ gr.Markdown("### This tool is still in development")
765
  gr.Markdown("Contact Eric Putnam if you have questions and comments.")
766
 
767
  document_types = [