Hoctar77 committed on
Commit
3c21950
·
verified ·
1 Parent(s): 0750bee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -26
app.py CHANGED
@@ -122,6 +122,197 @@ def table_figure_reference_check(paragraphs, doc_type):
122
 
123
  return len(incorrect_table_figure_references) == 0, incorrect_table_figure_references
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def double_period_check(paragraphs):
126
  incorrect_sentences = []
127
 
@@ -222,21 +413,20 @@ def process_document(file_obj, doc_type, template_type):
222
  paragraphs = [para.text for para in doc.paragraphs]
223
  required_headings = get_document_checks(doc_type, template_type).get("required_headings", [])
224
 
225
- # Calls to each function with `paragraphs` as input
226
  heading_valid, headings_found = heading_title_check(paragraphs, required_headings)
227
  acronyms_valid, undefined_acronyms = acronym_check(paragraphs)
228
  legal_valid, incorrect_legal_references = legal_check(paragraphs)
229
  table_valid, incorrect_captions = table_caption_check(paragraphs, doc_type)
230
  figure_valid, incorrect_fig_captions = figure_caption_check(paragraphs, doc_type)
231
  references_valid, incorrect_table_figure_references = table_figure_reference_check(paragraphs, doc_type)
 
232
  double_period_valid, incorrect_sentences = double_period_check(paragraphs)
233
  spacing_valid, incorrect_spacing = spacing_check(paragraphs)
234
- placeholder_issues = check_prohibited_phrases(paragraphs)
235
-
236
- # Ensure title style check is included
237
- title_style_valid, incorrect_titles = document_title_check(file_obj, doc_type) if doc_type in ["Advisory Circular", "Order"] else (True, [])
238
 
239
- # Return all results
240
  results = format_results_for_gradio(
241
  heading_valid=heading_valid, headings_found=headings_found,
242
  acronyms_valid=acronyms_valid, undefined_acronyms=undefined_acronyms,
@@ -244,10 +434,11 @@ def process_document(file_obj, doc_type, template_type):
244
  table_valid=table_valid, incorrect_captions=incorrect_captions,
245
  figure_valid=figure_valid, incorrect_fig_captions=incorrect_fig_captions,
246
  references_valid=references_valid, incorrect_table_figure_references=incorrect_table_figure_references,
 
247
  double_period_valid=double_period_valid, incorrect_sentences=incorrect_sentences,
248
  spacing_valid=spacing_valid, incorrect_spacing=incorrect_spacing,
249
- title_style_valid=title_style_valid, incorrect_titles=incorrect_titles,
250
- placeholder_issues=placeholder_issues,
251
  required_headings=required_headings, doc_type=doc_type
252
  )
253
  return results
@@ -255,22 +446,6 @@ def process_document(file_obj, doc_type, template_type):
255
  print(f"Error in process_document: {str(e)}")
256
  return f"An error occurred while processing the document: {str(e)}"
257
 
258
- def get_document_checks(doc_type, template_type):
259
- """Return the required headings and other checks based on document type."""
260
- if doc_type == "Advisory Circular":
261
- if template_type == "Short AC template AC":
262
- return {
263
- "required_headings": ["Purpose", "Applicability", "Related Reading Material",
264
- "Background", "Discussion"]
265
- }
266
- else: # Long AC template
267
- return {
268
- "required_headings": ["Purpose", "Applicability", "Audience", "Related Reading Material",
269
- "Background", "Discussion", "Conclusion"]
270
- }
271
- # Add other document types as needed
272
- return {"required_headings": []}
273
-
274
  def format_results_for_gradio(**kwargs):
275
  """Format the results for display in Gradio."""
276
  results = []
@@ -473,8 +648,7 @@ with demo:
473
  with gr.Column(scale=2):
474
  output = gr.Markdown(
475
  label="Check Results",
476
- value="Results will appear here after processing..."
477
- )
478
 
479
  # Update template type visibility based on document type
480
  def update_template_visibility(doc_type):
 
122
 
123
  return len(incorrect_table_figure_references) == 0, incorrect_table_figure_references
124
 
125
def document_title_check(doc_path, doc_type):
    """Check the styling of Advisory Circular titles cited in a document.

    Every paragraph is scanned for references of the form
    ``AC <number>[-<number>]`` followed by a title. The title is taken to
    run up to the first period, comma, or end of the paragraph text. Each
    title is then compared against the italics/quotes rule that applies to
    ``doc_type``.

    Args:
        doc_path: Path or file-like object handed straight to ``Document``.
        doc_type: Document type name; must be a key of the rule table below.

    Returns:
        A tuple ``(is_valid, incorrect_titles)``. ``is_valid`` is True when
        no issue was found. ``incorrect_titles`` is a list of dicts with the
        keys ``'text'`` (the full matched reference) and ``'issue'`` (a
        comma-separated description of what is wrong).

    Raises:
        ValueError: If ``doc_type`` has no entry in the rule table.
    """
    incorrect_titles = []
    doc = Document(doc_path)

    # "AC 20-115, Some Title." -> group 1 is "Some Title".
    ac_pattern = re.compile(r'AC\s+\d+(?:-\d+)?(?:,|\s)+(.+?)(?=\.|,|$)')

    # Straight and typographic quotation marks. The curly quotes are written
    # as escapes so that they survive editors and tools that normalise
    # "smart" quotes to plain ASCII ones.
    quote_chars = ('"', "'", '\u201c', '\u201d', '\u2018', '\u2019')

    # Required title styling per document type.
    formatting_rules = {
        "Advisory Circular": {"italics": True, "quotes": False},
        "Airworthiness Criteria": {"italics": False, "quotes": True},
        "Deviation Memo": {"italics": False, "quotes": True},
        "Exemption": {"italics": False, "quotes": True},
        "Federal Register Notice": {"italics": False, "quotes": True},
        "Handbook/Manual": {"italics": False, "quotes": False},
        "Order": {"italics": False, "quotes": True},
        "Policy Statement": {"italics": False, "quotes": False},
        "Rule": {"italics": False, "quotes": True},
        "Special Condition": {"italics": False, "quotes": True},
        "Technical Standard Order": {"italics": False, "quotes": True},
        "Other": {"italics": False, "quotes": False},
    }

    if doc_type not in formatting_rules:
        raise ValueError(f"Unsupported document type: {doc_type}")

    required_format = formatting_rules[doc_type]

    for paragraph in doc.paragraphs:
        for match in ac_pattern.finditer(paragraph.text):
            title_text = match.group(1).strip()
            # Offset of the title's first character within the paragraph text.
            title_start = match.start(1)

            title_in_quotes = any(q in title_text for q in quote_chars)

            # Italic state is read from the run that contains the first
            # character of the title.
            title_is_italicized = False
            current_pos = 0
            for run in paragraph.runs:
                run_length = len(run.text)
                if current_pos <= title_start < current_pos + run_length:
                    # run.italic may be None (no explicit setting on the
                    # run); treat that the same as "not italic".
                    title_is_italicized = bool(run.italic)
                    break
                current_pos += run_length

            issue_message = []

            # Italics requirement.
            if required_format["italics"] and not title_is_italicized:
                issue_message.append("should be italicized")
            elif not required_format["italics"] and title_is_italicized:
                issue_message.append("should not be italicized")

            # Quotes requirement.
            if required_format["quotes"] and not title_in_quotes:
                issue_message.append("should be in quotes")
            elif not required_format["quotes"] and title_in_quotes:
                issue_message.append("should not be in quotes")

            if issue_message:
                incorrect_titles.append({
                    'text': match.group(0),
                    'issue': ', '.join(issue_message)
                })

    return len(incorrect_titles) == 0, incorrect_titles
206
+
207
def get_document_checks(doc_type, template_type):
    """Look up the check configuration for a document type.

    Advisory Circulars are keyed one level deeper, by ``template_type``;
    every other document type maps directly to its configuration and
    ``template_type`` is only logged.

    Args:
        doc_type: Document type name, e.g. ``"Order"``.
        template_type: Template name; used for the lookup only when
            ``doc_type`` is ``"Advisory Circular"``.

    Returns:
        A dict holding a ``"required_headings"`` list, or an empty dict
        when the document type (or the Advisory Circular template) is not
        in the table.
    """

    def _pending():
        # Fresh placeholder entry for types whose headings are not defined yet.
        # NOTE(review): the placeholder text is returned as if it were a real
        # required heading - confirm the heading check tolerates this.
        return {"required_headings": ["TBD - Need to research"]}

    checks_by_type = {
        "Advisory Circular": {
            "Short AC template AC": {
                "required_headings": [
                    "PURPOSE.",
                    "APPLICABILITY.",
                    "CANCELLATION.",
                    "RELATED MATERIAL.",
                    "DEFINITION OF KEY TERMS.",
                ]
            },
            "Long AC template AC": {
                "required_headings": [
                    "Purpose.",
                    "Applicability.",
                    "Cancellation.",
                    "Related Material.",
                    "Definition of Key Terms.",
                ]
            },
        },
        "Airworthiness Criteria": _pending(),
        "Deviation Memo": _pending(),
        "Exemption": _pending(),
        "Federal Register Notice": {
            "required_headings": [
                "Purpose of This Notice",
                "Audience",
                "Where can I Find This Notice",
            ]
        },
        "Handbook/Manual": _pending(),
        "Order": {
            "required_headings": [
                "Purpose of This Order.",
                "Audience.",
                "Where to Find This Order.",
            ]
        },
        "Policy Statement": {
            "required_headings": [
                "SUMMARY",
                "CURRENT REGULATORY AND ADVISORY MATERIAL",
                "RELEVANT PAST PRACTICE",
                "POLICY",
                "EFFECT OF POLICY",
                "CONCLUSION",
            ]
        },
        "Rule": _pending(),
        "Special Condition": _pending(),
        "Technical Standard Order": {
            "required_headings": [
                "PURPOSE.",
                "APPLICABILITY.",
                "REQUIREMENTS.",
                "MARKING.",
                "APPLICATION DATA REQUIREMENTS.",
                "MANUFACTURER DATA REQUIREMENTS.",
                "FURNISHED DATA REQUIREMENTS.",
                "HOW TO GET REFERENCED DOCUMENTS.",
            ]
        },
        "Other": {"required_headings": ["N/A"]},
    }

    # Debug logging of the incoming request.
    log = logging.getLogger(__name__)
    log.info(f"Requested document type: {doc_type}")
    log.info(f"Requested template type: {template_type}")

    selected = checks_by_type.get(doc_type, {})
    if doc_type == "Advisory Circular":
        # Advisory Circulars carry one configuration per template.
        selected = selected.get(template_type, {})

    log.info(f"Retrieved checks: {selected}")
    return selected
315
+
316
  def double_period_check(paragraphs):
317
  incorrect_sentences = []
318
 
 
413
  paragraphs = [para.text for para in doc.paragraphs]
414
  required_headings = get_document_checks(doc_type, template_type).get("required_headings", [])
415
 
416
+ # Perform each check with `paragraphs` as input
417
  heading_valid, headings_found = heading_title_check(paragraphs, required_headings)
418
  acronyms_valid, undefined_acronyms = acronym_check(paragraphs)
419
  legal_valid, incorrect_legal_references = legal_check(paragraphs)
420
  table_valid, incorrect_captions = table_caption_check(paragraphs, doc_type)
421
  figure_valid, incorrect_fig_captions = figure_caption_check(paragraphs, doc_type)
422
  references_valid, incorrect_table_figure_references = table_figure_reference_check(paragraphs, doc_type)
423
+ title_style_valid, incorrect_titles = document_title_check(file_obj, doc_type) if doc_type in ["Advisory Circular", "Order"] else (True, [])
424
  double_period_valid, incorrect_sentences = double_period_check(paragraphs)
425
  spacing_valid, incorrect_spacing = spacing_check(paragraphs)
426
+ date_issues = check_date_formats(paragraphs) # Pass paragraphs here
427
+ placeholder_issues = check_placeholders(paragraphs) # Pass paragraphs here
 
 
428
 
429
+ # Format results
430
  results = format_results_for_gradio(
431
  heading_valid=heading_valid, headings_found=headings_found,
432
  acronyms_valid=acronyms_valid, undefined_acronyms=undefined_acronyms,
 
434
  table_valid=table_valid, incorrect_captions=incorrect_captions,
435
  figure_valid=figure_valid, incorrect_fig_captions=incorrect_fig_captions,
436
  references_valid=references_valid, incorrect_table_figure_references=incorrect_table_figure_references,
437
+ title_style_valid=title_style_valid, incorrect_titles=incorrect_titles,
438
  double_period_valid=double_period_valid, incorrect_sentences=incorrect_sentences,
439
  spacing_valid=spacing_valid, incorrect_spacing=incorrect_spacing,
440
+ date_issues=date_issues, # Added date_issues
441
+ placeholder_issues=placeholder_issues, # Added placeholder_issues
442
  required_headings=required_headings, doc_type=doc_type
443
  )
444
  return results
 
446
  print(f"Error in process_document: {str(e)}")
447
  return f"An error occurred while processing the document: {str(e)}"
448
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  def format_results_for_gradio(**kwargs):
450
  """Format the results for display in Gradio."""
451
  results = []
 
648
  with gr.Column(scale=2):
649
  output = gr.Markdown(
650
  label="Check Results",
651
+ ).markdown("Results will appear here after processing...")
 
652
 
653
  # Update template type visibility based on document type
654
  def update_template_visibility(doc_type):