Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -122,6 +122,197 @@ def table_figure_reference_check(paragraphs, doc_type):
|
|
122 |
|
123 |
return len(incorrect_table_figure_references) == 0, incorrect_table_figure_references
|
124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
def double_period_check(paragraphs):
|
126 |
incorrect_sentences = []
|
127 |
|
@@ -222,21 +413,20 @@ def process_document(file_obj, doc_type, template_type):
|
|
222 |
paragraphs = [para.text for para in doc.paragraphs]
|
223 |
required_headings = get_document_checks(doc_type, template_type).get("required_headings", [])
|
224 |
|
225 |
-
#
|
226 |
heading_valid, headings_found = heading_title_check(paragraphs, required_headings)
|
227 |
acronyms_valid, undefined_acronyms = acronym_check(paragraphs)
|
228 |
legal_valid, incorrect_legal_references = legal_check(paragraphs)
|
229 |
table_valid, incorrect_captions = table_caption_check(paragraphs, doc_type)
|
230 |
figure_valid, incorrect_fig_captions = figure_caption_check(paragraphs, doc_type)
|
231 |
references_valid, incorrect_table_figure_references = table_figure_reference_check(paragraphs, doc_type)
|
|
|
232 |
double_period_valid, incorrect_sentences = double_period_check(paragraphs)
|
233 |
spacing_valid, incorrect_spacing = spacing_check(paragraphs)
|
234 |
-
|
235 |
-
|
236 |
-
# Ensure title style check is included
|
237 |
-
title_style_valid, incorrect_titles = document_title_check(file_obj, doc_type) if doc_type in ["Advisory Circular", "Order"] else (True, [])
|
238 |
|
239 |
-
#
|
240 |
results = format_results_for_gradio(
|
241 |
heading_valid=heading_valid, headings_found=headings_found,
|
242 |
acronyms_valid=acronyms_valid, undefined_acronyms=undefined_acronyms,
|
@@ -244,10 +434,11 @@ def process_document(file_obj, doc_type, template_type):
|
|
244 |
table_valid=table_valid, incorrect_captions=incorrect_captions,
|
245 |
figure_valid=figure_valid, incorrect_fig_captions=incorrect_fig_captions,
|
246 |
references_valid=references_valid, incorrect_table_figure_references=incorrect_table_figure_references,
|
|
|
247 |
double_period_valid=double_period_valid, incorrect_sentences=incorrect_sentences,
|
248 |
spacing_valid=spacing_valid, incorrect_spacing=incorrect_spacing,
|
249 |
-
|
250 |
-
placeholder_issues=placeholder_issues,
|
251 |
required_headings=required_headings, doc_type=doc_type
|
252 |
)
|
253 |
return results
|
@@ -255,22 +446,6 @@ def process_document(file_obj, doc_type, template_type):
|
|
255 |
print(f"Error in process_document: {str(e)}")
|
256 |
return f"An error occurred while processing the document: {str(e)}"
|
257 |
|
258 |
-
def get_document_checks(doc_type, template_type):
|
259 |
-
"""Return the required headings and other checks based on document type."""
|
260 |
-
if doc_type == "Advisory Circular":
|
261 |
-
if template_type == "Short AC template AC":
|
262 |
-
return {
|
263 |
-
"required_headings": ["Purpose", "Applicability", "Related Reading Material",
|
264 |
-
"Background", "Discussion"]
|
265 |
-
}
|
266 |
-
else: # Long AC template
|
267 |
-
return {
|
268 |
-
"required_headings": ["Purpose", "Applicability", "Audience", "Related Reading Material",
|
269 |
-
"Background", "Discussion", "Conclusion"]
|
270 |
-
}
|
271 |
-
# Add other document types as needed
|
272 |
-
return {"required_headings": []}
|
273 |
-
|
274 |
def format_results_for_gradio(**kwargs):
|
275 |
"""Format the results for display in Gradio."""
|
276 |
results = []
|
@@ -473,8 +648,7 @@ with demo:
|
|
473 |
with gr.Column(scale=2):
|
474 |
output = gr.Markdown(
|
475 |
label="Check Results",
|
476 |
-
|
477 |
-
)
|
478 |
|
479 |
# Update template type visibility based on document type
|
480 |
def update_template_visibility(doc_type):
|
|
|
122 |
|
123 |
return len(incorrect_table_figure_references) == 0, incorrect_table_figure_references
|
124 |
|
125 |
+
# Matches "AC <number>[-<number>]", then commas/whitespace, then lazily captures
# the title up to the first period, comma, or end of the paragraph text.
# NOTE(review): AC numbers containing a dot or a letter suffix (for example
# "AC 25.1309-1A") do not appear to match this pattern -- confirm intent.
_AC_TITLE_PATTERN = re.compile(r'AC\s+\d+(?:-\d+)?(?:,|\s)+(.+?)(?=\.|,|$)')

# Required title styling for AC references, keyed by document type.
_TITLE_FORMATTING_RULES = {
    "Advisory Circular": {"italics": True, "quotes": False},
    "Airworthiness Criteria": {"italics": False, "quotes": True},
    "Deviation Memo": {"italics": False, "quotes": True},
    "Exemption": {"italics": False, "quotes": True},
    "Federal Register Notice": {"italics": False, "quotes": True},
    "Handbook/Manual": {"italics": False, "quotes": False},
    "Order": {"italics": False, "quotes": True},
    "Policy Statement": {"italics": False, "quotes": False},
    "Rule": {"italics": False, "quotes": True},
    "Special Condition": {"italics": False, "quotes": True},
    "Technical Standard Order": {"italics": False, "quotes": True},
    "Other": {"italics": False, "quotes": False},
}

# Quote characters are spelled as escapes so the curly variants cannot be
# silently flattened to ASCII quotes by an editor or a copy/paste.
_TITLE_DOUBLE_QUOTES = ('"', "\u201c", "\u201d")
_TITLE_SINGLE_QUOTES = ("'", "\u2018", "\u2019")


def _title_is_quoted(title_text):
    """Return True when the title carries quotation marks.

    A double quote anywhere in the title counts. A single quote only counts
    at the very start or end, so an apostrophe inside a word ("Pilot's") is
    not mistaken for quotation.
    """
    if any(mark in title_text for mark in _TITLE_DOUBLE_QUOTES):
        return True
    return (title_text.startswith(_TITLE_SINGLE_QUOTES)
            or title_text.endswith(_TITLE_SINGLE_QUOTES))


def _run_at_offset(runs, offset):
    """Return the run whose text covers character ``offset``, or None."""
    position = 0
    for run in runs:
        run_end = position + len(run.text)
        if position <= offset < run_end:
            return run
        position = run_end
    return None


def document_title_check(doc_path, doc_type):
    """Check the styling of titles that follow "AC <number>" references.

    Every paragraph of the document is scanned for AC references. For each
    one, the captured title is compared against the italics/quotes rule for
    ``doc_type``.

    Args:
        doc_path: Value passed straight to ``Document(...)``.
        doc_type: Document type name; must be a key of the formatting rules.

    Returns:
        A ``(is_valid, incorrect_titles)`` tuple. ``incorrect_titles`` is a
        list of dicts with ``'text'`` (the full matched reference) and
        ``'issue'`` (comma-separated description of what is wrong).

    Raises:
        ValueError: If ``doc_type`` has no formatting rule.
    """
    # Validate before parsing so a bad type fails fast without opening the file.
    if doc_type not in _TITLE_FORMATTING_RULES:
        raise ValueError(f"Unsupported document type: {doc_type}")
    required_format = _TITLE_FORMATTING_RULES[doc_type]

    incorrect_titles = []
    doc = Document(doc_path)

    for paragraph in doc.paragraphs:
        for match in _AC_TITLE_PATTERN.finditer(paragraph.text):
            title_text = match.group(1).strip()

            # Italic state is sampled from the run containing the first title
            # character. The lazy regex may capture text beyond the real
            # title, so later runs are deliberately not required to match.
            first_run = _run_at_offset(paragraph.runs, match.start(1))
            # run.italic may be None rather than True/False (presumably when
            # the setting is inherited -- confirm); treat that as not italic.
            title_is_italicized = first_run is not None and bool(first_run.italic)
            title_in_quotes = _title_is_quoted(title_text)

            issue_message = []
            if required_format["italics"] and not title_is_italicized:
                issue_message.append("should be italicized")
            elif not required_format["italics"] and title_is_italicized:
                issue_message.append("should not be italicized")

            if required_format["quotes"] and not title_in_quotes:
                issue_message.append("should be in quotes")
            elif not required_format["quotes"] and title_in_quotes:
                issue_message.append("should not be in quotes")

            if issue_message:
                incorrect_titles.append({
                    'text': match.group(0),
                    'issue': ', '.join(issue_message),
                })

    return not incorrect_titles, incorrect_titles
|
206 |
+
|
207 |
+
def get_document_checks(doc_type, template_type):
    """Return the required headings and other checks based on document type.

    Args:
        doc_type: Document type name, e.g. "Advisory Circular" or "Order".
        template_type: Template name; only consulted when ``doc_type`` is
            "Advisory Circular".

    Returns:
        A dict with a ``"required_headings"`` list. An empty dict is returned
        for an unknown document type or an unknown Advisory Circular template.
        Document types whose headings have not been researched yet return an
        empty list, so no heading is reported as missing for them.
    """
    document_checks = {
        "Advisory Circular": {
            "Short AC template AC": {
                "required_headings": [
                    "PURPOSE.",
                    "APPLICABILITY.",
                    "CANCELLATION.",
                    "RELATED MATERIAL.",
                    "DEFINITION OF KEY TERMS.",
                ]
            },
            "Long AC template AC": {
                "required_headings": [
                    "Purpose.",
                    "Applicability.",
                    "Cancellation.",
                    "Related Material.",
                    "Definition of Key Terms.",
                ]
            },
        },
        # TODO: headings for the types below still need to be researched.
        # They stay empty rather than holding a placeholder string, because
        # every entry is passed on as a heading to look for in the document.
        "Airworthiness Criteria": {"required_headings": []},
        "Deviation Memo": {"required_headings": []},
        "Exemption": {"required_headings": []},
        "Federal Register Notice": {
            "required_headings": [
                "Purpose of This Notice",
                "Audience",
                "Where can I Find This Notice",
            ]
        },
        "Handbook/Manual": {"required_headings": []},  # TODO: research
        "Order": {
            "required_headings": [
                "Purpose of This Order.",
                "Audience.",
                "Where to Find This Order.",
            ]
        },
        "Policy Statement": {
            "required_headings": [
                "SUMMARY",
                "CURRENT REGULATORY AND ADVISORY MATERIAL",
                "RELEVANT PAST PRACTICE",
                "POLICY",
                "EFFECT OF POLICY",
                "CONCLUSION",
            ]
        },
        "Rule": {"required_headings": []},  # TODO: research
        "Special Condition": {"required_headings": []},  # TODO: research
        "Technical Standard Order": {
            "required_headings": [
                "PURPOSE.",
                "APPLICABILITY.",
                "REQUIREMENTS.",
                "MARKING.",
                "APPLICATION DATA REQUIREMENTS.",
                "MANUFACTURER DATA REQUIREMENTS.",
                "FURNISHED DATA REQUIREMENTS.",
                "HOW TO GET REFERENCED DOCUMENTS.",
            ]
        },
        # "Other" has no required headings.
        "Other": {"required_headings": []},
    }

    # Debug logging; %-style arguments defer formatting until a handler
    # actually emits the record.
    logger = logging.getLogger(__name__)
    logger.info("Requested document type: %s", doc_type)
    logger.info("Requested template type: %s", template_type)

    checks = document_checks.get(doc_type, {})
    if doc_type == "Advisory Circular":
        # Advisory Circulars are nested one level deeper, keyed by template.
        checks = checks.get(template_type, {})

    logger.info("Retrieved checks: %s", checks)
    return checks
|
315 |
+
|
316 |
def double_period_check(paragraphs):
|
317 |
incorrect_sentences = []
|
318 |
|
|
|
413 |
paragraphs = [para.text for para in doc.paragraphs]
|
414 |
required_headings = get_document_checks(doc_type, template_type).get("required_headings", [])
|
415 |
|
416 |
+
# Perform each check with `paragraphs` as input
|
417 |
heading_valid, headings_found = heading_title_check(paragraphs, required_headings)
|
418 |
acronyms_valid, undefined_acronyms = acronym_check(paragraphs)
|
419 |
legal_valid, incorrect_legal_references = legal_check(paragraphs)
|
420 |
table_valid, incorrect_captions = table_caption_check(paragraphs, doc_type)
|
421 |
figure_valid, incorrect_fig_captions = figure_caption_check(paragraphs, doc_type)
|
422 |
references_valid, incorrect_table_figure_references = table_figure_reference_check(paragraphs, doc_type)
|
423 |
+
title_style_valid, incorrect_titles = document_title_check(file_obj, doc_type) if doc_type in ["Advisory Circular", "Order"] else (True, [])
|
424 |
double_period_valid, incorrect_sentences = double_period_check(paragraphs)
|
425 |
spacing_valid, incorrect_spacing = spacing_check(paragraphs)
|
426 |
+
date_issues = check_date_formats(paragraphs) # Pass paragraphs here
|
427 |
+
placeholder_issues = check_placeholders(paragraphs) # Pass paragraphs here
|
|
|
|
|
428 |
|
429 |
+
# Format results
|
430 |
results = format_results_for_gradio(
|
431 |
heading_valid=heading_valid, headings_found=headings_found,
|
432 |
acronyms_valid=acronyms_valid, undefined_acronyms=undefined_acronyms,
|
|
|
434 |
table_valid=table_valid, incorrect_captions=incorrect_captions,
|
435 |
figure_valid=figure_valid, incorrect_fig_captions=incorrect_fig_captions,
|
436 |
references_valid=references_valid, incorrect_table_figure_references=incorrect_table_figure_references,
|
437 |
+
title_style_valid=title_style_valid, incorrect_titles=incorrect_titles,
|
438 |
double_period_valid=double_period_valid, incorrect_sentences=incorrect_sentences,
|
439 |
spacing_valid=spacing_valid, incorrect_spacing=incorrect_spacing,
|
440 |
+
date_issues=date_issues, # Added date_issues
|
441 |
+
placeholder_issues=placeholder_issues, # Added placeholder_issues
|
442 |
required_headings=required_headings, doc_type=doc_type
|
443 |
)
|
444 |
return results
|
|
|
446 |
print(f"Error in process_document: {str(e)}")
|
447 |
return f"An error occurred while processing the document: {str(e)}"
|
448 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
449 |
def format_results_for_gradio(**kwargs):
|
450 |
"""Format the results for display in Gradio."""
|
451 |
results = []
|
|
|
648 |
with gr.Column(scale=2):
|
649 |
# Results pane. The placeholder text is supplied as the component's initial
# value; chaining a .markdown(...) call onto the constructed component is not
# part of the component API and would fail when the UI is built.
output = gr.Markdown(
    value="Results will appear here after processing...",
    label="Check Results",
)
|
|
|
652 |
|
653 |
# Update template type visibility based on document type
|
654 |
def update_template_visibility(doc_type):
|