# Hugging Face Space application (status shown on the hosting page: Running).
import gradio as gr | |
import logging | |
import re | |
from docx import Document | |
import io | |
import traceback | |
def heading_title_check(doc, required_headings):
    """Report whether every required heading appears in the document.

    A paragraph counts as a heading when its style name begins with
    ``'Heading'``; its text is recorded with surrounding whitespace stripped.

    Args:
        doc: Document-like object exposing ``paragraphs``; each paragraph
            provides ``text`` and ``style.name``.
        required_headings: Heading titles that must each equal one of the
            collected heading texts.

    Returns:
        A ``(all_present, headings_found)`` tuple. If collecting the headings
        raises, the error is printed and ``(False, [])`` is returned.
    """
    try:
        # Style check comes first so text is only read for heading paragraphs.
        found = [
            para.text.strip()
            for para in doc.paragraphs
            if para.style.name.startswith('Heading')
        ]
    except Exception as e:
        print(f"Error in heading check: {str(e)}")
        return False, []

    missing = [title for title in required_headings if title not in found]
    return not missing, found
def acronym_check(doc):
    """Check that every acronym used in the document is defined somewhere.

    An acronym is any standalone run of 2-5 capital letters. It counts as
    defined when some paragraph contains it in parentheses with at least one
    character directly in front, e.g. ``"full term (ACRONYM)"``.

    Args:
        doc: Document-like object exposing ``paragraphs`` whose items have a
            ``text`` attribute.

    Returns:
        A ``(all_defined, undefined_acronyms)`` tuple; the list is sorted so
        the report is stable between runs. If reading the paragraphs raises,
        the error is printed and ``(False, [])`` is returned.
    """
    usage_pattern = re.compile(r'\b[A-Z]{2,5}\b')
    # The look-behind keeps the original rule that a definition needs some
    # text on the same line immediately before the opening parenthesis.
    definition_pattern = re.compile(r'(?<=.)\(([A-Z]{2,5})\)')

    used_acronyms = set()
    defined_acronyms = set()
    try:
        # One pass collects both sets instead of rescanning the whole
        # document for each acronym occurrence.
        for paragraph in doc.paragraphs:
            text = paragraph.text
            used_acronyms.update(usage_pattern.findall(text))
            defined_acronyms.update(definition_pattern.findall(text))
    except Exception as e:
        print(f"Error in acronym check: {str(e)}")
        return False, []

    undefined_acronyms = sorted(used_acronyms - defined_acronyms)
    return not undefined_acronyms, undefined_acronyms
def legal_check(doc):
    """Flag abbreviated legal citations that lack their spelled-out form.

    For each paragraph, an abbreviation from the mapping below is reported
    when it occurs and the corresponding full name does not occur in the same
    paragraph.

    Args:
        doc: Document-like object exposing ``paragraphs`` whose items have a
            ``text`` attribute.

    Returns:
        A ``(valid, incorrect_legal_references)`` tuple where the list holds
        ``(abbreviation, full_name)`` pairs, one per offending paragraph. If
        reading the paragraphs raises, the error is printed and
        ``(False, [])`` is returned.
    """
    incorrect_legal_references = []
    legal_terms = {
        "C.F.R.": "Code of Federal Regulations",
        "F.R.": "Federal Register",
        "U.S.C.": "United States Code",
    }
    # The look-behind rejects matches that are the tail of a longer dotted
    # abbreviation, so "F.R." is not reported for every "C.F.R.".
    term_patterns = {
        abbreviation: re.compile(r'(?<![\w.])' + re.escape(abbreviation))
        for abbreviation in legal_terms
    }
    try:
        for paragraph in doc.paragraphs:
            text = paragraph.text
            for abbreviation, full_name in legal_terms.items():
                if term_patterns[abbreviation].search(text) and full_name not in text:
                    incorrect_legal_references.append((abbreviation, full_name))
    except Exception as e:
        print(f"Error in legal check: {str(e)}")
        return False, []
    return len(incorrect_legal_references) == 0, incorrect_legal_references
def _element_text(element):
    """Return the text carried by a document XML element, or '' if none."""
    text = getattr(element, "text", None)
    if isinstance(text, str) and text.strip():
        return text
    # NOTE(review): on a raw lxml element ``text`` is presumably None or
    # whitespace even when nested runs hold the caption, so fall back to
    # ``itertext()`` -- confirm against the installed python-docx version.
    itertext = getattr(element, "itertext", None)
    if callable(itertext):
        return "".join(itertext())
    return text if isinstance(text, str) else ""


def table_caption_check(doc, doc_type):
    """Check the caption that immediately precedes each table.

    The element directly before every table is inspected; when its text
    starts with ``"Table"`` it is treated as that table's caption. For
    ``"Advisory Circular"`` documents a caption must start with ``"Table "``
    and contain ``". "`` (``"Table X. Caption text"``). Other document types
    are not validated.

    Args:
        doc: Document-like object exposing ``tables``; each table provides
            ``_element.getprevious()``.
        doc_type: Document type name selected by the user.

    Returns:
        A ``(valid, incorrect_captions)`` tuple. If inspecting the tables
        raises, the error is printed and ``(False, [])`` is returned.
    """
    incorrect_captions = []
    try:
        for table in doc.tables:
            previous = table._element.getprevious()
            if previous is None:
                continue  # table is the first element; no caption to check
            # A missing/None text must not abort the whole check.
            caption = _element_text(previous)
            if not caption.startswith("Table"):
                continue
            if doc_type == "Advisory Circular":
                if not caption.startswith("Table ") or ". " not in caption:
                    incorrect_captions.append(caption)
    except Exception as e:
        print(f"Error in table caption check: {str(e)}")
        return False, []
    return len(incorrect_captions) == 0, incorrect_captions
def figure_caption_check(doc, doc_type):
    """Validate the format of figure captions.

    Paragraphs whose text starts with ``"Figure"`` are treated as captions.
    For ``"Advisory Circular"`` documents a caption is reported when it does
    not contain ``". "`` (expected shape: ``"Figure X. Caption text"``).
    Captions in other document types are not validated.

    Args:
        doc: Document-like object exposing ``paragraphs`` whose items have a
            ``text`` attribute.
        doc_type: Document type name selected by the user.

    Returns:
        A ``(valid, incorrect_fig_captions)`` tuple. If reading the
        paragraphs raises, the error is printed and ``(False, [])`` is
        returned.
    """
    flagged = []
    try:
        for para in doc.paragraphs:
            if not para.text.startswith("Figure"):
                continue
            if doc_type != "Advisory Circular":
                continue  # no caption rule defined for other document types
            if ". " in para.text:
                continue
            flagged.append(para.text)
    except Exception as e:
        print(f"Error in figure caption check: {str(e)}")
        return False, []
    return not flagged, flagged
def table_figure_reference_check(doc, doc_type):
    """Flag paragraphs whose table/figure mention is not at the start.

    Only paragraphs containing ``"Table"`` or ``"Figure"`` are considered.
    For ``"Advisory Circular"`` documents such a paragraph is reported unless
    its text begins with ``"Table "`` or ``"Figure "``. Other document types
    are not validated.

    Args:
        doc: Document-like object exposing ``paragraphs`` whose items have a
            ``text`` attribute.
        doc_type: Document type name selected by the user.

    Returns:
        A ``(valid, incorrect_table_figure_references)`` tuple. If reading
        the paragraphs raises, the error is printed and ``(False, [])`` is
        returned.
    """
    accepted_prefixes = ("Table ", "Figure ")
    flagged = []
    try:
        for para in doc.paragraphs:
            body = para.text
            mentions_item = "Table" in body or "Figure" in body
            if not mentions_item:
                continue
            if doc_type == "Advisory Circular" and not body.startswith(accepted_prefixes):
                flagged.append(body)
    except Exception as e:
        print(f"Error in table/figure reference check: {str(e)}")
        return False, []
    return not flagged, flagged
def document_title_check(doc, doc_type):
    """Check the document title against the rule for its document type.

    Only the first paragraph is inspected, and only when its style name is
    exactly ``'Title'``. Rules applied:

    * Advisory Circular: must start with ``"ADVISORY CIRCULAR "`` and end
      with ``" AC"``.
    * Order / Federal Register Notice: must be enclosed in double quotes.
    * Policy Statement: must not start or end with a double quote.

    Args:
        doc: Document-like object exposing ``paragraphs``; each paragraph
            provides ``text`` and ``style.name``.
        doc_type: Document type name selected by the user.

    Returns:
        A ``(valid, incorrect_titles)`` tuple where each entry is a dict with
        ``"text"`` and ``"issue"`` keys. If inspecting the title raises, the
        error is printed and ``(False, [])`` is returned.
    """
    incorrect_titles = []
    try:
        if len(doc.paragraphs) > 0 and doc.paragraphs[0].style.name == 'Title':
            title_text = doc.paragraphs[0].text
            is_quoted = title_text.startswith('"') and title_text.endswith('"')
            issue = None
            if doc_type == "Advisory Circular":
                # Both conditions are required; the suffix test was previously
                # inverted, rejecting exactly the titles that were correct.
                if not (title_text.startswith("ADVISORY CIRCULAR ") and title_text.endswith(" AC")):
                    issue = "Advisory Circular titles should start with 'ADVISORY CIRCULAR ' and end with ' AC'"
            elif doc_type == "Order":
                if not is_quoted:
                    issue = "Order titles should be enclosed in quotation marks"
            elif doc_type == "Federal Register Notice":
                if not is_quoted:
                    issue = "Federal Register Notice titles should be enclosed in quotation marks"
            elif doc_type == "Policy Statement":
                if title_text.startswith('"') or title_text.endswith('"'):
                    issue = "Policy Statement titles should not have quotation marks"
            if issue is not None:
                incorrect_titles.append({"text": title_text, "issue": issue})
    except Exception as e:
        print(f"Error in document title check: {str(e)}")
        return False, []
    return len(incorrect_titles) == 0, incorrect_titles
def double_period_check(doc):
    """Find paragraphs that contain two consecutive periods.

    Args:
        doc: Document-like object exposing ``paragraphs`` whose items have a
            ``text`` attribute.

    Returns:
        A ``(valid, incorrect_sentences)`` tuple listing the text of every
        paragraph containing ``".."``. If reading the paragraphs raises, the
        error is printed and ``(False, [])`` is returned.
    """
    try:
        offenders = [para.text for para in doc.paragraphs if ".." in para.text]
    except Exception as e:
        print(f"Error in double period check: {str(e)}")
        return False, []
    return not offenders, offenders
def spacing_check(doc):
    """Find paragraphs that contain two or more consecutive spaces.

    Args:
        doc: Document-like object exposing ``paragraphs`` whose items have a
            ``text`` attribute.

    Returns:
        A ``(valid, incorrect_spacing)`` tuple listing the text of every
        paragraph with a run of consecutive spaces. If reading the paragraphs
        raises, the error is printed and ``(False, [])`` is returned.
    """
    # Built explicitly so the two-space needle cannot be silently collapsed
    # to one space (which would flag nearly every paragraph).
    double_space = " " * 2
    incorrect_spacing = []
    try:
        for paragraph in doc.paragraphs:
            if double_space in paragraph.text:
                incorrect_spacing.append(paragraph.text)
    except Exception as e:
        print(f"Error in spacing check: {str(e)}")
        return False, []
    return len(incorrect_spacing) == 0, incorrect_spacing
# Small words that may sit inside a spelled-out term without contributing a
# letter to its abbreviation (e.g. "Code of Federal Regulations (CFR)").
_ABBREVIATION_CONNECTORS = frozenset(
    {"a", "an", "and", "for", "in", "of", "on", "the", "to"}
)


def _spelled_out_words(prefix, abbr):
    """Return the words ending *prefix* whose initials spell *abbr*, else None."""
    words = re.findall(r"[A-Za-z][A-Za-z'\-]*", prefix)
    index = len(words) - 1
    picked = []
    # Walk backwards from the parenthesis, pairing each abbreviation letter
    # with a word initial and stepping over connector words in between.
    for letter in reversed(abbr):
        while (
            index >= 0
            and words[index][0].upper() != letter
            and words[index].lower() in _ABBREVIATION_CONNECTORS
        ):
            picked.append(words[index])
            index -= 1
        if index < 0 or words[index][0].upper() != letter:
            return None
        picked.append(words[index])
        index -= 1
    picked.reverse()
    return picked


def check_abbreviation_usage(doc):
    """Flag spelled-out terms that reappear after their abbreviation is defined.

    A definition is text of the form ``"Full Term (ABBR)"`` where ``ABBR`` is
    2-5 capital letters and the initials of the preceding words spell it.
    Only the first resolvable definition of each abbreviation is used. Any
    later occurrence of the full term (case-insensitive), in the remainder of
    the defining paragraph or in a following paragraph, is reported.
    Definitions whose full term cannot be derived from word initials are
    skipped.

    Args:
        doc: Document-like object exposing ``paragraphs`` whose items have a
            ``text`` attribute.

    Returns:
        A list of ``(full_term, abbreviation, paragraph_text)`` tuples, one
        per offending paragraph. If the check raises, the error is printed
        and ``[]`` is returned.
    """
    abbreviation_issues = []
    definition_pattern = re.compile(r'\(([A-Z]{2,5})\)')
    try:
        texts = [paragraph.text for paragraph in doc.paragraphs]
        resolved = set()
        for position, text in enumerate(texts):
            for match in definition_pattern.finditer(text):
                abbr = match.group(1)
                if abbr in resolved:
                    continue
                words = _spelled_out_words(text[:match.start()], abbr)
                if words is None:
                    continue
                resolved.add(abbr)
                full_term = " ".join(words)
                term_pattern = re.compile(
                    r'(?<!\w)' + r'\s+'.join(re.escape(word) for word in words) + r'(?!\w)',
                    re.IGNORECASE,
                )
                # Search only text that follows the definition itself.
                candidates = [(text, text[match.end():])]
                candidates.extend((later, later) for later in texts[position + 1:])
                for paragraph_text, searchable in candidates:
                    if term_pattern.search(searchable):
                        abbreviation_issues.append((full_term, abbr, paragraph_text))
    except Exception as e:
        print(f"Error in abbreviation check: {str(e)}")
        return []
    return abbreviation_issues
def check_date_formats(doc):
    """Find numeric slash-separated dates such as ``1/2/2024``.

    Args:
        doc: Document-like object exposing ``paragraphs`` whose items have a
            ``text`` attribute.

    Returns:
        A list of ``(date, paragraph_text)`` tuples, one per matched date, so
        the report can quote the offending date rather than the whole
        paragraph. If the check raises, the error is printed and ``[]`` is
        returned.
    """
    date_issues = []
    date_pattern = re.compile(r'\b\d{1,2}/\d{1,2}/\d{4}\b')
    try:
        for paragraph in doc.paragraphs:
            text = paragraph.text
            for date in date_pattern.findall(text):
                date_issues.append((date, text))
    except Exception as e:
        print(f"Error in date format check: {str(e)}")
        return []
    return date_issues
def check_placeholders(doc):
    """Find template placeholders that were left in the document.

    Args:
        doc: Document-like object exposing ``paragraphs`` whose items have a
            ``text`` attribute.

    Returns:
        A list of ``(placeholder, paragraph_text)`` tuples, one per
        placeholder found in a paragraph. If the check raises, the error is
        printed and ``[]`` is returned.
    """
    placeholders = ('[ENTER TEXT]', '[ENTER DATE]')
    placeholder_issues = []
    try:
        for paragraph in doc.paragraphs:
            text = paragraph.text
            for phrase in placeholders:
                if phrase in text:
                    # Report the placeholder itself, not the paragraph twice.
                    placeholder_issues.append((phrase, text))
    except Exception as e:
        print(f"Error in placeholder check: {str(e)}")
        return []
    return placeholder_issues
def get_document_checks(doc_type, template_type):
    """Look up the headings a document of the given type must contain.

    Args:
        doc_type: Document type name; only ``"Advisory Circular"`` has
            required headings.
        template_type: Advisory Circular template name.
            ``"Short AC template AC"`` selects the short list; any other
            value selects the long list.

    Returns:
        A dict with a single ``"required_headings"`` key holding a list of
        heading titles (empty for other document types).
    """
    if doc_type != "Advisory Circular":
        # No heading requirements are defined for other document types.
        return {"required_headings": []}

    headings = ["Purpose", "Applicability", "Related Reading Material",
                "Background", "Discussion"]
    if template_type != "Short AC template AC":
        # The long template adds two sections to the short list.
        headings.insert(2, "Audience")
        headings.append("Conclusion")
    return {"required_headings": headings}
def format_results_for_gradio(**kwargs):
    """Render all check results as a Markdown report.

    Each check becomes a ``##`` section holding either a single pass line or
    a failure header followed by one bullet per finding.

    Keyword Args:
        heading_valid, acronyms_valid, legal_valid, table_valid,
        figure_valid, references_valid, title_style_valid,
        double_period_valid, spacing_valid: Pass/fail flag per check
            (required).
        abbreviation_issues, date_issues, placeholder_issues: Finding lists;
            an empty list means the check passed (required).
        required_headings, headings_found, undefined_acronyms,
        incorrect_legal_references, incorrect_captions,
        incorrect_fig_captions, incorrect_table_figure_references,
        incorrect_titles, incorrect_sentences, incorrect_spacing: Details
            rendered only when the matching check failed.
        doc_type: Selects the formatting note appended to a failed title
            check.

    Returns:
        The report as a single newline-joined string.
    """
    # Escapes keep the status markers intact regardless of file encoding;
    # both previously appeared as the same mis-decoded character.
    passed_mark = "\u2705"  # check mark
    failed_mark = "\u274c"  # cross mark
    results = ["# Document Check Results\n"]

    def add_section(title, passed, ok_text, fail_text, details, trailing_blank=True):
        """Append one report section to ``results``."""
        results.append(f"## {title}")
        if passed:
            results.append(f"{passed_mark} {ok_text}\n")
            return
        results.append(f"{failed_mark} {fail_text}")
        results.extend(details)
        if trailing_blank:
            results.append("")

    # Missing headings keep the order of the requirement list so the report
    # is stable between runs.
    headings_found = set(kwargs.get('headings_found', ()))
    missing_headings = dict.fromkeys(
        heading for heading in kwargs.get('required_headings', ())
        if heading not in headings_found
    )
    add_section(
        "Required Headings Check", kwargs['heading_valid'],
        "All required headings are present.",
        "Missing Required Headings:",
        [f"- {heading}" for heading in missing_headings],
    )
    add_section(
        "Acronym Check", kwargs['acronyms_valid'],
        "All acronyms are properly defined.",
        "The following acronyms need to be defined at first use:",
        [f"- {acronym}" for acronym in kwargs.get('undefined_acronyms', ())],
    )
    add_section(
        "Legal Terminology Check", kwargs['legal_valid'],
        "All legal references are properly formatted.",
        "Incorrect Legal Terminology:",
        [f"- Use '{correct_term}' instead of '{incorrect_term}'"
         for incorrect_term, correct_term in kwargs.get('incorrect_legal_references', ())],
    )
    add_section(
        "Table Caption Check", kwargs['table_valid'],
        "All table captions are correctly formatted.",
        "Incorrect Table Captions:",
        [f"- {caption}" for caption in kwargs.get('incorrect_captions', ())],
    )
    add_section(
        "Figure Caption Check", kwargs['figure_valid'],
        "All figure captions are correctly formatted.",
        "Incorrect Figure Captions:",
        [f"- {caption}" for caption in kwargs.get('incorrect_fig_captions', ())],
    )
    add_section(
        "Table and Figure References Check", kwargs['references_valid'],
        "All table and figure references are correctly formatted.",
        "Incorrect Table/Figure References:",
        [f"- {ref}" for ref in kwargs.get('incorrect_table_figure_references', ())],
    )

    # Title findings carry a nested "Issue" bullet plus a per-type note.
    formatting_notes = {
        "Advisory Circular": "Document titles should be italicized, not in quotation marks.",
        "Order": "Document titles should be in quotation marks, not italicized.",
        "Federal Register Notice": "Document titles should be in quotation marks, not italicized.",
        "Policy Statement": "Document titles should not have any special formatting (no italics, no quotation marks).",
    }
    title_details = []
    for title in kwargs.get('incorrect_titles', ()):
        title_details.append(f"- {title['text']}")
        # Two-space indent so Markdown nests this under the title bullet.
        title_details.append(f"  - Issue: {title['issue']}")
    doc_type = kwargs.get('doc_type', 'Unknown')
    if doc_type in formatting_notes:
        title_details.append(f"\nNote: {formatting_notes[doc_type]}")
    else:
        title_details.append("\nNote: Please verify the correct formatting style for this document type.")
    add_section(
        "Document Title Style Check", kwargs['title_style_valid'],
        "All document title references are properly styled.",
        "Incorrect Document Title Styling:",
        title_details,
    )

    add_section(
        "Double Period Check", kwargs['double_period_valid'],
        "No double periods found.",
        "Sentences found with double periods:",
        [f"- {sentence}" for sentence in kwargs.get('incorrect_sentences', ())],
    )
    add_section(
        "Spacing Check", kwargs['spacing_valid'],
        "All spacing is correct.",
        "Incorrect spacing found in:",
        [f"- {spacing}" for spacing in kwargs.get('incorrect_spacing', ())],
    )
    add_section(
        "Abbreviation Consistency", not kwargs['abbreviation_issues'],
        "All abbreviations are used consistently after definition.",
        "Abbreviation Issues:",
        [f"- Use '{acronym}' instead of '{full_term}' in: {paragraph}"
         for full_term, acronym, paragraph in kwargs['abbreviation_issues']],
    )
    add_section(
        "Date Format Consistency", not kwargs['date_issues'],
        "All dates are in the correct format.",
        "Date Format Issues:",
        [f"- Incorrect date format '{date}' in: {paragraph}"
         for date, paragraph in kwargs['date_issues']],
    )
    # Last section: no blank line is appended after its findings.
    add_section(
        "Placeholder Check", not kwargs['placeholder_issues'],
        "No placeholders found.",
        "Placeholders Found:",
        [f"- Placeholder '{phrase}' in: {paragraph}"
         for phrase, paragraph in kwargs['placeholder_issues']],
        trailing_blank=False,
    )
    return "\n".join(results)
def process_file(file_obj, doc_type, template_type):
    """Run the document checks on an uploaded file and return display text.

    Args:
        file_obj: Uploaded content, either raw ``bytes`` or an object with a
            ``read()`` method; ``None`` when nothing was uploaded.
        doc_type: Document type name selected by the user.
        template_type: Template name selected by the user.

    Returns:
        The text produced by ``process_document``; a prompt to upload a file
        when ``file_obj`` is ``None``; or a troubleshooting message when
        processing raises. The failure and its traceback are also printed.
    """
    if file_obj is None:
        return "Please upload a document first."

    try:
        # Normalise either input form into an in-memory binary stream.
        raw = file_obj if isinstance(file_obj, bytes) else file_obj.read()
        return process_document(io.BytesIO(raw), doc_type, template_type)
    except Exception as e:
        error_trace = traceback.format_exc()
        print(f"Error processing file: {str(e)}")
        print(f"Full traceback: {error_trace}")
        message_lines = [
            "An error occurred while processing the document:",
            f"Error: {str(e)}",
            "Please ensure:",
            "1. The file is a valid Word document (.docx)",
            "2. The file is not corrupted",
            "3. The file is not password protected",
            f"Technical details: {str(e)}",
        ]
        return "\n".join(message_lines)
def process_document(file_obj, doc_type, template_type):
    """Open a Word document, run the checks, and return the formatted report.

    Only the heading and acronym checks are executed here; every other
    result passed to the formatter is a fixed "passing" placeholder.

    Args:
        file_obj: Object accepted by ``Document(...)``.
        doc_type: Document type name, used to pick the required headings.
        template_type: Template name, used to pick the required headings.

    Returns:
        The string returned by ``format_results_for_gradio``.

    Raises:
        Exception: Any error raised while reading or checking the document
            is printed and then re-raised unchanged.
    """
    try:
        doc = Document(file_obj)
        print("Document read successfully.")

        checks = get_document_checks(doc_type, template_type)
        required_headings = checks.get("required_headings", [])

        heading_valid, headings_found = heading_title_check(doc, required_headings)
        acronyms_valid, undefined_acronyms = acronym_check(doc)

        report_fields = {
            # Results of the checks that actually ran.
            "heading_valid": heading_valid,
            "headings_found": headings_found,
            "acronyms_valid": acronyms_valid,
            "undefined_acronyms": undefined_acronyms,
            "required_headings": required_headings,
            "doc_type": doc_type,
            # Placeholders: these checks are not executed yet.
            "legal_valid": True,
            "incorrect_legal_references": [],
            "table_valid": True,
            "incorrect_captions": [],
            "figure_valid": True,
            "incorrect_fig_captions": [],
            "references_valid": True,
            "incorrect_table_figure_references": [],
            "title_style_valid": True,
            "incorrect_titles": [],
            "double_period_valid": True,
            "incorrect_sentences": [],
            "spacing_valid": True,
            "incorrect_spacing": [],
            "abbreviation_issues": [],
            "date_issues": [],
            "placeholder_issues": [],
        }
        return format_results_for_gradio(**report_fields)
    except Exception as e:
        print(f"Error in process_document: {str(e)}")
        raise
def get_document_checks(doc_type, template_type):
    """Return the heading requirements for a document type and template.

    NOTE: this redefines the function of the same name declared earlier in
    this file; being later, this definition is the one in effect.

    Args:
        doc_type: Document type name; only ``"Advisory Circular"`` has
            required headings.
        template_type: ``"Short AC template AC"`` selects the short heading
            list; any other value selects the long one.

    Returns:
        A dict with one key, ``"required_headings"``, mapped to a list of
        heading titles (empty for other document types).
    """
    required = []
    if doc_type == "Advisory Circular":
        uses_short_template = template_type == "Short AC template AC"
        required = (
            ["Purpose", "Applicability", "Related Reading Material",
             "Background", "Discussion"]
            if uses_short_template
            else ["Purpose", "Applicability", "Audience", "Related Reading Material",
                  "Background", "Discussion", "Conclusion"]
        )
    return {"required_headings": required}
def format_results_for_gradio(**kwargs):
    """Render the heading and acronym check results as Markdown.

    NOTE: this redefines the function of the same name declared earlier in
    this file; being later, this definition is the one in effect. Only the
    two sections below are rendered; other keyword arguments are ignored.

    Keyword Args:
        heading_valid: Whether all required headings were found (required).
        required_headings, headings_found: Used to list missing headings
            when ``heading_valid`` is false.
        acronyms_valid: Whether all acronyms are defined (required).
        undefined_acronyms: Acronyms listed when ``acronyms_valid`` is false.

    Returns:
        The report as a single newline-joined string.
    """
    # Escapes keep the status markers intact regardless of file encoding;
    # both previously appeared as the same mis-decoded character.
    passed_mark = "\u2705"  # check mark
    failed_mark = "\u274c"  # cross mark

    results = ["# Document Check Results\n"]

    results.append("## Required Headings Check")
    if kwargs['heading_valid']:
        results.append(f"{passed_mark} All required headings are present.\n")
    else:
        # Keep the requirement order so the report is stable between runs.
        headings_found = set(kwargs.get('headings_found', ()))
        missing_headings = dict.fromkeys(
            heading for heading in kwargs.get('required_headings', ())
            if heading not in headings_found
        )
        results.append(f"{failed_mark} Missing Required Headings:")
        results.extend(f"- {heading}" for heading in missing_headings)
        results.append("")

    results.append("## Acronym Check")
    if kwargs['acronyms_valid']:
        results.append(f"{passed_mark} All acronyms are properly defined.\n")
    else:
        results.append(f"{failed_mark} The following acronyms need to be defined at first use:")
        results.extend(f"- {acronym}" for acronym in kwargs.get('undefined_acronyms', ()))

    return "\n".join(results)
def process_file(file_obj, doc_type, template_type):
    """Check an uploaded document and return text for display.

    NOTE: this redefines the function of the same name declared earlier in
    this file; being later, this definition is the one in effect.

    Args:
        file_obj: Uploaded content, either raw ``bytes`` or an object with a
            ``read()`` method; ``None`` when nothing was uploaded.
        doc_type: Document type name selected by the user.
        template_type: Template name selected by the user.

    Returns:
        The text produced by ``process_document``; a prompt to upload a file
        when ``file_obj`` is ``None``; or a troubleshooting message when
        processing raises (the error is also printed).
    """
    if file_obj is None:
        return "Please upload a document first."

    try:
        if isinstance(file_obj, bytes):
            payload = file_obj
        else:
            payload = file_obj.read()
        # The checker expects a file-like object, so wrap the raw bytes.
        return process_document(io.BytesIO(payload), doc_type, template_type)
    except Exception as e:
        error_message = (
            "An error occurred while processing the document:\n"
            f"Error: {str(e)}\n"
            "Please ensure:\n"
            "1. The file is a valid Word document (.docx)\n"
            "2. The file is not corrupted\n"
            "3. The file is not password protected\n"
            f"Technical details: {str(e)}"
        )
        print(f"Error processing file: {str(e)}")
        return error_message
# Choices offered by the document-type dropdown and the template radio group.
document_types = [
    "Advisory Circular", "Airworthiness Criteria", "Deviation Memo", "Exemption",
    "Federal Register Notice", "Handbook/Manual", "Order", "Policy Statement",
    "Rule", "Special Condition", "Technical Standard Order", "Other"
]
template_types = ["Short AC template AC", "Long AC template AC"]

# Build the Gradio interface: inputs in the left column, report on the right.
with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
    gr.Markdown("# FAA Document Checker")
    gr.Markdown("Upload a Word document to check for compliance with FAA documentation standards.")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload Word Document (.docx)",
                file_types=[".docx"],
                type="binary",
            )
            doc_type = gr.Dropdown(
                choices=document_types,
                label="Document Type",
                value="Advisory Circular",
            )
            template_type = gr.Radio(
                choices=template_types,
                label="Template Type (Only for Advisory Circular)",
                visible=True,
                value="Short AC template AC",
            )
            submit_btn = gr.Button("Check Document", variant="primary")

        with gr.Column(scale=2):
            output = gr.Markdown(
                label="Check Results",
                value="Results will appear here after processing...",
            )

    def update_template_visibility(doc_type):
        """Show the template selector only for Advisory Circular documents."""
        return gr.update(visible=doc_type == "Advisory Circular")

    # Re-evaluate the template selector whenever the document type changes.
    doc_type.change(
        fn=update_template_visibility,
        inputs=[doc_type],
        outputs=[template_type],
    )

    # Run the checks when the button is clicked and show the report.
    submit_btn.click(
        fn=process_file,
        inputs=[file_input, doc_type, template_type],
        outputs=[output],
    )

# Start the web application.
demo.launch()