# app.py -- uploaded by Hoctar77
# Commit 4b2d07d (verified): "Update app.py"
# File size: 24.6 kB
import gradio as gr
import logging
import re
from docx import Document
import io
import traceback
def heading_title_check(doc, required_headings):
    """Report whether every required heading occurs in the document.

    A paragraph counts as a heading when its style name starts with
    ``'Heading'``; its text is compared after stripping surrounding whitespace.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph needs ``text`` and
            ``style.name``.
        required_headings: Iterable of heading strings that must be present.

    Returns:
        ``(all_present, headings_found)``. If reading the document raises, the
        error is printed and ``(False, [])`` is returned.
    """
    try:
        headings_found = [
            para.text.strip()
            for para in doc.paragraphs
            if para.style.name.startswith('Heading')
        ]
    except Exception as e:
        print(f"Error in heading check: {str(e)}")
        return False, []

    # Exact, case-sensitive comparison against the collected heading texts.
    absent = [wanted for wanted in required_headings if wanted not in headings_found]
    return not absent, headings_found
def acronym_check(doc):
    """Find acronyms that are never introduced as ``full term (ACRONYM)``.

    An acronym is a standalone run of 2-5 capital letters. It counts as
    defined when ``(ACRONYM)`` appears anywhere in the document with at least
    one character before the opening parenthesis on the same line. The
    definition does not have to precede the first use.

    Args:
        doc: Object exposing ``paragraphs``, each with a ``text`` attribute.

    Returns:
        ``(all_defined, undefined_acronyms)``. The list holds each undefined
        acronym once, in order of first appearance. If reading the document
        raises, the error is printed and ``(False, [])`` is returned.
    """
    acronym_re = re.compile(r'\b[A-Z]{2,5}\b')
    # Look-behind keeps the original ".+\(ABC\)" requirement that something
    # precedes the parenthesis, while letting one pass collect every definition.
    definition_re = re.compile(r'(?<=.)\(([A-Z]{2,5})\)')

    try:
        texts = [paragraph.text for paragraph in doc.paragraphs]

        defined_acronyms = set()
        for text in texts:
            defined_acronyms.update(definition_re.findall(text))

        # dict keys give de-duplication with a stable, first-seen order.
        undefined_acronyms = {}
        for text in texts:
            for acronym in acronym_re.findall(text):
                if acronym not in defined_acronyms:
                    undefined_acronyms.setdefault(acronym, None)
    except Exception as e:
        print(f"Error in acronym check: {str(e)}")
        return False, []

    return not undefined_acronyms, list(undefined_acronyms)
def legal_check(doc):
    """Flag dotted legal abbreviations used without their spelled-out name.

    For each paragraph, a dotted abbreviation ("C.F.R.", "F.R.", "U.S.C.") is
    reported when the paragraph contains it but not the corresponding full
    name.

    Args:
        doc: Object exposing ``paragraphs``, each with a ``text`` attribute.

    Returns:
        ``(is_valid, incorrect_legal_references)`` where the list holds one
        ``(abbreviation, full_name)`` tuple per offending paragraph and term.
        If reading the document raises, the error is printed and
        ``(False, [])`` is returned.
    """
    legal_terms = {
        "C.F.R.": "Code of Federal Regulations",
        "F.R.": "Federal Register",
        "U.S.C.": "United States Code",
    }
    # The look-behind stops "F.R." from matching inside "C.F.R.".
    term_patterns = {
        term: re.compile(r'(?<![A-Za-z.])' + re.escape(term))
        for term in legal_terms
    }

    incorrect_legal_references = []
    try:
        for paragraph in doc.paragraphs:
            text = paragraph.text
            for incorrect_term, correct_term in legal_terms.items():
                if term_patterns[incorrect_term].search(text) and correct_term not in text:
                    incorrect_legal_references.append((incorrect_term, correct_term))
    except Exception as e:
        print(f"Error in legal check: {str(e)}")
        return False, []

    return not incorrect_legal_references, incorrect_legal_references
def _preceding_paragraph_text(element):
    """Return the text of the paragraph element just before *element*, or None."""
    previous = element.getprevious()
    # Only a paragraph element (tag ending in "}p") can be a caption; a
    # preceding table or the start of the body yields None.
    if previous is None or not str(previous.tag).endswith("}p"):
        return None
    # Gather text from all descendants: the element's own ``.text`` can be
    # None because a paragraph's text lives in nested run elements.
    return "".join(previous.itertext())


def table_caption_check(doc, doc_type):
    """Check the caption paragraph that immediately precedes each table.

    Only paragraphs starting with "Table" are treated as captions. For
    "Advisory Circular" documents a caption must start with "Table " and
    contain ". " (the "Table X. Caption text" form). Other document types
    are not checked.

    Args:
        doc: Object exposing ``tables``; each table needs an ``_element``
            supporting ``getprevious()``.
        doc_type: Document type name selected by the user.

    Returns:
        ``(is_valid, incorrect_captions)``. If inspecting the document raises,
        the error is printed and ``(False, [])`` is returned.
    """
    incorrect_captions = []
    try:
        for table in doc.tables:
            caption = _preceding_paragraph_text(table._element)
            if caption is None or not caption.startswith("Table"):
                continue
            if doc_type == "Advisory Circular":
                if not caption.startswith("Table ") or ". " not in caption:
                    incorrect_captions.append(caption)
    except Exception as e:
        print(f"Error in table caption check: {str(e)}")
        return False, []

    return not incorrect_captions, incorrect_captions
def figure_caption_check(doc, doc_type):
    """Check paragraphs that look like figure captions.

    A paragraph whose text starts with "Figure" is treated as a caption. For
    "Advisory Circular" documents it is reported when it lacks ". " (the
    "Figure X. Caption text" form). Other document types are not checked.

    Args:
        doc: Object exposing ``paragraphs``, each with a ``text`` attribute.
        doc_type: Document type name selected by the user.

    Returns:
        ``(is_valid, incorrect_fig_captions)``. If reading the document
        raises, the error is printed and ``(False, [])`` is returned.
    """
    applies_ac_rule = doc_type == "Advisory Circular"
    try:
        incorrect_fig_captions = [
            para.text
            for para in doc.paragraphs
            if para.text.startswith("Figure")
            and applies_ac_rule
            and ". " not in para.text
        ]
    except Exception as e:
        print(f"Error in figure caption check: {str(e)}")
        return False, []

    return not incorrect_fig_captions, incorrect_fig_captions
def table_figure_reference_check(doc, doc_type):
    """Check paragraphs that mention "Table" or "Figure".

    For "Advisory Circular" documents, a paragraph containing either word is
    reported unless its text starts with "Table " or "Figure ". Other document
    types are not checked.

    NOTE(review): as written, a paragraph that references a table or figure
    mid-sentence (for example "See Figure 2") is reported -- confirm that this
    is the intended rule.

    Args:
        doc: Object exposing ``paragraphs``, each with a ``text`` attribute.
        doc_type: Document type name selected by the user.

    Returns:
        ``(is_valid, incorrect_table_figure_references)``. If reading the
        document raises, the error is printed and ``(False, [])`` is returned.
    """
    flagged = []
    try:
        for para in doc.paragraphs:
            body = para.text
            mentions_item = "Table" in body or "Figure" in body
            if not mentions_item or doc_type != "Advisory Circular":
                continue
            if not body.startswith(("Table ", "Figure ")):
                flagged.append(body)
    except Exception as e:
        print(f"Error in table/figure reference check: {str(e)}")
        return False, []

    return not flagged, flagged
def document_title_check(doc, doc_type):
    """Check the formatting of the document title for the given document type.

    The check only runs when the first paragraph uses the ``'Title'`` style.
    Rules applied:

    * Advisory Circular: must start with "ADVISORY CIRCULAR " and end with " AC".
    * Order / Federal Register Notice: must be enclosed in double quotes.
    * Policy Statement: must not start or end with a double quote.

    Args:
        doc: Object exposing ``paragraphs``; the first paragraph needs ``text``
            and ``style.name``.
        doc_type: Document type name selected by the user.

    Returns:
        ``(is_valid, incorrect_titles)`` where each entry is a dict with
        ``"text"`` and ``"issue"`` keys. If reading the document raises, the
        error is printed and ``(False, [])`` is returned.
    """
    incorrect_titles = []
    try:
        paragraphs = doc.paragraphs
        if len(paragraphs) > 0 and paragraphs[0].style.name == 'Title':
            title_text = paragraphs[0].text
            quoted_start = title_text.startswith('"')
            quoted_end = title_text.endswith('"')
            issue = None

            if doc_type == "Advisory Circular":
                # Both conditions are required; the original tested
                # `or endswith(" AC")`, which flagged correct titles.
                if not (title_text.startswith("ADVISORY CIRCULAR ")
                        and title_text.endswith(" AC")):
                    issue = ("Advisory Circular titles should start with "
                             "'ADVISORY CIRCULAR ' and end with ' AC'")
            elif doc_type in ("Order", "Federal Register Notice"):
                if not (quoted_start and quoted_end):
                    issue = f"{doc_type} titles should be enclosed in quotation marks"
            elif doc_type == "Policy Statement":
                if quoted_start or quoted_end:
                    issue = "Policy Statement titles should not have quotation marks"

            if issue is not None:
                incorrect_titles.append({"text": title_text, "issue": issue})
    except Exception as e:
        print(f"Error in document title check: {str(e)}")
        return False, []

    return not incorrect_titles, incorrect_titles
def double_period_check(doc):
    """Collect paragraphs whose text contains two consecutive periods.

    Args:
        doc: Object exposing ``paragraphs``, each with a ``text`` attribute.

    Returns:
        ``(is_valid, incorrect_sentences)``; the list holds the full text of
        each offending paragraph. If reading the document raises, the error is
        printed and ``(False, [])`` is returned.
    """
    try:
        # Any ".." substring counts, so an ellipsis ("...") is reported too.
        offenders = [para.text for para in doc.paragraphs if ".." in para.text]
    except Exception as e:
        print(f"Error in double period check: {str(e)}")
        return False, []

    return not offenders, offenders
def spacing_check(doc):
    """Collect paragraphs that contain two or more consecutive spaces.

    Args:
        doc: Object exposing ``paragraphs``, each with a ``text`` attribute.

    Returns:
        ``(is_valid, incorrect_spacing)``; the list holds the full text of
        each offending paragraph. If reading the document raises, the error is
        printed and ``(False, [])`` is returned.
    """
    # Two spaces, not one: a single-space test would flag nearly every
    # paragraph of ordinary prose.
    double_space = "  "

    incorrect_spacing = []
    try:
        for paragraph in doc.paragraphs:
            text = paragraph.text
            if double_space in text:
                incorrect_spacing.append(text)
    except Exception as e:
        print(f"Error in spacing check: {str(e)}")
        return False, []

    return not incorrect_spacing, incorrect_spacing
# Short connecting words that usually contribute no letter to an abbreviation
# (e.g. "of" in "Code of Federal Regulations (CFR)").
_ABBREVIATION_FILLER_WORDS = frozenset(
    {"a", "an", "and", "for", "in", "of", "on", "the", "to"}
)


def _spelled_out_term(preceding_text, abbr):
    """Return the trailing words of *preceding_text* whose initials spell *abbr*, or None."""
    words = preceding_text.split()
    longest = min(len(words), len(abbr) + 3)
    for size in range(len(abbr), longest + 1):
        candidate = words[-size:]
        if candidate[0].lower() in _ABBREVIATION_FILLER_WORDS:
            continue
        initials = "".join(
            word[0].upper()
            for word in candidate
            if word.lower() not in _ABBREVIATION_FILLER_WORDS
        )
        if initials == abbr:
            return candidate
    return None


def check_abbreviation_usage(doc):
    """Find places where a spelled-out term is reused after being abbreviated.

    A definition is recognised as ``Full Term (ABBR)`` where ABBR is 2-5
    capital letters and the initials of the words just before the parenthesis
    (ignoring short filler words such as "of") spell ABBR. Definitions whose
    full term cannot be identified this way are skipped. After the definition,
    any paragraph that repeats the full term (case-insensitively, and not
    followed again by ``(ABBR)``) is reported.

    Args:
        doc: Object exposing ``paragraphs``, each with a ``text`` attribute.

    Returns:
        List of ``(full_term, abbreviation, paragraph_text)`` tuples, one per
        offending paragraph. If reading the document raises, the error is
        printed and ``[]`` is returned.
    """
    abbreviation_issues = []
    definition_re = re.compile(r'\(([A-Z]{2,5})\)')
    try:
        texts = [paragraph.text for paragraph in doc.paragraphs]
        handled = set()
        for index, text in enumerate(texts):
            for match in definition_re.finditer(text):
                abbr = match.group(1)
                if abbr in handled:
                    continue
                term_words = _spelled_out_term(text[:match.start()], abbr)
                if term_words is None:
                    continue
                # Only the first recognised definition of each abbreviation counts.
                handled.add(abbr)

                full_term = " ".join(term_words)
                term_re = re.compile(
                    r'\b'
                    + r'\s+'.join(re.escape(word) for word in term_words)
                    + r'\b(?!\s*\(' + abbr + r'\))',
                    re.IGNORECASE,
                )
                # Rest of the defining paragraph first, then later paragraphs.
                if term_re.search(text, match.end()):
                    abbreviation_issues.append((full_term, abbr, text))
                for later_text in texts[index + 1:]:
                    if term_re.search(later_text):
                        abbreviation_issues.append((full_term, abbr, later_text))
    except Exception as e:
        print(f"Error in abbreviation check: {str(e)}")
        return []

    return abbreviation_issues
def check_date_formats(doc):
    """Find numeric slash-separated dates (``M/D/YYYY`` or ``MM/DD/YYYY``).

    Args:
        doc: Object exposing ``paragraphs``, each with a ``text`` attribute.

    Returns:
        List of ``(date, paragraph_text)`` tuples, one per matched date, in
        document order. If reading the document raises, the error is printed
        and ``[]`` is returned.
    """
    numeric_date_re = re.compile(r'\b\d{1,2}/\d{1,2}/\d{4}\b')

    date_issues = []
    try:
        for paragraph in doc.paragraphs:
            text = paragraph.text
            # Report the matched date itself so the message can quote it.
            date_issues.extend((found, text) for found in numeric_date_re.findall(text))
    except Exception as e:
        print(f"Error in date format check: {str(e)}")
        return []

    return date_issues
def check_placeholders(doc):
    """Find template placeholders that were left in the document.

    The placeholders searched for are ``[ENTER TEXT]`` and ``[ENTER DATE]``.

    Args:
        doc: Object exposing ``paragraphs``, each with a ``text`` attribute.

    Returns:
        List of ``(placeholder, paragraph_text)`` tuples, one per placeholder
        present in a paragraph. If reading the document raises, the error is
        printed and ``[]`` is returned.
    """
    placeholders = ('[ENTER TEXT]', '[ENTER DATE]')

    placeholder_issues = []
    try:
        for paragraph in doc.paragraphs:
            text = paragraph.text
            placeholder_issues.extend(
                (placeholder, text) for placeholder in placeholders if placeholder in text
            )
    except Exception as e:
        print(f"Error in placeholder check: {str(e)}")
        return []

    return placeholder_issues
def get_document_checks(doc_type, template_type):
    """Return the check configuration for a document type.

    Only "Advisory Circular" has required headings. The "Short AC template AC"
    template uses the short list, and any other template value uses the long
    list. Every other document type gets an empty list.

    Returns:
        Dict with a single ``"required_headings"`` key holding a new list.
    """
    # Guard clause: non-AC document types have no heading requirements yet.
    if doc_type != "Advisory Circular":
        return {"required_headings": []}

    if template_type == "Short AC template AC":
        headings = ["Purpose", "Applicability", "Related Reading Material",
                    "Background", "Discussion"]
    else:
        # Any other template value is treated as the long AC template.
        headings = ["Purpose", "Applicability", "Audience", "Related Reading Material",
                    "Background", "Discussion", "Conclusion"]
    return {"required_headings": headings}
def _append_check_section(lines, title, passed, ok_message, fail_header, bullets,
                          trailing_blank=True):
    """Append one report section: its heading, then a pass line or a bulleted failure list."""
    lines.append(title)
    if passed:
        lines.append(ok_message)
        return
    lines.append(fail_header)
    # `bullets` may be a lazy generator; it is only consumed on failure.
    lines.extend(bullets)
    if trailing_blank:
        lines.append("")


def format_results_for_gradio(**kwargs):
    """Render all check results as a Markdown report.

    NOTE(review): a later definition of ``format_results_for_gradio`` in this
    file rebinds the name, so this version is not the one in effect at call
    time unless that later definition is removed.

    Keyword Args:
        heading_valid, required_headings, headings_found: required-heading check.
        acronyms_valid, undefined_acronyms: acronym check.
        legal_valid, incorrect_legal_references: ``(incorrect, correct)`` tuples.
        table_valid, incorrect_captions: table caption check.
        figure_valid, incorrect_fig_captions: figure caption check.
        references_valid, incorrect_table_figure_references: reference check.
        title_style_valid, incorrect_titles: dicts with ``text`` and ``issue``.
        doc_type: selects the title formatting note (optional).
        double_period_valid, incorrect_sentences: double-period check.
        spacing_valid, incorrect_spacing: spacing check.
        abbreviation_issues: ``(full_term, acronym, paragraph)`` tuples.
        date_issues: ``(date, paragraph)`` tuples.
        placeholder_issues: ``(phrase, paragraph)`` tuples.

    Returns:
        The report as a single newline-joined Markdown string.

    Raises:
        KeyError: if a ``*_valid`` flag or an issue list used as a pass/fail
            indicator is missing.
    """
    lines = ["# Document Check Results\n"]

    heading_ok = kwargs['heading_valid']
    missing_headings = []
    if not heading_ok:
        found = set(kwargs['headings_found'])
        # Keep the order of `required_headings` so the report is deterministic
        # (a plain set difference has no defined order).
        missing_headings = [
            heading
            for heading in dict.fromkeys(kwargs['required_headings'])
            if heading not in found
        ]
    _append_check_section(
        lines, "## Required Headings Check", heading_ok,
        "✅ All required headings are present.\n",
        "❌ Missing Required Headings:",
        (f"- {heading}" for heading in missing_headings),
    )

    _append_check_section(
        lines, "## Acronym Check", kwargs['acronyms_valid'],
        "✅ All acronyms are properly defined.\n",
        "❌ The following acronyms need to be defined at first use:",
        (f"- {acronym}" for acronym in kwargs.get('undefined_acronyms', [])),
    )

    _append_check_section(
        lines, "## Legal Terminology Check", kwargs['legal_valid'],
        "✅ All legal references are properly formatted.\n",
        "❌ Incorrect Legal Terminology:",
        (f"- Use '{correct_term}' instead of '{incorrect_term}'"
         for incorrect_term, correct_term in kwargs.get('incorrect_legal_references', [])),
    )

    _append_check_section(
        lines, "## Table Caption Check", kwargs['table_valid'],
        "✅ All table captions are correctly formatted.\n",
        "❌ Incorrect Table Captions:",
        (f"- {caption}" for caption in kwargs.get('incorrect_captions', [])),
    )

    _append_check_section(
        lines, "## Figure Caption Check", kwargs['figure_valid'],
        "✅ All figure captions are correctly formatted.\n",
        "❌ Incorrect Figure Captions:",
        (f"- {caption}" for caption in kwargs.get('incorrect_fig_captions', [])),
    )

    _append_check_section(
        lines, "## Table and Figure References Check", kwargs['references_valid'],
        "✅ All table and figure references are correctly formatted.\n",
        "❌ Incorrect Table/Figure References:",
        (f"- {ref}" for ref in kwargs.get('incorrect_table_figure_references', [])),
    )

    # The title section also carries a per-document-type formatting note, so
    # it is rendered explicitly rather than through the generic helper.
    lines.append("## Document Title Style Check")
    if kwargs['title_style_valid']:
        lines.append("✅ All document title references are properly styled.\n")
    else:
        lines.append("❌ Incorrect Document Title Styling:")
        for title in kwargs.get('incorrect_titles', []):
            lines.append(f"- {title['text']}")
            # Two-space indent so Markdown renders this as a nested bullet.
            lines.append(f"  - Issue: {title['issue']}")
        formatting_notes = {
            "Advisory Circular": "Document titles should be italicized, not in quotation marks.",
            "Order": "Document titles should be in quotation marks, not italicized.",
            "Federal Register Notice": "Document titles should be in quotation marks, not italicized.",
            "Policy Statement": "Document titles should not have any special formatting (no italics, no quotation marks).",
        }
        note = formatting_notes.get(
            kwargs.get('doc_type', 'Unknown'),
            "Please verify the correct formatting style for this document type.",
        )
        lines.append(f"\nNote: {note}")
        lines.append("")

    _append_check_section(
        lines, "## Double Period Check", kwargs['double_period_valid'],
        "✅ No double periods found.\n",
        "❌ Sentences found with double periods:",
        (f"- {sentence}" for sentence in kwargs.get('incorrect_sentences', [])),
    )

    _append_check_section(
        lines, "## Spacing Check", kwargs['spacing_valid'],
        "✅ All spacing is correct.\n",
        "❌ Incorrect spacing found in:",
        (f"- {spacing}" for spacing in kwargs.get('incorrect_spacing', [])),
    )

    abbreviation_issues = kwargs['abbreviation_issues']
    _append_check_section(
        lines, "## Abbreviation Consistency", not abbreviation_issues,
        "✅ All abbreviations are used consistently after definition.\n",
        "❌ Abbreviation Issues:",
        (f"- Use '{acronym}' instead of '{full_term}' in: {paragraph}"
         for full_term, acronym, paragraph in abbreviation_issues),
    )

    date_issues = kwargs['date_issues']
    _append_check_section(
        lines, "## Date Format Consistency", not date_issues,
        "✅ All dates are in the correct format.\n",
        "❌ Date Format Issues:",
        (f"- Incorrect date format '{date}' in: {paragraph}"
         for date, paragraph in date_issues),
    )

    placeholder_issues = kwargs['placeholder_issues']
    _append_check_section(
        lines, "## Placeholder Check", not placeholder_issues,
        "✅ No placeholders found.\n",
        "❌ Placeholders Found:",
        (f"- Placeholder '{phrase}' in: {paragraph}"
         for phrase, paragraph in placeholder_issues),
        trailing_blank=False,  # last section: no separator line after it
    )

    return "\n".join(lines)
def process_file(file_obj, doc_type, template_type):
    """Run the document checks on an uploaded file and return display text.

    Args:
        file_obj: Uploaded content, either raw ``bytes`` or an object with a
            ``read()`` method; ``None`` when nothing was uploaded.
        doc_type: Document type name selected by the user.
        template_type: Template name selected by the user.

    Returns:
        The report produced by ``process_document``, a prompt to upload a file
        when ``file_obj`` is ``None``, or a user-facing error message if
        processing raises. Errors and their traceback are printed.
    """
    if file_obj is None:
        return "Please upload a document first."

    try:
        # Document needs a file-like object, so wrap the raw bytes.
        raw = file_obj if isinstance(file_obj, bytes) else file_obj.read()
        return process_document(io.BytesIO(raw), doc_type, template_type)
    except Exception as e:
        error_trace = traceback.format_exc()
        detail = str(e)
        print(f"Error processing file: {detail}")
        print(f"Full traceback: {error_trace}")
        return "\n".join([
            "An error occurred while processing the document:",
            f"Error: {detail}",
            "Please ensure:",
            "1. The file is a valid Word document (.docx)",
            "2. The file is not corrupted",
            "3. The file is not password protected",
            f"Technical details: {detail}",
        ])
def process_document(file_obj, doc_type, template_type):
    """Open a Word document, run the checks and return the formatted report.

    Only the heading and acronym checks are executed here. Every other result
    passed to the formatter is a hard-coded "valid" placeholder.

    Args:
        file_obj: File-like object (or path) accepted by ``Document``.
        doc_type: Document type name selected by the user.
        template_type: Template name selected by the user.

    Returns:
        The Markdown report string from ``format_results_for_gradio``.

    Raises:
        Exception: any error from loading or checking the document is printed
            and then re-raised unchanged.
    """
    try:
        doc = Document(file_obj)
        print("Document read successfully.")

        # Required headings depend on the document and template type.
        checks = get_document_checks(doc_type, template_type)
        required_headings = checks.get("required_headings", [])

        heading_valid, headings_found = heading_title_check(doc, required_headings)
        acronyms_valid, undefined_acronyms = acronym_check(doc)

        report_fields = {
            # Checks that actually ran.
            "heading_valid": heading_valid,
            "headings_found": headings_found,
            "acronyms_valid": acronyms_valid,
            "undefined_acronyms": undefined_acronyms,
            # Placeholders: these checks are not executed yet.
            "legal_valid": True,
            "incorrect_legal_references": [],
            "table_valid": True,
            "incorrect_captions": [],
            "figure_valid": True,
            "incorrect_fig_captions": [],
            "references_valid": True,
            "incorrect_table_figure_references": [],
            "title_style_valid": True,
            "incorrect_titles": [],
            "required_headings": required_headings,
            "doc_type": doc_type,
            "double_period_valid": True,
            "incorrect_sentences": [],
            "spacing_valid": True,
            "incorrect_spacing": [],
            "abbreviation_issues": [],
            "date_issues": [],
            "placeholder_issues": [],
        }
        return format_results_for_gradio(**report_fields)
    except Exception as e:
        print(f"Error in process_document: {str(e)}")
        raise
def get_document_checks(doc_type, template_type):
    """Look up the required headings for a document and template type.

    "Advisory Circular" documents use the short heading list for the
    "Short AC template AC" template and the long list for any other template
    value. All other document types have no required headings.

    Returns:
        Dict with a single ``"required_headings"`` key holding a new list.
    """
    short_ac_headings = ("Purpose", "Applicability", "Related Reading Material",
                         "Background", "Discussion")
    long_ac_headings = ("Purpose", "Applicability", "Audience", "Related Reading Material",
                        "Background", "Discussion", "Conclusion")

    if doc_type == "Advisory Circular":
        is_short = template_type == "Short AC template AC"
        selected = short_ac_headings if is_short else long_ac_headings
    else:
        # Other document types can be added here as needed.
        selected = ()

    # Copy into a fresh list so callers can never share or mutate state.
    return {"required_headings": list(selected)}
def format_results_for_gradio(**kwargs):
    """Render the heading and acronym check results as a Markdown report.

    NOTE(review): an earlier definition of the same name in this file renders
    many more sections. Because this one is defined later, it is the version
    bound at import time -- confirm that dropping the other sections is
    intended.

    Keyword Args:
        heading_valid: True when every required heading was found.
        required_headings: Headings that must be present (read on failure).
        headings_found: Headings found in the document (read on failure).
        acronyms_valid: True when every acronym is defined.
        undefined_acronyms: Acronyms lacking a definition (read on failure).

    Returns:
        The report as a single newline-joined Markdown string.

    Raises:
        KeyError: if a required keyword argument is missing.
    """
    results = ["# Document Check Results\n", "## Required Headings Check"]

    if kwargs['heading_valid']:
        results.append("✅ All required headings are present.\n")
    else:
        found = set(kwargs['headings_found'])
        results.append("❌ Missing Required Headings:")
        # Walk the required headings in their given order so the report is
        # deterministic (a plain set difference has no defined order).
        results.extend(
            f"- {heading}"
            for heading in dict.fromkeys(kwargs['required_headings'])
            if heading not in found
        )
        results.append("")

    results.append("## Acronym Check")
    if kwargs['acronyms_valid']:
        results.append("✅ All acronyms are properly defined.\n")
    else:
        results.append("❌ The following acronyms need to be defined at first use:")
        results.extend(f"- {acronym}" for acronym in kwargs['undefined_acronyms'])

    return "\n".join(results)
def process_file(file_obj, doc_type, template_type):
    """Check an uploaded document and return text for display.

    Args:
        file_obj: Uploaded content, either raw ``bytes`` or an object with a
            ``read()`` method; ``None`` when nothing was uploaded.
        doc_type: Document type name selected by the user.
        template_type: Template name selected by the user.

    Returns:
        The report from ``process_document``, a prompt to upload a file when
        ``file_obj`` is ``None``, or a user-facing error message if processing
        raises (the error is also printed).
    """
    if file_obj is None:
        return "Please upload a document first."

    try:
        if isinstance(file_obj, bytes):
            payload = file_obj
        else:
            payload = file_obj.read()
        # Wrap the bytes in a file-like object for the document loader.
        doc_bytes = io.BytesIO(payload)
        return process_document(doc_bytes, doc_type, template_type)
    except Exception as e:
        reason = str(e)
        print(f"Error processing file: {reason}")
        message_lines = (
            "An error occurred while processing the document:",
            f"Error: {reason}",
            "Please ensure:",
            "1. The file is a valid Word document (.docx)",
            "2. The file is not corrupted",
            "3. The file is not password protected",
            f"Technical details: {reason}",
        )
        return "\n".join(message_lines)
# Build the Gradio UI: upload controls on the left, Markdown report on the right.
document_types = [
    "Advisory Circular", "Airworthiness Criteria", "Deviation Memo", "Exemption",
    "Federal Register Notice", "Handbook/Manual", "Order", "Policy Statement",
    "Rule", "Special Condition", "Technical Standard Order", "Other"
]
template_types = ["Short AC template AC", "Long AC template AC"]

with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
    gr.Markdown("# FAA Document Checker")
    gr.Markdown("Upload a Word document to check for compliance with FAA documentation standards.")

    with gr.Row():
        # Left column: inputs and the submit button.
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload Word Document (.docx)",
                file_types=[".docx"],
                type="binary"
            )
            doc_type = gr.Dropdown(
                choices=document_types,
                label="Document Type",
                value="Advisory Circular"
            )
            template_type = gr.Radio(
                choices=template_types,
                label="Template Type (Only for Advisory Circular)",
                visible=True,
                value="Short AC template AC"
            )
            submit_btn = gr.Button("Check Document", variant="primary")

        # Right column: the rendered report.
        with gr.Column(scale=2):
            output = gr.Markdown(
                label="Check Results",
                value="Results will appear here after processing..."
            )

    def update_template_visibility(doc_type):
        """Show the template selector only for Advisory Circular documents."""
        return gr.update(visible=doc_type == "Advisory Circular")

    # Toggle the template selector whenever the document type changes.
    doc_type.change(
        fn=update_template_visibility,
        inputs=[doc_type],
        outputs=[template_type]
    )

    # Run the checks when the submit button is clicked.
    submit_btn.click(
        fn=process_file,
        inputs=[file_input, doc_type, template_type],
        outputs=[output]
    )

# Start the app.
demo.launch()