# Hoctar77's picture
# Update app.py
# aa7788c verified
# raw
# history blame
# 26.8 kB
import gradio as gr
import logging
import re
from docx import Document
import io
import traceback
def heading_title_check(paragraphs, required_headings):
    """Report whether every required heading appears as its own paragraph.

    Each paragraph is stripped of surrounding whitespace and compared
    exactly (case-sensitively) against the required heading strings.

    Args:
        paragraphs: Iterable of paragraph text strings.
        required_headings: Iterable of heading strings that must be present.

    Returns:
        A ``(all_present, headings_found)`` tuple. ``headings_found`` lists
        every matching paragraph in document order (duplicates are kept).
    """
    wanted = set(required_headings)
    trimmed = (text.strip() for text in paragraphs)
    headings_found = [text for text in trimmed if text in wanted]
    return set(headings_found) == wanted, headings_found
def acronym_check(paragraphs):
    """Find all-caps acronyms that are used before being defined.

    A definition is a word followed by the acronym in parentheses, e.g.
    ``Administration (FAA)``. Paragraphs are scanned in order; definitions
    found in a paragraph are registered before that paragraph's usages are
    checked, so a definition anywhere in a paragraph covers the whole
    paragraph.

    Args:
        paragraphs: Iterable of paragraph text strings.

    Returns:
        A ``(all_defined, undefined_acronyms)`` tuple, where
        ``undefined_acronyms`` is a set of the offending acronyms.
    """
    usage_re = re.compile(r'(\b[A-Z]{2,}\b)')
    definition_re = re.compile(r'(\b\w+\b) \((\b[A-Z]{2,}\b)\)')

    known = set()
    unknown = set()
    for text in paragraphs:
        # Register this paragraph's definitions first...
        known.update(acronym for _term, acronym in definition_re.findall(text))
        # ...then record every usage that still has no definition.
        unknown.update(
            acronym for acronym in usage_re.findall(text) if acronym not in known
        )
    return not unknown, unknown
def legal_check(paragraphs):
    """Flag legal terminology that does not follow the expected style.

    Two kinds of findings are produced for each paragraph, in this order:

    1. Capitalization of "title 14": it must be "Title 14" at the start of
       the paragraph or directly after ``.``, ``!`` or ``?``, and
       "title 14" when preceded only by whitespace.
    2. Fixed substitutions (e.g. "USC" -> "U.S.C.", "shall" ->
       "must or will"), checked in the order they are declared.

    Args:
        paragraphs: Iterable of paragraph text strings.

    Returns:
        A ``(is_valid, findings)`` tuple. ``findings`` is a list of
        ``(text_found, suggested_replacement)`` tuples; ``is_valid`` is
        True when the list is empty.
    """
    # NOTE: ``\b`` only matches between a word and a non-word character.
    # The patterns for "C.F.R." and "&" therefore must not end/start with
    # ``\b`` next to punctuation, otherwise "C.F.R. part" and "R & D" would
    # never be detected.
    incorrect_variations = {
        r"\bUSC\b": "U.S.C.",
        r"\bCFR Part\b": "CFR part",
        r"\bC\.F\.R\.": "CFR",
        r"\bWe\b": "The FAA",
        r"\bwe\b": "the FAA",
        r"\bcancelled\b": "canceled",
        r"\bshall\b": "must or will",
        r"&": "and",
    }
    # Compile everything once instead of once per paragraph.
    variation_patterns = [
        (re.compile(pattern), replacement)
        for pattern, replacement in incorrect_variations.items()
    ]
    title_14_pattern = re.compile(
        r"(?P<prefix>^|[.!?\s])\s*(?P<title>title 14|Title 14)\b"
    )

    incorrect_legal_references = []
    for paragraph in paragraphs:
        for match in title_14_pattern.finditer(paragraph):
            prefix = match.group('prefix')
            current_title = match.group('title')
            # The prefix is '', '.', '!', '?' or a single whitespace char.
            # Whitespace means mid-sentence (lowercase expected); anything
            # else means sentence start (capitalized expected).
            expected_title = "title 14" if prefix.isspace() else "Title 14"
            if current_title != expected_title:
                incorrect_legal_references.append((current_title, expected_title))

        for pattern, correct_term in variation_patterns:
            for match in pattern.finditer(paragraph):
                incorrect_legal_references.append((match.group(), correct_term))

    return len(incorrect_legal_references) == 0, incorrect_legal_references
def table_caption_check(paragraphs, doc_type):
    """Validate the numbering format of paragraphs that start with "table".

    For "Advisory Circular" and "Order" documents a caption must look like
    ``Table X-Y`` followed by a period or whitespace; for every other
    document type it must look like ``Table X`` followed by a period or
    whitespace. Matching is case-insensitive.

    Args:
        paragraphs: Iterable of paragraph text strings.
        doc_type: Document type name selecting the caption format.

    Returns:
        A ``(is_valid, incorrect_captions)`` tuple; the list holds the
        stripped text of each badly formatted caption paragraph.
    """
    if doc_type in ["Advisory Circular", "Order"]:
        expression = r'^Table\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]'
    else:
        expression = r'^Table\s+([A-Z0-9]+)[\.\s]'
    caption_re = re.compile(expression, re.IGNORECASE)

    incorrect_captions = []
    for raw_text in paragraphs:
        candidate = raw_text.strip()
        if not candidate.lower().startswith("table"):
            continue
        if caption_re.match(candidate) is None:
            incorrect_captions.append(candidate)
    return not incorrect_captions, incorrect_captions
def figure_caption_check(paragraphs, doc_type):
    """Validate the numbering format of paragraphs that start with "figure".

    "Advisory Circular" and "Order" documents require ``Figure X-Y``
    followed by a period or whitespace; all other document types require
    ``Figure X`` followed by a period or whitespace. Matching is
    case-insensitive.

    Args:
        paragraphs: Iterable of paragraph text strings.
        doc_type: Document type name selecting the caption format.

    Returns:
        A ``(is_valid, incorrect_fig_captions)`` tuple; the list holds the
        stripped text of each badly formatted caption paragraph.
    """
    hyphenated = doc_type in ["Advisory Circular", "Order"]
    caption_re = re.compile(
        r'^Figure\s+([A-Z0-9]+)-([A-Z0-9]+)[\.\s]' if hyphenated
        else r'^Figure\s+([A-Z0-9]+)[\.\s]',
        re.IGNORECASE,
    )
    stripped_paragraphs = (text.strip() for text in paragraphs)
    incorrect_fig_captions = [
        caption
        for caption in stripped_paragraphs
        if caption.lower().startswith("figure") and not caption_re.match(caption)
    ]
    return not incorrect_fig_captions, incorrect_fig_captions
def table_figure_reference_check(paragraphs, doc_type):
    """Collect in-text table/figure references that the patterns flag.

    Paragraphs that themselves start with "table" or "figure" (i.e.
    captions) are skipped. For "Advisory Circular" and "Order" documents a
    reference is flagged when it is a bare number with no ``-N`` suffix
    (e.g. "Table 5"). For other document types the pattern matches any
    numeric reference, with or without a suffix.

    Args:
        paragraphs: Iterable of paragraph text strings.
        doc_type: Document type name selecting the reference patterns.

    Returns:
        A ``(is_valid, flagged_references)`` tuple. ``flagged_references``
        holds the full matched text of each reference (e.g. "Table 5-2"),
        tables before figures within each paragraph.
    """
    if doc_type in ["Advisory Circular", "Order"]:
        table_ref_pattern = re.compile(r'\bTable\s+\d+(?!-\d+)\b', re.IGNORECASE)
        figure_ref_pattern = re.compile(r'\bFigure\s+\d+(?!-\d+)\b', re.IGNORECASE)
    else:
        # NOTE(review): these patterns match every numeric reference, both
        # "Table 5" and "Table 5-2", so any reference is reported for these
        # document types. Confirm which form is actually meant to be wrong.
        table_ref_pattern = re.compile(r'\bTable\s+\d+(-\d+)?\b', re.IGNORECASE)
        figure_ref_pattern = re.compile(r'\bFigure\s+\d+(-\d+)?\b', re.IGNORECASE)

    incorrect_table_figure_references = []
    for paragraph in paragraphs:
        if paragraph.strip().lower().startswith(('table', 'figure')):
            continue  # captions are validated by the caption checks
        for pattern in (table_ref_pattern, figure_ref_pattern):
            # finditer + group(0) yields the whole reference. findall would
            # return only the optional "(-\d+)" group ('' or '-2') for the
            # patterns that contain a capture group.
            incorrect_table_figure_references.extend(
                match.group(0) for match in pattern.finditer(paragraph)
            )

    return len(incorrect_table_figure_references) == 0, incorrect_table_figure_references
def document_title_check(doc_path, doc_type):
    """Check how referenced AC titles are styled (italics / quotation marks).

    Every paragraph is searched for references of the form
    ``AC <number>[-<number>], <title>``; the title runs up to the next
    period, comma, or end of paragraph. The styling required for the title
    depends on ``doc_type``.

    Args:
        doc_path: Path or file-like object accepted by ``Document``.
        doc_type: Document type name; must be a key of the formatting
            rules table below.

    Returns:
        A ``(is_valid, incorrect_titles)`` tuple. Each entry of
        ``incorrect_titles`` is a dict with ``'text'`` (the full matched
        reference) and ``'issue'`` (comma-separated description).

    Raises:
        ValueError: If ``doc_type`` has no formatting rule.
    """
    incorrect_titles = []
    doc = Document(doc_path)

    # Captures the title that follows an "AC 25-1," style reference.
    ac_pattern = re.compile(r'AC\s+\d+(?:-\d+)?(?:,|\s)+(.+?)(?=\.|,|$)')

    # Required title styling per document type.
    formatting_rules = {
        "Advisory Circular": {"italics": True, "quotes": False},
        "Airworthiness Criteria": {"italics": False, "quotes": True},
        "Deviation Memo": {"italics": False, "quotes": True},
        "Exemption": {"italics": False, "quotes": True},
        "Federal Register Notice": {"italics": False, "quotes": True},
        "Handbook/Manual": {"italics": False, "quotes": False},
        "Order": {"italics": False, "quotes": True},
        "Policy Statement": {"italics": False, "quotes": False},
        "Rule": {"italics": False, "quotes": True},
        "Special Condition": {"italics": False, "quotes": True},
        "Technical Standard Order": {"italics": False, "quotes": True},
        "Other": {"italics": False, "quotes": False},
    }

    if doc_type not in formatting_rules:
        raise ValueError(f"Unsupported document type: {doc_type}")
    required_format = formatting_rules[doc_type]

    # Straight quotes plus the curly ("smart") quotes Word inserts. Escapes
    # are used so the literals cannot be mangled by re-encoding.
    quote_chars = ('"', "'", '\u201c', '\u201d', '\u2018', '\u2019')

    for paragraph in doc.paragraphs:
        text = paragraph.text
        for match in ac_pattern.finditer(text):
            full_match = match.group(0)
            title_text = match.group(1).strip()
            title_start = match.start(1)

            title_in_quotes = any(q in title_text for q in quote_chars)

            # Take the italic setting of the run that contains the first
            # character of the title. ``run.italic`` may be None, which is
            # treated as "not italic".
            title_is_italicized = False
            current_pos = 0
            for run in paragraph.runs:
                run_length = len(run.text)
                if current_pos <= title_start < current_pos + run_length:
                    title_is_italicized = bool(run.italic)
                    break
                current_pos += run_length

            issue_message = []

            if required_format["italics"] and not title_is_italicized:
                issue_message.append("should be italicized")
            elif not required_format["italics"] and title_is_italicized:
                issue_message.append("should not be italicized")

            if required_format["quotes"] and not title_in_quotes:
                issue_message.append("should be in quotes")
            elif not required_format["quotes"] and title_in_quotes:
                issue_message.append("should not be in quotes")

            if issue_message:
                incorrect_titles.append({
                    'text': full_match,
                    'issue': ', '.join(issue_message)
                })

    return len(incorrect_titles) == 0, incorrect_titles
def get_document_checks(doc_type, template_type):
    """Look up the check configuration (required headings) for a document.

    For "Advisory Circular" the configuration is selected by
    ``template_type``; for every other document type ``template_type`` is
    ignored. The request and the result are logged at INFO level.

    Args:
        doc_type: Document type name.
        template_type: Template name, only used for "Advisory Circular".

    Returns:
        A dict with a ``"required_headings"`` list, or an empty dict when
        the document type (or AC template) is not known.
    """
    def _placeholder():
        # Fresh dict per document type that has not been researched yet.
        return {"required_headings": ["TBD - Need to research"]}

    advisory_circular_templates = {
        "Short AC template AC": {
            "required_headings": [
                "PURPOSE.",
                "APPLICABILITY.",
                "CANCELLATION.",
                "RELATED MATERIAL.",
                "DEFINITION OF KEY TERMS.",
            ]
        },
        "Long AC template AC": {
            "required_headings": [
                "Purpose.",
                "Applicability.",
                "Cancellation.",
                "Related Material.",
                "Definition of Key Terms.",
            ]
        },
    }

    document_checks = {
        "Advisory Circular": advisory_circular_templates,
        "Airworthiness Criteria": _placeholder(),
        "Deviation Memo": _placeholder(),
        "Exemption": _placeholder(),
        "Federal Register Notice": {
            "required_headings": [
                "Purpose of This Notice",
                "Audience",
                "Where can I Find This Notice",
            ]
        },
        "Handbook/Manual": _placeholder(),
        "Order": {
            "required_headings": [
                "Purpose of This Order.",
                "Audience.",
                "Where to Find This Order.",
            ]
        },
        "Policy Statement": {
            "required_headings": [
                "SUMMARY",
                "CURRENT REGULATORY AND ADVISORY MATERIAL",
                "RELEVANT PAST PRACTICE",
                "POLICY",
                "EFFECT OF POLICY",
                "CONCLUSION",
            ]
        },
        "Rule": _placeholder(),
        "Special Condition": _placeholder(),
        "Technical Standard Order": {
            "required_headings": [
                "PURPOSE.",
                "APPLICABILITY.",
                "REQUIREMENTS.",
                "MARKING.",
                "APPLICATION DATA REQUIREMENTS.",
                "MANUFACTURER DATA REQUIREMENTS.",
                "FURNISHED DATA REQUIREMENTS.",
                "HOW TO GET REFERENCED DOCUMENTS.",
            ]
        },
        "Other": {"required_headings": ["N/A"]},
    }

    logger = logging.getLogger(__name__)
    logger.info(f"Requested document type: {doc_type}")
    logger.info(f"Requested template type: {template_type}")

    entry = document_checks.get(doc_type, {})
    # Only Advisory Circulars have a second lookup level keyed by template.
    checks = entry.get(template_type, {}) if doc_type == "Advisory Circular" else entry

    logger.info(f"Retrieved checks: {checks}")
    return checks
def double_period_check(paragraphs):
    """Find sentences that end with two consecutive periods.

    Paragraphs are split into sentences at runs of spaces that follow
    ``.``, ``!`` or ``?``; a sentence is reported when it ends with "..".

    Args:
        paragraphs: Iterable of paragraph text strings.

    Returns:
        A ``(is_valid, incorrect_sentences)`` tuple; the list holds the
        stripped text of each offending sentence.
    """
    sentence_break = re.compile(r'(?<=[.!?]) +')
    incorrect_sentences = [
        piece.strip()
        for text in paragraphs
        for piece in sentence_break.split(text)
        if piece.endswith('..')
    ]
    return not incorrect_sentences, incorrect_sentences
def spacing_check(paragraphs):
    """Flag paragraphs with missing or doubled spacing.

    A paragraph is reported when any of these patterns is found in it
    (all case-insensitive except the whitespace check):

    * a document prefix (AC, AD, CFR, FAA, N, SFAR) immediately followed
      by digits,
    * one or two section symbols immediately followed by ``digits.digits``,
    * "Part" immediately followed by digits,
    * ``(`` followed by a letter or a digit 1-9 that is not directly
      closed by ``)``,
    * two or more consecutive whitespace characters.

    Args:
        paragraphs: Iterable of paragraph text strings.

    Returns:
        A ``(is_valid, incorrect_spacing)`` tuple; the list holds each
        offending paragraph unchanged.
    """
    # NOTE(review): the first four patterns start with (?<!\s), so a token
    # that is directly preceded by whitespace (e.g. " AC25") is not
    # flagged. Confirm that this is intended.
    patterns = (
        re.compile(r'(?<!\s)(AC|AD|CFR|FAA|N|SFAR)(\d+[-]?\d*)', re.IGNORECASE),
        # \u00a7 is the section sign; the escape keeps the source ASCII-only
        # so the symbol cannot be corrupted by a wrong file encoding.
        re.compile(r'(?<!\s)(\u00a7|\u00a7\u00a7)(\d+\.\d+)', re.IGNORECASE),
        re.compile(r'(?<!\s)Part(\d+)', re.IGNORECASE),
        re.compile(r'(?<!\s)(\([a-z](?!\))|\([1-9](?!\)))', re.IGNORECASE),
        re.compile(r'\s{2,}'),
    )

    incorrect_spacing = [
        paragraph
        for paragraph in paragraphs
        if any(pattern.search(paragraph) for pattern in patterns)
    ]
    return len(incorrect_spacing) == 0, incorrect_spacing
def check_prohibited_phrases(paragraphs):
    """Find paragraphs that contain a prohibited phrase.

    The phrases "above", "below", "there is" and "there are" are matched
    as whole words, case-insensitively.

    Args:
        paragraphs: Iterable of paragraph text strings.

    Returns:
        A list of ``(phrase, stripped_paragraph)`` tuples, one per phrase
        found in each paragraph, in paragraph order and then phrase order.
    """
    prohibited_phrases = ['above', 'below', 'there is', 'there are']
    # Keep the readable phrase next to its compiled pattern. Deriving the
    # label from the regex with str.strip(r'\b') strips the *characters*
    # "\" and "b", which turns "below" into "elow".
    compiled = [
        (phrase, re.compile(r'\b' + re.escape(phrase) + r'\b', re.IGNORECASE))
        for phrase in prohibited_phrases
    ]

    issues = []
    for paragraph in paragraphs:
        for phrase, pattern in compiled:
            if pattern.search(paragraph):
                issues.append((phrase, paragraph.strip()))
    return issues
def check_abbreviation_usage(paragraphs):
    """Report paragraphs that spell out a term again after it was abbreviated.

    A definition looks like ``Federal Aviation Administration (FAA)``: a
    run of letters, spaces and ``&`` followed by an all-caps acronym in
    parentheses. The first paragraph containing the spelled-out term
    (normally the defining one) is accepted; every later paragraph that
    contains it is reported.

    Args:
        paragraphs: Iterable of paragraph text strings.

    Returns:
        A list of ``(full_term, acronym, stripped_paragraph)`` tuples.
    """
    definition_re = re.compile(r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)')
    registry = {}
    issues = []

    for text in paragraphs:
        # Register acronyms defined in this paragraph (first definition wins).
        for spelled_out, acronym in definition_re.findall(text):
            if acronym in registry:
                continue
            # "defined" stays True until the first paragraph containing the
            # spelled-out term has been seen.
            registry[acronym] = {"full_term": spelled_out.strip(), "defined": True}

        # Look for spelled-out terms of every acronym known so far.
        for acronym, entry in registry.items():
            spelled_out = entry["full_term"]
            if spelled_out not in text:
                continue
            if entry["defined"]:
                # First sighting: allowed, but consume the allowance.
                entry["defined"] = False
                continue
            issues.append((spelled_out, acronym, text.strip()))

    return issues
def check_date_formats(paragraphs):
    """Report numeric slash dates such as ``01/15/2024``.

    Args:
        paragraphs: Iterable of paragraph text strings.

    Returns:
        A list of ``(date_text, stripped_paragraph)`` tuples, one per
        numeric date found.
    """
    long_form_re = re.compile(
        r'\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b'
    )
    numeric_re = re.compile(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b')  # MM/DD/YYYY

    date_issues = []
    for text in paragraphs:
        for found in numeric_re.findall(text):
            # A slash date starts with a digit, so it never matches the
            # "Month D, YYYY" form and is always reported.
            if long_form_re.match(found) is None:
                date_issues.append((found, text.strip()))
    return date_issues
def check_placeholders(paragraphs):
    """Find leftover placeholder text that should be removed.

    The phrases "TBD", "To be determined" and "To be added" are matched
    as whole words, case-insensitively.

    Args:
        paragraphs: Iterable of paragraph text strings.

    Returns:
        A list of ``(phrase, stripped_paragraph)`` tuples, one per phrase
        found in each paragraph, in paragraph order and then phrase order.
    """
    labels = ('TBD', 'To be determined', 'To be added')
    matchers = [
        (label, re.compile(r'\b' + label + r'\b', re.IGNORECASE))
        for label in labels
    ]
    return [
        (label, text.strip())
        for text in paragraphs
        for label, matcher in matchers
        if matcher.search(text)
    ]
def process_document(file_obj, doc_type, template_type):
    """Run every document check and return the formatted report.

    Args:
        file_obj: Path or file-like object accepted by ``Document``.
        doc_type: Document type name selecting which rules apply.
        template_type: Template name, forwarded to ``get_document_checks``.

    Returns:
        The Markdown report produced by ``format_results_for_gradio``, or
        an error message string if any step raises.
    """
    try:
        paragraphs = [para.text for para in Document(file_obj).paragraphs]
        checks = get_document_checks(doc_type, template_type)
        required_headings = checks.get("required_headings", [])

        report = {"required_headings": required_headings, "doc_type": doc_type}

        # Each check takes the plain paragraph text and yields a
        # (valid flag, findings) pair.
        report["heading_valid"], report["headings_found"] = heading_title_check(
            paragraphs, required_headings
        )
        report["acronyms_valid"], report["undefined_acronyms"] = acronym_check(paragraphs)
        report["legal_valid"], report["incorrect_legal_references"] = legal_check(paragraphs)
        report["table_valid"], report["incorrect_captions"] = table_caption_check(
            paragraphs, doc_type
        )
        report["figure_valid"], report["incorrect_fig_captions"] = figure_caption_check(
            paragraphs, doc_type
        )
        (
            report["references_valid"],
            report["incorrect_table_figure_references"],
        ) = table_figure_reference_check(paragraphs, doc_type)

        # The title styling check is only run for these two document types.
        if doc_type in ["Advisory Circular", "Order"]:
            title_outcome = document_title_check(file_obj, doc_type)
        else:
            title_outcome = (True, [])
        report["title_style_valid"], report["incorrect_titles"] = title_outcome

        report["double_period_valid"], report["incorrect_sentences"] = double_period_check(
            paragraphs
        )
        report["spacing_valid"], report["incorrect_spacing"] = spacing_check(paragraphs)
        report["date_issues"] = check_date_formats(paragraphs)
        report["placeholder_issues"] = check_placeholders(paragraphs)

        return format_results_for_gradio(**report)
    except Exception as e:
        print(f"Error in process_document: {str(e)}")
        return f"An error occurred while processing the document: {str(e)}"
def format_results_for_gradio(**kwargs):
    """Render all check results as one Markdown report.

    Required keyword arguments (all read with ``kwargs[...]``):
        heading_valid, headings_found, required_headings,
        acronyms_valid, undefined_acronyms,
        legal_valid, incorrect_legal_references,
        table_valid, incorrect_captions,
        figure_valid, incorrect_fig_captions,
        references_valid, incorrect_table_figure_references,
        title_style_valid, incorrect_titles,
        double_period_valid, incorrect_sentences,
        spacing_valid, incorrect_spacing,
        date_issues, placeholder_issues.

    Optional keyword arguments:
        doc_type: Used to pick the title-formatting note; defaults to
            ``'Unknown'``.

    Returns:
        The report as a single newline-joined Markdown string.

    Raises:
        KeyError: If a required keyword argument is missing.
    """
    # Status markers as escapes (check mark / cross mark) so they cannot be
    # corrupted by a wrong source-file encoding.
    ok = "\u2705"
    bad = "\u274c"

    results = []
    results.append("# Document Check Results\n")

    # Required Headings Check
    results.append("## Required Headings Check")
    if kwargs['heading_valid']:
        results.append(f"{ok} All required headings are present.\n")
    else:
        found = set(kwargs['headings_found'])
        # Keep the declared heading order (deduplicated) so the report is
        # deterministic; a plain set difference has arbitrary order.
        missing_headings = [
            heading
            for heading in dict.fromkeys(kwargs['required_headings'])
            if heading not in found
        ]
        results.append(f"{bad} Missing Required Headings:")
        for heading in missing_headings:
            results.append(f"- {heading}")
        results.append("")

    # Acronym Check
    results.append("## Acronym Check")
    if kwargs['acronyms_valid']:
        results.append(f"{ok} All acronyms are properly defined.\n")
    else:
        results.append(f"{bad} The following acronyms need to be defined at first use:")
        for acronym in kwargs['undefined_acronyms']:
            results.append(f"- {acronym}")
        results.append("")

    # Legal Check
    results.append("## Legal Terminology Check")
    if kwargs['legal_valid']:
        results.append(f"{ok} All legal references are properly formatted.\n")
    else:
        results.append(f"{bad} Incorrect Legal Terminology:")
        for incorrect_term, correct_term in kwargs['incorrect_legal_references']:
            results.append(f"- Use '{correct_term}' instead of '{incorrect_term}'")
        results.append("")

    # Table Caption Check
    results.append("## Table Caption Check")
    if kwargs['table_valid']:
        results.append(f"{ok} All table captions are correctly formatted.\n")
    else:
        results.append(f"{bad} Incorrect Table Captions:")
        for caption in kwargs['incorrect_captions']:
            results.append(f"- {caption}")
        results.append("")

    # Figure Caption Check
    results.append("## Figure Caption Check")
    if kwargs['figure_valid']:
        results.append(f"{ok} All figure captions are correctly formatted.\n")
    else:
        results.append(f"{bad} Incorrect Figure Captions:")
        for caption in kwargs['incorrect_fig_captions']:
            results.append(f"- {caption}")
        results.append("")

    # Table and Figure References Check
    results.append("## Table and Figure References Check")
    if kwargs['references_valid']:
        results.append(f"{ok} All table and figure references are correctly formatted.\n")
    else:
        results.append(f"{bad} Incorrect Table/Figure References:")
        for ref in kwargs['incorrect_table_figure_references']:
            results.append(f"- {ref}")
        results.append("")

    # Document Title Style Check
    results.append("## Document Title Style Check")
    if kwargs['title_style_valid']:
        results.append(f"{ok} All document title references are properly styled.\n")
    else:
        results.append(f"{bad} Incorrect Document Title Styling:")
        for title in kwargs['incorrect_titles']:
            results.append(f"- {title['text']}")
            results.append(f" - Issue: {title['issue']}")
        # Add formatting guidance for the selected document type.
        formatting_notes = {
            "Advisory Circular": "Document titles should be italicized, not in quotation marks.",
            "Order": "Document titles should be in quotation marks, not italicized.",
            "Federal Register Notice": "Document titles should be in quotation marks, not italicized.",
            "Policy Statement": "Document titles should not have any special formatting (no italics, no quotation marks)."
        }
        doc_type = kwargs.get('doc_type', 'Unknown')
        if doc_type in formatting_notes:
            results.append(f"\nNote: {formatting_notes[doc_type]}")
        else:
            results.append("\nNote: Please verify the correct formatting style for this document type.")
        results.append("")

    # Double Period Check
    results.append("## Double Period Check")
    if kwargs['double_period_valid']:
        results.append(f"{ok} No double periods found.\n")
    else:
        results.append(f"{bad} Sentences found with double periods:")
        for sentence in kwargs['incorrect_sentences']:
            results.append(f"- {sentence}")
        results.append("")

    # Spacing Check
    results.append("## Spacing Check")
    if kwargs['spacing_valid']:
        results.append(f"{ok} All spacing is correct.\n")
    else:
        results.append(f"{bad} Incorrect spacing found in:")
        for spacing in kwargs['incorrect_spacing']:
            results.append(f"- {spacing}")
        results.append("")

    # Date Format Consistency
    results.append("## Date Format Consistency")
    if not kwargs['date_issues']:
        results.append(f"{ok} All dates are in the correct format.\n")
    else:
        results.append(f"{bad} Date Format Issues:")
        for date, paragraph in kwargs['date_issues']:
            results.append(f"- Incorrect date format '{date}' in: {paragraph}")
        results.append("")

    # Placeholder Check
    results.append("## Placeholder Check")
    if not kwargs['placeholder_issues']:
        results.append(f"{ok} No future references or placeholders found.\n")
    else:
        results.append(f"{bad} Placeholders Found:")
        for phrase, paragraph in kwargs['placeholder_issues']:
            results.append(f"- Placeholder '{phrase}' in: {paragraph}")

    return "\n".join(results)
def process_file(file_obj, doc_type, template_type):
    """Wrap an uploaded file in a byte stream and run the document checks.

    Args:
        file_obj: ``None``, raw ``bytes``, or an object with a ``read()``
            method returning the file content.
        doc_type: Document type name, forwarded to ``process_document``.
        template_type: Template name, forwarded to ``process_document``.

    Returns:
        The result of ``process_document``, a prompt to upload a file when
        ``file_obj`` is ``None``, or an error message if processing raises.
    """
    # Guard: nothing was uploaded.
    if file_obj is None:
        return "Please upload a document first."

    try:
        if isinstance(file_obj, bytes):
            payload = file_obj
        else:
            payload = file_obj.read()
        # Hand the checks a seekable in-memory stream.
        return process_document(io.BytesIO(payload), doc_type, template_type)
    except Exception as e:
        error_message = f"""An error occurred while processing the document:
Error: {str(e)}
Please ensure:
1. The file is a valid Word document (.docx)
2. The file is not corrupted
3. The file is not password protected
Technical details: {str(e)}"""
        print(f"Error processing file: {str(e)}")
        return error_message
# Create the Gradio interface
demo = gr.Blocks(theme='JohnSmith9982/small_and_pretty')
with demo:
    # Introductory text shown at the top of the page.
    gr.Markdown("# Document Checker Tool")
    gr.Markdown("Upload a Word (docx) document to check for compliance with U.S. federal documentation standards.")
    gr.Markdown("*This tool is still in development and you might get false positives in your results*")
    gr.Markdown("Contact Eric Putnam if you have questions and comments.")
    gr.Markdown("""
1. Upload a clean (no track changes or comments) Word file.
2. Choose **Check Document**.""")
    # Choices offered by the "Document Type" dropdown.
    document_types = [
        "Advisory Circular", "Airworthiness Criteria", "Deviation Memo", "Exemption",
        "Federal Register Notice", "Handbook/Manual", "Order", "Policy Statement",
        "Rule", "Special Condition", "Technical Standard Order", "Other"
    ]
    # Choices offered by the "Template Type" radio group.
    template_types = ["Short AC template AC", "Long AC template AC"]
    with gr.Row():
        # Left column: inputs and the submit button.
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload Word Document (.docx)",
                file_types=[".docx"],
                type="binary"
            )
            doc_type = gr.Dropdown(
                choices=document_types,
                label="Document Type",
                value="Advisory Circular"
            )
            template_type = gr.Radio(
                choices=template_types,
                label="Template Type (Only for Advisory Circular)",
                visible=True,
                value="Short AC template AC"
            )
            submit_btn = gr.Button("Check Document", variant="primary")
        # Right column: the Markdown report.
        with gr.Column(scale=2):
            output = gr.Markdown(
                label="Check Results",
                value="Results will appear here after processing..."
            )
    # Update template type visibility based on document type
    def update_template_visibility(doc_type):
        """Return an update that shows the template selector only for "Advisory Circular"."""
        return gr.update(visible=doc_type == "Advisory Circular")
    doc_type.change(
        fn=update_template_visibility,
        inputs=[doc_type],
        outputs=[template_type]
    )
    # Process file when submit button is clicked
    submit_btn.click(
        fn=process_file,
        inputs=[file_input, doc_type, template_type],
        outputs=[output]
    )
# Launch the demo
demo.launch()