"""Gradio app that checks a Word document against FAA documentation standards.

Each ``*_check`` function inspects a python-docx ``Document`` and reports its
findings; ``format_results_for_gradio`` turns those findings into Markdown.
"""

import io
import logging
import re
import traceback

import gradio as gr
from docx import Document
from docx.text.paragraph import Paragraph

logger = logging.getLogger(__name__)

# An acronym is a standalone run of 2-5 capital letters.
_ACRONYM_RE = re.compile(r"\b[A-Z]{2,5}\b")
# A definition is "(ABC)" preceded by at least one character on the same line,
# i.e. the "full term (ABC)" form.
_DEFINED_ACRONYM_RE = re.compile(r"(?<=.)\(([A-Z]{2,5})\)")
# Any parenthesised acronym, used to discover abbreviation definitions.
_PAREN_ACRONYM_RE = re.compile(r"\(([A-Z]{2,5})\)")
# Numeric dates such as 1/2/2024.
_NUMERIC_DATE_RE = re.compile(r"\b\d{1,2}/\d{1,2}/\d{4}\b")
# "Table"/"Figure" that is not followed by an identifier such as "3", "2-1" or
# "A-1". Plurals ("Tables") and "Table of Contents" are deliberately ignored.
_BAD_REFERENCE_RE = re.compile(
    r"\b(?:Table|Figure)(?![A-Za-z])(?!\s+of\s+Contents\b)(?!\s+[A-Z]?-?\d)"
)

# Abbreviated legal citations and the spelled-out form expected alongside them.
_LEGAL_TERMS = {
    "C.F.R.": "Code of Federal Regulations",
    "F.R.": "Federal Register",
    "U.S.C.": "United States Code",
}
# The lookbehind stops "F.R." from matching inside "C.F.R.".
_LEGAL_TERM_RES = {
    term: re.compile(r"(?<![\w.])" + re.escape(term)) for term in _LEGAL_TERMS
}

_PLACEHOLDERS = ("[ENTER TEXT]", "[ENTER DATE]")
_QUOTED_TITLE_TYPES = ("Order", "Federal Register Notice")
# Connecting words that may appear inside a full term without contributing a
# letter to its abbreviation ("Code of Federal Regulations" -> CFR).
_MINOR_WORDS = frozenset({"a", "an", "and", "for", "in", "of", "on", "the", "to"})


def heading_title_check(doc, required_headings):
    """Check that every required heading appears in the document.

    Args:
        doc: python-docx Document to inspect.
        required_headings: Heading texts that must be present (exact match).

    Returns:
        ``(all_present, headings_found)``; ``(False, [])`` if the document
        could not be inspected.
    """
    try:
        headings_found = [
            paragraph.text.strip()
            for paragraph in doc.paragraphs
            if paragraph.style.name.startswith("Heading")
        ]
    except Exception:
        logger.exception("Error in heading check")
        return False, []

    present = set(headings_found)
    return all(heading in present for heading in required_headings), headings_found


def acronym_check(doc):
    """Check that every acronym used is defined somewhere as "term (ACRONYM)".

    Returns:
        ``(all_defined, undefined_acronyms)`` with the acronyms sorted so the
        report is stable between runs; ``(False, [])`` on error.
    """
    used = set()
    defined = set()
    try:
        # One pass collects both usages and definitions, instead of
        # rescanning the whole document for every acronym occurrence.
        for paragraph in doc.paragraphs:
            text = paragraph.text
            used.update(_ACRONYM_RE.findall(text))
            defined.update(_DEFINED_ACRONYM_RE.findall(text))
    except Exception:
        logger.exception("Error in acronym check")
        return False, []

    undefined = sorted(used - defined)
    return not undefined, undefined


def legal_check(doc):
    """Check that abbreviated legal citations come with their full name.

    A paragraph is flagged when it contains an abbreviated citation from
    ``_LEGAL_TERMS`` without the corresponding spelled-out name.

    Returns:
        ``(valid, incorrect_legal_references)`` where each reference is an
        ``(abbreviated, full_name)`` tuple reported once; ``(False, [])`` on
        error.
    """
    incorrect_legal_references = []
    try:
        for paragraph in doc.paragraphs:
            text = paragraph.text
            for term, full_name in _LEGAL_TERMS.items():
                if full_name in text or not _LEGAL_TERM_RES[term].search(text):
                    continue
                reference = (term, full_name)
                if reference not in incorrect_legal_references:
                    incorrect_legal_references.append(reference)
    except Exception:
        logger.exception("Error in legal check")
        return False, []

    return not incorrect_legal_references, incorrect_legal_references


def _text_before_table(table, doc):
    """Return the text of the paragraph directly before *table*, or None."""
    previous = table._element.getprevious()
    if previous is None or not isinstance(previous.tag, str):
        return None
    if not previous.tag.endswith("}p"):
        # The preceding sibling is another table or a section marker.
        return None
    # getprevious() yields a raw XML element; wrap it so .text returns the
    # paragraph's visible text rather than the element's (empty) XML text.
    return Paragraph(previous, doc).text


def table_caption_check(doc, doc_type):
    """Check the caption paragraph that precedes each table.

    Only Advisory Circulars are checked: their captions must look like
    "Table X. Caption text". Other document types have no rule yet.

    Returns:
        ``(valid, incorrect_captions)``; ``(False, [])`` on error.
    """
    incorrect_captions = []
    try:
        if doc_type == "Advisory Circular":
            for table in doc.tables:
                caption = _text_before_table(table, doc)
                if caption is None or not caption.startswith("Table"):
                    continue
                if not caption.startswith("Table ") or ". " not in caption:
                    incorrect_captions.append(caption)
    except Exception:
        logger.exception("Error in table caption check")
        return False, []

    return not incorrect_captions, incorrect_captions


def figure_caption_check(doc, doc_type):
    """Check paragraphs that start with "Figure".

    Only Advisory Circulars are checked: their captions must look like
    "Figure X. Caption text". Other document types have no rule yet.

    Returns:
        ``(valid, incorrect_fig_captions)``; ``(False, [])`` on error.
    """
    incorrect_fig_captions = []
    try:
        if doc_type == "Advisory Circular":
            for paragraph in doc.paragraphs:
                text = paragraph.text
                if text.startswith("Figure") and ". " not in text:
                    incorrect_fig_captions.append(text)
    except Exception:
        logger.exception("Error in figure caption check")
        return False, []

    return not incorrect_fig_captions, incorrect_fig_captions


def table_figure_reference_check(doc, doc_type):
    """Check that mentions of tables and figures name a specific item.

    Only Advisory Circulars are checked. A paragraph is flagged when the word
    "Table" or "Figure" is not followed by an identifier ("Table 3",
    "Figure 2-1", "Table A-1"). This is a heuristic: the rule it encodes is
    "references should be 'Table X' or 'Figure X'".

    Returns:
        ``(valid, incorrect_table_figure_references)`` listing the offending
        paragraph texts; ``(False, [])`` on error.
    """
    incorrect_table_figure_references = []
    try:
        if doc_type == "Advisory Circular":
            for paragraph in doc.paragraphs:
                text = paragraph.text
                if _BAD_REFERENCE_RE.search(text):
                    incorrect_table_figure_references.append(text)
    except Exception:
        logger.exception("Error in table/figure reference check")
        return False, []

    return not incorrect_table_figure_references, incorrect_table_figure_references


def document_title_check(doc, doc_type):
    """Check the formatting of the document title.

    The check only applies when the first paragraph uses the 'Title' style.

    Returns:
        ``(valid, incorrect_titles)`` where each entry is a dict with the
        keys ``"text"`` and ``"issue"``; ``(False, [])`` on error.
    """
    incorrect_titles = []
    try:
        paragraphs = doc.paragraphs
        if not paragraphs or paragraphs[0].style.name != "Title":
            return True, incorrect_titles

        title_text = paragraphs[0].text
        quoted = title_text.startswith('"') and title_text.endswith('"')
        issue = None
        if doc_type == "Advisory Circular":
            # Both the prefix and the suffix are required.
            if not (
                title_text.startswith("ADVISORY CIRCULAR ")
                and title_text.endswith(" AC")
            ):
                issue = (
                    "Advisory Circular titles should start with "
                    "'ADVISORY CIRCULAR ' and end with ' AC'"
                )
        elif doc_type in _QUOTED_TITLE_TYPES:
            if not quoted:
                issue = f"{doc_type} titles should be enclosed in quotation marks"
        elif doc_type == "Policy Statement":
            if title_text.startswith('"') or title_text.endswith('"'):
                issue = "Policy Statement titles should not have quotation marks"

        if issue is not None:
            incorrect_titles.append({"text": title_text, "issue": issue})
    except Exception:
        logger.exception("Error in document title check")
        return False, []

    return not incorrect_titles, incorrect_titles


def double_period_check(doc):
    """Find paragraphs containing two consecutive periods.

    Returns:
        ``(valid, incorrect_sentences)``; ``(False, [])`` on error.
    """
    try:
        incorrect_sentences = [
            paragraph.text for paragraph in doc.paragraphs if ".." in paragraph.text
        ]
    except Exception:
        logger.exception("Error in double period check")
        return False, []

    return not incorrect_sentences, incorrect_sentences


def spacing_check(doc):
    """Find paragraphs containing two or more consecutive spaces.

    Returns:
        ``(valid, incorrect_spacing)``; ``(False, [])`` on error.
    """
    try:
        incorrect_spacing = [
            paragraph.text for paragraph in doc.paragraphs if "  " in paragraph.text
        ]
    except Exception:
        logger.exception("Error in spacing check")
        return False, []

    return not incorrect_spacing, incorrect_spacing


def _full_term_for(text, abbr):
    """Return the phrase that "(ABBR)" abbreviates in *text*, or None.

    Walks backwards from the parenthesis matching word initials against the
    abbreviation's letters; minor connecting words may sit between them.
    """
    position = text.find(f"({abbr})")
    if position <= 0:
        return None

    phrase = []
    remaining = len(abbr) - 1
    for word in reversed(text[:position].split()):
        cleaned = word.strip("\"'“”‘’(),;:")
        if not cleaned:
            return None
        if cleaned[0].upper() == abbr[remaining]:
            phrase.append(cleaned)
            remaining -= 1
            if remaining < 0:
                return " ".join(reversed(phrase))
        elif phrase and cleaned.lower() in _MINOR_WORDS:
            phrase.append(cleaned)
        else:
            return None
    return None


def check_abbreviation_usage(doc):
    """Find full terms that are spelled out again after being abbreviated.

    Once a paragraph defines "Full Term (FT)", later paragraphs are expected
    to use the abbreviation. Only definitions whose word initials spell the
    abbreviation are recognised.

    Returns:
        A list of ``(full_term, abbreviation, paragraph_text)`` tuples; an
        empty list on error.
    """
    abbreviation_issues = []
    definitions = {}
    try:
        for paragraph in doc.paragraphs:
            text = paragraph.text
            # Check earlier definitions first so that the defining paragraph
            # itself is never reported.
            for abbr, full_term in definitions.items():
                if full_term in text:
                    abbreviation_issues.append((full_term, abbr, text.strip()))
            for abbr in _PAREN_ACRONYM_RE.findall(text):
                if abbr in definitions:
                    continue
                full_term = _full_term_for(text, abbr)
                if full_term:
                    definitions[abbr] = full_term
    except Exception:
        logger.exception("Error in abbreviation check")
        return []

    return abbreviation_issues


def check_date_formats(doc):
    """Find numeric dates written as M/D/YYYY.

    Returns:
        A list of ``(date_text, paragraph_text)`` tuples, one per date found;
        an empty list on error.
    """
    date_issues = []
    try:
        for paragraph in doc.paragraphs:
            text = paragraph.text
            date_issues.extend(
                (date_text, text) for date_text in _NUMERIC_DATE_RE.findall(text)
            )
    except Exception:
        logger.exception("Error in date format check")
        return []

    return date_issues


def check_placeholders(doc):
    """Find template placeholders that were never filled in.

    Returns:
        A list of ``(placeholder, paragraph_text)`` tuples; an empty list on
        error.
    """
    placeholder_issues = []
    try:
        for paragraph in doc.paragraphs:
            text = paragraph.text
            placeholder_issues.extend(
                (placeholder, text)
                for placeholder in _PLACEHOLDERS
                if placeholder in text
            )
    except Exception:
        logger.exception("Error in placeholder check")
        return []

    return placeholder_issues


def get_document_checks(doc_type, template_type):
    """Return the check configuration for a document type.

    Returns:
        A dict with a ``"required_headings"`` list. Document types other than
        Advisory Circular currently have no required headings.
    """
    if doc_type == "Advisory Circular":
        if template_type == "Short AC template AC":
            return {
                "required_headings": [
                    "Purpose",
                    "Applicability",
                    "Related Reading Material",
                    "Background",
                    "Discussion",
                ]
            }
        # Long AC template
        return {
            "required_headings": [
                "Purpose",
                "Applicability",
                "Audience",
                "Related Reading Material",
                "Background",
                "Discussion",
                "Conclusion",
            ]
        }

    # Add other document types as needed
    return {"required_headings": []}


def process_document(file_obj, doc_type, template_type):
    """Run every check on a Word document and return the Markdown report.

    Args:
        file_obj: File-like object (or path) that ``Document`` can open.
        doc_type: Document type selected in the UI.
        template_type: Template selected in the UI (Advisory Circular only).

    Raises:
        Exception: Whatever python-docx raises when the file cannot be opened;
            ``process_file`` turns that into a user-facing message.
    """
    doc = Document(file_obj)
    logger.info("Document read successfully.")

    required_headings = get_document_checks(doc_type, template_type).get(
        "required_headings", []
    )

    heading_valid, headings_found = heading_title_check(doc, required_headings)
    acronyms_valid, undefined_acronyms = acronym_check(doc)
    legal_valid, incorrect_legal_references = legal_check(doc)
    table_valid, incorrect_captions = table_caption_check(doc, doc_type)
    figure_valid, incorrect_fig_captions = figure_caption_check(doc, doc_type)
    references_valid, incorrect_table_figure_references = (
        table_figure_reference_check(doc, doc_type)
    )
    title_style_valid, incorrect_titles = document_title_check(doc, doc_type)
    double_period_valid, incorrect_sentences = double_period_check(doc)
    spacing_valid, incorrect_spacing = spacing_check(doc)
    abbreviation_issues = check_abbreviation_usage(doc)
    date_issues = check_date_formats(doc)
    placeholder_issues = check_placeholders(doc)

    return format_results_for_gradio(
        heading_valid=heading_valid,
        headings_found=headings_found,
        acronyms_valid=acronyms_valid,
        undefined_acronyms=undefined_acronyms,
        legal_valid=legal_valid,
        incorrect_legal_references=incorrect_legal_references,
        table_valid=table_valid,
        incorrect_captions=incorrect_captions,
        figure_valid=figure_valid,
        incorrect_fig_captions=incorrect_fig_captions,
        references_valid=references_valid,
        incorrect_table_figure_references=incorrect_table_figure_references,
        title_style_valid=title_style_valid,
        incorrect_titles=incorrect_titles,
        required_headings=required_headings,
        doc_type=doc_type,
        double_period_valid=double_period_valid,
        incorrect_sentences=incorrect_sentences,
        spacing_valid=spacing_valid,
        incorrect_spacing=incorrect_spacing,
        abbreviation_issues=abbreviation_issues,
        date_issues=date_issues,
        placeholder_issues=placeholder_issues,
    )


def _append_section(lines, title, passed, success, failure, details):
    """Append one report section: a heading plus a pass line or a bullet list."""
    lines.append(f"## {title}")
    if passed:
        lines.append(f"✅ {success}\n")
    else:
        lines.append(f"❌ {failure}")
        lines.extend(details)
        lines.append("")


def _title_details(incorrect_titles, doc_type):
    """Build the bullet lines and formatting note for the title section."""
    formatting_notes = {
        "Advisory Circular": "Document titles should be italicized, not in quotation marks.",
        "Order": "Document titles should be in quotation marks, not italicized.",
        "Federal Register Notice": "Document titles should be in quotation marks, not italicized.",
        "Policy Statement": "Document titles should not have any special formatting (no italics, no quotation marks).",
    }
    details = []
    for title in incorrect_titles:
        details.append(f"- {title['text']}")
        details.append(f"  - Issue: {title['issue']}")
    if doc_type in formatting_notes:
        details.append(f"\nNote: {formatting_notes[doc_type]}")
    else:
        details.append(
            "\nNote: Please verify the correct formatting style for this document type."
        )
    return details


def format_results_for_gradio(**kwargs):
    """Format the check results as a Markdown report.

    Args:
        **kwargs: The ``*_valid`` flags and finding lists produced by
            ``process_document``, plus ``required_headings`` and ``doc_type``.
            ``abbreviation_issues`` and ``doc_type`` are optional.

    Returns:
        The report as a single Markdown string.
    """
    results = ["# Document Check Results\n"]

    # Keep the template's heading order rather than arbitrary set order.
    found = set(kwargs["headings_found"])
    missing_headings = [h for h in kwargs["required_headings"] if h not in found]
    _append_section(
        results,
        "Required Headings Check",
        kwargs["heading_valid"],
        "All required headings are present.",
        "Missing Required Headings:",
        [f"- {heading}" for heading in missing_headings],
    )
    _append_section(
        results,
        "Acronym Check",
        kwargs["acronyms_valid"],
        "All acronyms are properly defined.",
        "The following acronyms need to be defined at first use:",
        [f"- {acronym}" for acronym in kwargs["undefined_acronyms"]],
    )
    _append_section(
        results,
        "Legal Terminology Check",
        kwargs["legal_valid"],
        "All legal references are properly formatted.",
        "Incorrect Legal Terminology:",
        [
            f"- Use '{correct_term}' instead of '{incorrect_term}'"
            for incorrect_term, correct_term in kwargs["incorrect_legal_references"]
        ],
    )
    _append_section(
        results,
        "Table Caption Check",
        kwargs["table_valid"],
        "All table captions are correctly formatted.",
        "Incorrect Table Captions:",
        [f"- {caption}" for caption in kwargs["incorrect_captions"]],
    )
    _append_section(
        results,
        "Figure Caption Check",
        kwargs["figure_valid"],
        "All figure captions are correctly formatted.",
        "Incorrect Figure Captions:",
        [f"- {caption}" for caption in kwargs["incorrect_fig_captions"]],
    )
    _append_section(
        results,
        "Table and Figure References Check",
        kwargs["references_valid"],
        "All table and figure references are correctly formatted.",
        "Incorrect Table/Figure References:",
        [f"- {ref}" for ref in kwargs["incorrect_table_figure_references"]],
    )
    _append_section(
        results,
        "Document Title Style Check",
        kwargs["title_style_valid"],
        "All document title references are properly styled.",
        "Incorrect Document Title Styling:",
        _title_details(kwargs["incorrect_titles"], kwargs.get("doc_type", "Unknown")),
    )
    _append_section(
        results,
        "Double Period Check",
        kwargs["double_period_valid"],
        "No double periods found.",
        "Sentences found with double periods:",
        [f"- {sentence}" for sentence in kwargs["incorrect_sentences"]],
    )
    _append_section(
        results,
        "Spacing Check",
        kwargs["spacing_valid"],
        "All spacing is correct.",
        "Incorrect spacing found in:",
        [f"- {spacing}" for spacing in kwargs["incorrect_spacing"]],
    )

    # These findings were collected by process_document but never reported.
    abbreviation_issues = kwargs.get("abbreviation_issues") or []
    _append_section(
        results,
        "Abbreviation Consistency",
        not abbreviation_issues,
        "Abbreviations are used consistently.",
        "Full terms repeated after their abbreviation was defined:",
        [
            f"- Use '{abbr}' instead of '{full_term}' in: {paragraph}"
            for full_term, abbr, paragraph in abbreviation_issues
        ],
    )
    _append_section(
        results,
        "Date Format Consistency",
        not kwargs["date_issues"],
        "All dates are in the correct format.",
        "Date Format Issues:",
        [
            f"- Incorrect date format '{date}' in: {paragraph}"
            for date, paragraph in kwargs["date_issues"]
        ],
    )
    _append_section(
        results,
        "Placeholder Check",
        not kwargs["placeholder_issues"],
        "No placeholders found.",
        "Placeholders Found:",
        [
            f"- Placeholder '{phrase}' in: {paragraph}"
            for phrase, paragraph in kwargs["placeholder_issues"]
        ],
    )

    return "\n".join(results)


def process_file(file_obj, doc_type, template_type):
    """Process the uploaded file and return the report or an error message.

    Args:
        file_obj: Uploaded content as bytes (Gradio ``type="binary"``) or a
            readable binary file object; ``None`` when nothing was uploaded.
        doc_type: Document type selected in the UI.
        template_type: Template selected in the UI.

    Returns:
        Markdown text: the check report, a prompt to upload a file, or a
        description of what went wrong. Never raises.
    """
    if file_obj is None:
        return "Please upload a document first."

    try:
        if isinstance(file_obj, (bytes, bytearray)):
            raw = file_obj
        else:
            raw = file_obj.read()
        return process_document(io.BytesIO(raw), doc_type, template_type)
    except Exception as e:
        # UI boundary: log the full traceback, show the user a readable message.
        logger.error("Error processing file: %s", e)
        logger.error("Full traceback: %s", traceback.format_exc())
        return (
            "An error occurred while processing the document:\n\n"
            f"Error: {e}\n\n"
            "Please ensure:\n"
            "1. The file is a valid Word document (.docx)\n"
            "2. The file is not corrupted\n"
            "3. The file is not password protected\n\n"
            f"Technical details: {e}"
        )


# Create the Gradio interface
demo = gr.Blocks(theme='JohnSmith9982/small_and_pretty')

with demo:
    gr.Markdown("# FAA Document Checker")
    gr.Markdown(
        "Upload a Word document to check for compliance with FAA documentation standards."
    )

    document_types = [
        "Advisory Circular",
        "Airworthiness Criteria",
        "Deviation Memo",
        "Exemption",
        "Federal Register Notice",
        "Handbook/Manual",
        "Order",
        "Policy Statement",
        "Rule",
        "Special Condition",
        "Technical Standard Order",
        "Other",
    ]
    template_types = ["Short AC template AC", "Long AC template AC"]

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload Word Document (.docx)",
                file_types=[".docx"],
                type="binary",
            )
            doc_type = gr.Dropdown(
                choices=document_types,
                label="Document Type",
                value="Advisory Circular",
            )
            template_type = gr.Radio(
                choices=template_types,
                label="Template Type (Only for Advisory Circular)",
                visible=True,
                value="Short AC template AC",
            )
            submit_btn = gr.Button("Check Document", variant="primary")

        with gr.Column(scale=2):
            output = gr.Markdown(
                label="Check Results",
                value="Results will appear here after processing...",
            )

    def update_template_visibility(doc_type):
        """Show the template selector only for Advisory Circulars."""
        return gr.update(visible=doc_type == "Advisory Circular")

    doc_type.change(
        fn=update_template_visibility,
        inputs=[doc_type],
        outputs=[template_type],
    )

    # Process file when submit button is clicked
    submit_btn.click(
        fn=process_file,
        inputs=[file_input, doc_type, template_type],
        outputs=[output],
    )


if __name__ == "__main__":
    # Launch only when run as a script so importing the module has no side
    # effect beyond building the UI.
    logging.basicConfig(level=logging.INFO)
    demo.launch()