Spaces:

Hoctar77
/

DocumentCheckerTool

Sleeping

File size: 24,262 Bytes

import gradio as gr
import logging
import re
from docx import Document
import io
import traceback

def heading_title_check(doc, required_headings):
    """Report whether every required heading appears in the document.

    A paragraph counts as a heading when its style name starts with
    ``'Heading'``. Heading text is stripped of surrounding whitespace and
    compared to ``required_headings`` with an exact, case-sensitive match.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph provides
            ``style.name`` and ``text``.
        required_headings: Heading titles that must be present.

    Returns:
        ``(all_present, headings_found)``. If reading the paragraphs raises,
        the error is printed and ``(False, [])`` is returned.
    """
    try:
        headings_found = [
            para.text.strip()
            for para in doc.paragraphs
            if para.style.name.startswith('Heading')
        ]
    except Exception as e:
        print(f"Error in heading check: {str(e)}")
        return False, []

    # Any required title without an exact match makes the check fail.
    missing = [title for title in required_headings if title not in headings_found]
    return not missing, headings_found

def acronym_check(doc):
    """Check that every acronym used in the document is defined somewhere.

    An acronym is a standalone run of 2-5 capital letters. It counts as
    defined when any paragraph contains it in parentheses with at least one
    character directly before the opening parenthesis, e.g.
    ``Federal Aviation Administration (FAA)``. The definition may appear
    anywhere in the document, not only before the first use.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph provides ``text``.

    Returns:
        ``(all_defined, undefined_acronyms)`` where ``undefined_acronyms`` is
        sorted alphabetically so the report is stable between runs. If reading
        the document raises, the error is printed and ``(False, [])`` is
        returned.
    """
    try:
        acronym_re = re.compile(r'\b[A-Z]{2,5}\b')
        # The lookbehind requires some text before "(" without consuming it,
        # so back-to-back definitions such as "x (AB)(CD)" are both seen.
        definition_re = re.compile(r'(?<=.)\(([A-Z]{2,5})\)')

        used = set()
        defined = set()
        # One pass over the document instead of rescanning every paragraph
        # for each acronym occurrence.
        for paragraph in doc.paragraphs:
            text = paragraph.text
            used.update(acronym_re.findall(text))
            defined.update(definition_re.findall(text))
    except Exception as e:
        print(f"Error in acronym check: {str(e)}")
        return False, []

    undefined_acronyms = sorted(used - defined)
    return len(undefined_acronyms) == 0, undefined_acronyms

def legal_check(doc):
    """Flag dotted legal abbreviations used without their full name.

    For each paragraph, an abbreviation ("C.F.R.", "F.R.", "U.S.C.") is
    reported when it appears and the corresponding full name does not appear
    in that same paragraph.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph provides ``text``.

    Returns:
        ``(all_correct, incorrect_legal_references)`` where the list holds one
        ``(abbreviation, full_name)`` tuple per offending paragraph/term pair.
        If reading the document raises, the error is printed and
        ``(False, [])`` is returned.
    """
    incorrect_legal_references = []

    try:
        # Abbreviation -> spelled-out name that should accompany it.
        legal_terms = {
            "C.F.R.": "Code of Federal Regulations",
            "F.R.": "Federal Register",
            "U.S.C.": "United States Code"
        }

        # The negative lookbehind rejects a match that is merely the tail of
        # a longer dotted abbreviation, so "C.F.R." is not also reported as
        # "F.R.".
        term_patterns = [
            (abbreviation, full_name,
             re.compile(r'(?<![A-Za-z]\.)' + re.escape(abbreviation)))
            for abbreviation, full_name in legal_terms.items()
        ]

        for paragraph in doc.paragraphs:
            text = paragraph.text
            for abbreviation, full_name, pattern in term_patterns:
                if pattern.search(text) and full_name not in text:
                    incorrect_legal_references.append((abbreviation, full_name))
    except Exception as e:
        print(f"Error in legal check: {str(e)}")
        return False, []

    return len(incorrect_legal_references) == 0, incorrect_legal_references

def table_caption_check(doc, doc_type):
    """Check the caption that immediately precedes each table.

    The element directly before each table is treated as its caption when its
    text starts with "Table". For Advisory Circulars the caption must start
    with "Table " and contain ". " (i.e. "Table X. Caption text"); other
    document types are not checked.

    Args:
        doc: Object exposing ``tables``; each table provides
            ``_element.getprevious()``.
        doc_type: Document type name selected by the user.

    Returns:
        ``(all_correct, incorrect_captions)``. If inspecting the tables
        raises, the error is printed and ``(False, [])`` is returned.
    """
    incorrect_captions = []

    try:
        for table in doc.tables:
            preceding = table._element.getprevious()
            # A table may be the first element in its container, or be
            # preceded by an element with no string text (for example
            # another table). Neither is a caption, so skip instead of
            # failing the whole check.
            # NOTE(review): assumes the preceding element exposes the
            # paragraph text as ``.text`` - confirm for the installed
            # python-docx version.
            caption = getattr(preceding, "text", None)
            if not isinstance(caption, str) or not caption.startswith("Table"):
                continue

            if doc_type == "Advisory Circular":
                # AC captions should be "Table X. Caption text"
                if not caption.startswith("Table ") or ". " not in caption:
                    incorrect_captions.append(caption)
    except Exception as e:
        print(f"Error in table caption check: {str(e)}")
        return False, []

    return len(incorrect_captions) == 0, incorrect_captions

def figure_caption_check(doc, doc_type):
    """Check the format of paragraphs that look like figure captions.

    Every paragraph whose text starts with "Figure" is treated as a caption.
    For Advisory Circulars a caption must contain ". " (as in
    "Figure X. Caption text"); other document types are not checked.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph provides ``text``.
        doc_type: Document type name selected by the user.

    Returns:
        ``(all_correct, incorrect_fig_captions)``. If reading the paragraphs
        raises, the error is printed and ``(False, [])`` is returned.
    """
    flagged = []

    try:
        for para in doc.paragraphs:
            if not para.text.startswith("Figure"):
                continue
            # Only Advisory Circulars have a caption rule at the moment.
            if doc_type != "Advisory Circular":
                continue
            if ". " not in para.text:
                flagged.append(para.text)
    except Exception as e:
        print(f"Error in figure caption check: {str(e)}")
        return False, []

    return not flagged, flagged

def table_figure_reference_check(doc, doc_type):
    """Check paragraphs that mention a table or figure.

    For Advisory Circulars, every paragraph containing "Table" or "Figure"
    is flagged unless its text begins with "Table " or "Figure ". Other
    document types are not checked.

    NOTE(review): as written, any sentence that refers to a table or figure
    mid-sentence is flagged - confirm this is the intended rule.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph provides ``text``.
        doc_type: Document type name selected by the user.

    Returns:
        ``(all_correct, incorrect_table_figure_references)``. If reading the
        paragraphs raises, the error is printed and ``(False, [])`` is
        returned.
    """
    flagged = []
    accepted_prefixes = ("Table ", "Figure ")

    try:
        for para in doc.paragraphs:
            body = para.text
            mentions_item = "Table" in body or "Figure" in body
            if (
                mentions_item
                and doc_type == "Advisory Circular"
                and not body.startswith(accepted_prefixes)
            ):
                flagged.append(body)
    except Exception as e:
        print(f"Error in table/figure reference check: {str(e)}")
        return False, []

    return not flagged, flagged

def document_title_check(doc, doc_type):
    """Check the formatting of the document title.

    Only the first paragraph is inspected, and only when its style is named
    ``'Title'``. The rules per document type are:

    * Advisory Circular: must start with "ADVISORY CIRCULAR " and end with
      " AC".
    * Order / Federal Register Notice: must be enclosed in quotation marks.
    * Policy Statement: must not start or end with a quotation mark.

    Other document types are not checked.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph provides
            ``style.name`` and ``text``.
        doc_type: Document type name selected by the user.

    Returns:
        ``(all_correct, incorrect_titles)`` where each entry is a dict with
        ``"text"`` and ``"issue"`` keys. If reading the document raises, the
        error is printed and ``(False, [])`` is returned.
    """
    incorrect_titles = []

    try:
        paragraphs = doc.paragraphs
        if len(paragraphs) > 0 and paragraphs[0].style.name == 'Title':
            title_text = paragraphs[0].text
            issue = None

            if doc_type == "Advisory Circular":
                # Both conditions are required. The previous version flagged
                # titles that DID end with " AC", contradicting its message.
                if not (title_text.startswith("ADVISORY CIRCULAR ") and title_text.endswith(" AC")):
                    issue = "Advisory Circular titles should start with 'ADVISORY CIRCULAR ' and end with ' AC'"
            elif doc_type in ("Order", "Federal Register Notice"):
                if not title_text.startswith('"') or not title_text.endswith('"'):
                    issue = f"{doc_type} titles should be enclosed in quotation marks"
            elif doc_type == "Policy Statement":
                if title_text.startswith('"') or title_text.endswith('"'):
                    issue = "Policy Statement titles should not have quotation marks"

            if issue is not None:
                incorrect_titles.append({"text": title_text, "issue": issue})
    except Exception as e:
        print(f"Error in document title check: {str(e)}")
        return False, []

    return len(incorrect_titles) == 0, incorrect_titles

def double_period_check(doc):
    """Flag paragraphs whose text contains two consecutive periods.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph provides ``text``.

    Returns:
        ``(none_found, incorrect_sentences)`` where the list holds the full
        text of each offending paragraph. If reading the paragraphs raises,
        the error is printed and ``(False, [])`` is returned.
    """
    try:
        offenders = [para.text for para in doc.paragraphs if ".." in para.text]
    except Exception as e:
        print(f"Error in double period check: {str(e)}")
        return False, []

    return not offenders, offenders

def spacing_check(doc):
    """Flag paragraphs whose text contains two consecutive spaces.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph provides ``text``.

    Returns:
        ``(none_found, incorrect_spacing)`` where the list holds the full
        text of each offending paragraph. If reading the paragraphs raises,
        the error is printed and ``(False, [])`` is returned.
    """
    double_space = " " * 2

    try:
        offenders = [para.text for para in doc.paragraphs if double_space in para.text]
    except Exception as e:
        print(f"Error in spacing check: {str(e)}")
        return False, []

    return not offenders, offenders

def _spelled_out_term(text_before, abbr):
    """Guess the words of the spelled-out term that precede "(ABBR)".

    Tries the shortest run of trailing words in ``text_before`` that has at
    least one word per letter of ``abbr`` (allowing up to two extra words for
    connectors such as "of") and whose first word starts with the
    abbreviation's first letter. Returns the list of words, or ``None`` when
    no run qualifies.
    """
    words = text_before.split()
    for count in range(len(abbr), len(abbr) + 3):
        if count > len(words):
            break
        candidate = words[-count:]
        if candidate[0][:1].upper() == abbr[0]:
            return candidate
    return None


def check_abbreviation_usage(doc):
    """Flag spelled-out terms that are reused after being abbreviated.

    A definition is text of the form "Full Term (ABBR)" with a 2-5 capital
    letter abbreviation. Once a definition has been seen, any later paragraph
    that spells the full term out again (matched case-insensitively) is
    reported. The paragraph that introduces the definition is not reported.

    The previous implementation tested the regex source string as a literal
    substring (always true) and reported the regex fragment as the term.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph provides ``text``.

    Returns:
        List of ``(full_term, abbreviation, paragraph_text)`` tuples. If
        reading the document raises, the error is printed and ``[]`` is
        returned.
    """
    abbreviation_issues = []

    try:
        definition_re = re.compile(r'\(([A-Z]{2,5})\)')
        # abbreviation -> (full term, compiled pattern that finds the term)
        known_terms = {}

        for paragraph in doc.paragraphs:
            text = paragraph.text

            # Compare against earlier definitions first so the paragraph
            # that introduces a term is not reported against itself.
            for abbr, (full_term, term_re) in known_terms.items():
                if term_re.search(text):
                    abbreviation_issues.append((full_term, abbr, text))

            for match in definition_re.finditer(text):
                abbr = match.group(1)
                if abbr in known_terms:
                    continue
                words = _spelled_out_term(text[:match.start()], abbr)
                if words is None:
                    # Cannot tell what the abbreviation stands for.
                    continue
                # Tolerate any whitespace between the words and require
                # whole-word boundaries at both ends.
                term_re = re.compile(
                    r'(?<!\w)' + r'\s+'.join(re.escape(word) for word in words) + r'(?!\w)',
                    re.IGNORECASE,
                )
                known_terms[abbr] = (" ".join(words), term_re)
    except Exception as e:
        print(f"Error in abbreviation check: {str(e)}")
        return []

    return abbreviation_issues

def check_date_formats(doc):
    """Find numeric slash-formatted dates (e.g. ``1/15/2024``).

    Args:
        doc: Object exposing ``paragraphs``; each paragraph provides ``text``.

    Returns:
        List of ``(date, paragraph_text)`` tuples, one per matched date, in
        document order. If reading the document raises, the error is printed
        and ``[]`` is returned.
    """
    date_issues = []

    try:
        date_re = re.compile(r'\b\d{1,2}/\d{1,2}/\d{4}\b')

        for paragraph in doc.paragraphs:
            text = paragraph.text
            # Report the matched date itself. The report prints the first
            # element as the date, and it used to be the whole paragraph.
            for date in date_re.findall(text):
                date_issues.append((date, text))
    except Exception as e:
        print(f"Error in date format check: {str(e)}")
        return []

    return date_issues

def check_placeholders(doc):
    """Find template placeholders that were left in the document.

    Looks for the literal phrases ``[ENTER TEXT]`` and ``[ENTER DATE]``.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph provides ``text``.

    Returns:
        List of ``(placeholder, paragraph_text)`` tuples, one per placeholder
        phrase found in a paragraph. If reading the document raises, the
        error is printed and ``[]`` is returned.
    """
    placeholder_issues = []
    placeholder_phrases = ('[ENTER TEXT]', '[ENTER DATE]')

    try:
        # The previous version read ``doc.paragraprams`` (a typo), so it
        # always raised and the check silently reported nothing.
        for paragraph in doc.paragraphs:
            text = paragraph.text
            for phrase in placeholder_phrases:
                if phrase in text:
                    # Report the phrase itself. The report prints the first
                    # element as the placeholder.
                    placeholder_issues.append((phrase, text))
    except Exception as e:
        print(f"Error in placeholder check: {str(e)}")
        return []

    return placeholder_issues

def get_document_checks(doc_type, template_type):
    """Look up the checks that apply to a document type and template.

    Only Advisory Circulars have required headings. The short template is
    selected by the exact string "Short AC template AC"; any other template
    value gets the long list. All other document types get an empty list.

    Note: this file defines a function with this name a second time further
    down; Python binds the name to the later definition.

    Args:
        doc_type: Document type name selected by the user.
        template_type: Template name selected by the user.

    Returns:
        Dict with a single ``"required_headings"`` key holding a list of
        heading titles.
    """
    # Add other document types as needed
    if doc_type != "Advisory Circular":
        return {"required_headings": []}

    if template_type == "Short AC template AC":
        headings = ["Purpose", "Applicability", "Related Reading Material",
                    "Background", "Discussion"]
    else:  # Long AC template
        headings = ["Purpose", "Applicability", "Audience", "Related Reading Material",
                    "Background", "Discussion", "Conclusion"]

    return {"required_headings": headings}

def process_file(file_obj, doc_type, template_type):
    """Run the document checks on an upload and return text for display.

    Note: this file defines a function with this name a second time further
    down; Python binds the name to the later definition.

    Args:
        file_obj: ``None``, raw ``bytes``, or an object with a ``read()``
            method that returns the document bytes.
        doc_type: Document type name selected by the user.
        template_type: Template name selected by the user.

    Returns:
        The report produced by ``process_document``. Returns a prompt to
        upload a file when ``file_obj`` is ``None``, or a user-facing error
        message when anything raises. The error and its traceback are also
        printed.
    """
    if file_obj is None:
        return "Please upload a document first."

    try:
        # Document loading needs a seekable binary stream.
        payload = file_obj if isinstance(file_obj, bytes) else file_obj.read()
        return process_document(io.BytesIO(payload), doc_type, template_type)
    except Exception as e:
        reason = str(e)
        trace_text = traceback.format_exc()
        print(f"Error processing file: {reason}")
        print(f"Full traceback: {trace_text}")

        return (
            "An error occurred while processing the document:\n"
            "        \n"
            f"Error: {reason}\n"
            "\n"
            "Please ensure:\n"
            "1. The file is a valid Word document (.docx)\n"
            "2. The file is not corrupted\n"
            "3. The file is not password protected\n"
            "\n"
            f"Technical details: {reason}"
        )

def process_document(file_obj, doc_type, template_type):
    """Load a Word document, run every check, and return the formatted report.

    Args:
        file_obj: Value passed straight to ``Document(...)``.
        doc_type: Document type name selected by the user.
        template_type: Template name selected by the user.

    Returns:
        The string produced by ``format_results_for_gradio``.

    Raises:
        Exception: Anything raised while loading or checking the document is
            printed and then re-raised unchanged.
    """
    try:
        doc = Document(file_obj)
        print("Document read successfully.")

        required_headings = get_document_checks(doc_type, template_type).get("required_headings", [])

        # Collect everything the formatter needs under the keyword names it
        # reads. The checks run in the same fixed order as before.
        report_args = {"required_headings": required_headings, "doc_type": doc_type}

        # Checks that return a (passed, details) pair.
        report_args["heading_valid"], report_args["headings_found"] = heading_title_check(doc, required_headings)
        report_args["acronyms_valid"], report_args["undefined_acronyms"] = acronym_check(doc)
        report_args["legal_valid"], report_args["incorrect_legal_references"] = legal_check(doc)
        report_args["table_valid"], report_args["incorrect_captions"] = table_caption_check(doc, doc_type)
        report_args["figure_valid"], report_args["incorrect_fig_captions"] = figure_caption_check(doc, doc_type)
        (report_args["references_valid"],
         report_args["incorrect_table_figure_references"]) = table_figure_reference_check(doc, doc_type)
        report_args["title_style_valid"], report_args["incorrect_titles"] = document_title_check(doc, doc_type)
        report_args["double_period_valid"], report_args["incorrect_sentences"] = double_period_check(doc)
        report_args["spacing_valid"], report_args["incorrect_spacing"] = spacing_check(doc)

        # Checks that return only a list of issues.
        report_args["abbreviation_issues"] = check_abbreviation_usage(doc)
        report_args["date_issues"] = check_date_formats(doc)
        report_args["placeholder_issues"] = check_placeholders(doc)

        return format_results_for_gradio(**report_args)

    except Exception as e:
        print(f"Error in process_document: {str(e)}")
        raise

def get_document_checks(doc_type, template_type):
    """Return the required headings for a document type and template.

    Advisory Circulars use the short heading list when ``template_type`` is
    exactly "Short AC template AC" and the long list otherwise. Every other
    document type has no required headings.

    Args:
        doc_type: Document type name selected by the user.
        template_type: Template name selected by the user.

    Returns:
        Dict with a single ``"required_headings"`` key holding a list of
        heading titles.
    """
    short_template_headings = [
        "Purpose", "Applicability", "Related Reading Material",
        "Background", "Discussion",
    ]
    long_template_headings = [
        "Purpose", "Applicability", "Audience", "Related Reading Material",
        "Background", "Discussion", "Conclusion",
    ]

    # Add other document types as needed
    required = []
    if doc_type == "Advisory Circular":
        use_short = template_type == "Short AC template AC"
        required = short_template_headings if use_short else long_template_headings

    return {"required_headings": required}

def _append_check_section(lines, title, passed, ok_message, fail_header,
                          detail_lines, trailing_blank=True):
    """Append one report section to ``lines``.

    ``detail_lines`` is a zero-argument callable that is only invoked when
    ``passed`` is falsy, so detail data is never touched for a passing check.
    """
    lines.append(title)
    if passed:
        lines.append(ok_message)
    else:
        lines.append(fail_header)
        lines.extend(detail_lines())
    if trailing_blank:
        lines.append("")


def format_results_for_gradio(**kwargs):
    """Format the check results as a Markdown report.

    Keyword Args:
        heading_valid, headings_found, required_headings: Heading check data.
        acronyms_valid, undefined_acronyms: Acronym check data.
        legal_valid, incorrect_legal_references: ``(incorrect, correct)``
            term pairs.
        table_valid, incorrect_captions: Table caption data.
        figure_valid, incorrect_fig_captions: Figure caption data.
        references_valid, incorrect_table_figure_references: Reference data.
        title_style_valid, incorrect_titles: Dicts with ``"text"`` and
            ``"issue"`` keys.
        doc_type: Optional; selects the title-formatting note.
        double_period_valid, incorrect_sentences: Double-period data.
        spacing_valid, incorrect_spacing: Spacing data.
        date_issues: ``(date, paragraph)`` pairs; empty means the check passed.
        placeholder_issues: ``(phrase, paragraph)`` pairs; empty means passed.

    Any other keyword (for example ``abbreviation_issues``) is accepted and
    not rendered. Detail lists are only read for checks that failed.

    Returns:
        The report as a single newline-joined string.
    """
    results = []
    results.append("# Document Check Results\n")

    def bullets(key):
        # Deferred so the list is only read when the section has failed.
        return lambda: [f"- {item}" for item in kwargs[key]]

    def missing_heading_lines():
        # List missing headings in the order they are required. The previous
        # set difference made the order vary from run to run.
        found = set(kwargs['headings_found'])
        return [
            f"- {heading}"
            for heading in dict.fromkeys(kwargs['required_headings'])
            if heading not in found
        ]

    def title_lines():
        lines = []
        for title in kwargs['incorrect_titles']:
            lines.append(f"- {title['text']}")
            lines.append(f"  - Issue: {title['issue']}")

        # Add formatting guidance
        formatting_notes = {
            "Advisory Circular": "Document titles should be italicized, not in quotation marks.",
            "Order": "Document titles should be in quotation marks, not italicized.",
            "Federal Register Notice": "Document titles should be in quotation marks, not italicized.",
            "Policy Statement": "Document titles should not have any special formatting (no italics, no quotation marks)."
        }

        doc_type = kwargs.get('doc_type', 'Unknown')
        if doc_type in formatting_notes:
            lines.append(f"\nNote: {formatting_notes[doc_type]}")
        else:
            lines.append("\nNote: Please verify the correct formatting style for this document type.")
        return lines

    # Required Headings Check
    _append_check_section(
        results, "## Required Headings Check", kwargs['heading_valid'],
        "✅ All required headings are present.\n",
        "❌ Missing Required Headings:",
        missing_heading_lines,
    )

    # Acronym Check
    _append_check_section(
        results, "## Acronym Check", kwargs['acronyms_valid'],
        "✅ All acronyms are properly defined.\n",
        "❌ The following acronyms need to be defined at first use:",
        bullets('undefined_acronyms'),
    )

    # Legal Check
    _append_check_section(
        results, "## Legal Terminology Check", kwargs['legal_valid'],
        "✅ All legal references are properly formatted.\n",
        "❌ Incorrect Legal Terminology:",
        lambda: [
            f"- Use '{correct_term}' instead of '{incorrect_term}'"
            for incorrect_term, correct_term in kwargs['incorrect_legal_references']
        ],
    )

    # Table Caption Check
    _append_check_section(
        results, "## Table Caption Check", kwargs['table_valid'],
        "✅ All table captions are correctly formatted.\n",
        "❌ Incorrect Table Captions:",
        bullets('incorrect_captions'),
    )

    # Figure Caption Check
    _append_check_section(
        results, "## Figure Caption Check", kwargs['figure_valid'],
        "✅ All figure captions are correctly formatted.\n",
        "❌ Incorrect Figure Captions:",
        bullets('incorrect_fig_captions'),
    )

    # Table and Figure References Check
    _append_check_section(
        results, "## Table and Figure References Check", kwargs['references_valid'],
        "✅ All table and figure references are correctly formatted.\n",
        "❌ Incorrect Table/Figure References:",
        bullets('incorrect_table_figure_references'),
    )

    # Document Title Style Check
    _append_check_section(
        results, "## Document Title Style Check", kwargs['title_style_valid'],
        "✅ All document title references are properly styled.\n",
        "❌ Incorrect Document Title Styling:",
        title_lines,
    )

    # Double Period Check
    _append_check_section(
        results, "## Double Period Check", kwargs['double_period_valid'],
        "✅ No double periods found.\n",
        "❌ Sentences found with double periods:",
        bullets('incorrect_sentences'),
    )

    # Spacing Check
    _append_check_section(
        results, "## Spacing Check", kwargs['spacing_valid'],
        "✅ All spacing is correct.\n",
        "❌ Incorrect spacing found in:",
        bullets('incorrect_spacing'),
    )

    # Date Format Consistency
    _append_check_section(
        results, "## Date Format Consistency", not kwargs['date_issues'],
        "✅ All dates are in the correct format.\n",
        "❌ Date Format Issues:",
        lambda: [
            f"- Incorrect date format '{date}' in: {paragraph}"
            for date, paragraph in kwargs['date_issues']
        ],
    )

    # Placeholder Check (last section: no blank separator after it)
    _append_check_section(
        results, "## Placeholder Check", not kwargs['placeholder_issues'],
        "✅ No placeholders found.\n",
        "❌ Placeholders Found:",
        lambda: [
            f"- Placeholder '{phrase}' in: {paragraph}"
            for phrase, paragraph in kwargs['placeholder_issues']
        ],
        trailing_blank=False,
    )

    return "\n".join(results)

def process_file(file_obj, doc_type, template_type):
    """Check an uploaded document and return text for display.

    Args:
        file_obj: ``None``, raw ``bytes``, or an object with a ``read()``
            method that returns the document bytes.
        doc_type: Document type name selected by the user.
        template_type: Template name selected by the user.

    Returns:
        The report produced by ``process_document``. Returns a prompt to
        upload a file when ``file_obj`` is ``None``, or a user-facing error
        message when anything raises. The error text is also printed.
    """
    if file_obj is None:
        return "Please upload a document first."

    try:
        if isinstance(file_obj, bytes):
            raw = file_obj
        else:
            raw = file_obj.read()

        # Wrap the bytes in an in-memory stream for the document loader.
        return process_document(io.BytesIO(raw), doc_type, template_type)

    except Exception as e:
        reason = str(e)
        message_lines = [
            "An error occurred while processing the document:",
            "        ",
            f"Error: {reason}",
            "",
            "Please ensure:",
            "1. The file is a valid Word document (.docx)",
            "2. The file is not corrupted",
            "3. The file is not password protected",
            "",
            f"Technical details: {reason}",
        ]
        print(f"Error processing file: {reason}")
        return "\n".join(message_lines)

# Create the Gradio interface
# `demo` is the top-level Blocks container. Every component created inside
# the `with demo:` block below is attached to it, so the creation order
# determines the page layout.
demo = gr.Blocks(theme='JohnSmith9982/small_and_pretty')

with demo:
    # Page heading and one-line description.
    gr.Markdown("# FAA Document Checker")
    gr.Markdown("Upload a Word document to check for compliance with FAA documentation standards.")
    
    # Choices for the "Document Type" dropdown. Only some of these names are
    # matched by the check functions; the rest currently get no
    # type-specific rules.
    document_types = [
        "Advisory Circular", "Airworthiness Criteria", "Deviation Memo", "Exemption", 
        "Federal Register Notice", "Handbook/Manual", "Order", "Policy Statement", 
        "Rule", "Special Condition", "Technical Standard Order", "Other"
    ]
    
    # Template names. get_document_checks compares against the first string
    # exactly, so these literals must stay in sync with it.
    template_types = ["Short AC template AC", "Long AC template AC"]
    
    with gr.Row():
        # Left column: inputs and the submit button.
        with gr.Column(scale=1):
            # NOTE(review): type="binary" presumably hands the upload to
            # process_file as raw bytes (process_file also accepts objects
            # with read()) - confirm against the Gradio File documentation.
            file_input = gr.File(
                label="Upload Word Document (.docx)",
                file_types=[".docx"],
                type="binary"
            )
            doc_type = gr.Dropdown(
                choices=document_types,
                label="Document Type",
                value="Advisory Circular"
            )
            # Shown initially because the default document type is
            # "Advisory Circular"; visibility is updated by the change
            # handler registered below.
            template_type = gr.Radio(
                choices=template_types,
                label="Template Type (Only for Advisory Circular)",
                visible=True,
                value="Short AC template AC"
            )
            submit_btn = gr.Button("Check Document", variant="primary")
        
        # Right column: Markdown area that displays the report string.
        with gr.Column(scale=2):
            output = gr.Markdown(
                label="Check Results",
                value="Results will appear here after processing..."
            )
    
    # Update template type visibility based on document type
    def update_template_visibility(doc_type):
        """Return an update that shows the template selector only for Advisory Circulars."""
        return gr.update(visible=doc_type == "Advisory Circular")
    
    doc_type.change(
        fn=update_template_visibility,
        inputs=[doc_type],
        outputs=[template_type]
    )
    
    # Process file when submit button is clicked
    # Inputs are passed positionally to process_file(file_obj, doc_type,
    # template_type); its return value is written to the output area.
    submit_btn.click(
        fn=process_file,
        inputs=[file_input, doc_type, template_type],
        outputs=[output]
    )

# Launch the demo
# Runs unconditionally at module level, so importing this module also calls
# launch().
demo.launch()