Spaces:

Hoctar77
/

DocumentCheckerTool

Sleeping

App Files Files Community

Hoctar77 committed on Oct 29, 2024

Commit

436beda

verified ·

1 Parent(s): c6ba992

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -88

app.py CHANGED Viewed

@@ -5,41 +5,26 @@ from docx import Document
 import io
 import traceback
-def heading_title_check(doc, required_headings):
-    """
-    Check if required headings are present in the document.
-    Args:
-        doc (list): List of paragraph texts from the document
-        required_headings (list): List of required heading titles
-    Returns:
-        tuple: (bool, list) - (True if all headings present, list of found headings)
-    """
     headings_found = []
-    # Create a set of required headings for efficient lookup
     required_headings_set = set(required_headings)
-    for para in doc:
         para_strip = para.strip()
-        # Check if the paragraph is in the required headings list
         if para_strip in required_headings_set:
             headings_found.append(para_strip)
-    # Check if all required headings are found
     all_headings_present = set(headings_found) == required_headings_set
     return all_headings_present, headings_found
-def acronym_check(doc):
     """Check if all acronyms are defined at first use and return undefined acronyms."""
     defined_acronyms = set()  # Set to store defined acronyms
     undefined_acronyms = set()  # Set to store undefined acronyms
     acronym_pattern = re.compile(r'(\b[A-Z]{2,}\b)')  # Regex to find acronyms (2 or more uppercase letters)
     defined_pattern = re.compile(r'(\b\w+\b) \((\b[A-Z]{2,}\b)\)')  # Regex to find definitions like "Federal Aviation Administration (FAA)"
-    for paragraph in doc:
         # Check for defined acronyms
         defined_matches = defined_pattern.findall(paragraph)
         for full_term, acronym in defined_matches:
@@ -51,9 +36,10 @@ def acronym_check(doc):
             if acronym not in defined_acronyms:
                 undefined_acronyms.add(acronym)  # Add to undefined acronyms if not defined
-    return len(undefined_acronyms) == 0, undefined_acronyms  # Return True if all acronyms are defined, along with undefined acronyms
-def legal_check(doc):
     """Check for correct legal references in the document and suggest corrections.
     Args:
@@ -77,7 +63,7 @@ def legal_check(doc):
     # List to store tuples of incorrect terms and their correct versions
     incorrect_legal_references = []
-    for paragraph in doc:
         # Special handling for "Title 14" / "title 14"
         title_14_pattern = r"(?P<prefix>^|[.!?\s])\s*(?P<title>title 14|Title 14)\b"
         matches = re.finditer(title_14_pattern, paragraph)
@@ -102,7 +88,7 @@ def legal_check(doc):
     return len(incorrect_legal_references) == 0, incorrect_legal_references
-def table_caption_check(doc, doc_type):
     """
     Check for correctly formatted table captions in the document.
     Supports both numeric (Table 1-2) and alphanumeric (Table C-1) formats.
@@ -116,7 +102,7 @@ def table_caption_check(doc, doc_type):
     incorrect_captions = []
-    for paragraph in doc:
         paragraph_strip = paragraph.strip()
         if paragraph_strip.lower().startswith("table"):
             if not table_caption_pattern.match(paragraph_strip):
@@ -124,7 +110,7 @@ def table_caption_check(doc, doc_type):
     return len(incorrect_captions) == 0, incorrect_captions
-def figure_caption_check(doc, doc_type):
     """
     Check for correctly formatted figure captions in the document.
     Supports both numeric (Figure 1-2) and alphanumeric (Figure C-1) formats.
@@ -137,7 +123,7 @@ def figure_caption_check(doc, doc_type):
         figure_caption_pattern = re.compile(r'^Figure\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE)
     incorrect_fig_captions = []
-    for paragraph in doc:
         paragraph_strip = paragraph.strip()
         if paragraph_strip.lower().startswith("figure"):
             if not figure_caption_pattern.match(paragraph_strip):
@@ -145,7 +131,7 @@ def figure_caption_check(doc, doc_type):
     return len(incorrect_fig_captions) == 0, incorrect_fig_captions
-def table_figure_reference_check(doc, doc_type):
     """Check for incorrect references to tables and figures in the document."""
     incorrect_table_figure_references = []
@@ -158,7 +144,7 @@ def table_figure_reference_check(doc, doc_type):
         incorrect_table_ref_pattern = re.compile(r'\bTable\s+\d+(-\d+)?\b', re.IGNORECASE)
         incorrect_figure_ref_pattern = re.compile(r'\bFigure\s+\d+(-\d+)?\b', re.IGNORECASE)
-    for paragraph in doc:
         paragraph_strip = paragraph.strip()
         # Exclude captions
         starts_with_table_or_figure = paragraph_strip.lower().startswith('table') or paragraph_strip.lower().startswith('figure')
@@ -366,11 +352,11 @@ def get_document_checks(doc_type, template_type):
     logger.info(f"Retrieved checks: {checks}")
     return checks
-def double_period_check(doc):
     """Check for sentences that end with two periods."""
     incorrect_sentences = []
-    for paragraph in doc:
         # Split the paragraph into sentences based on common sentence-ending punctuation
         sentences = re.split(r'(?<=[.!?]) +', paragraph)
         for sentence in sentences:
@@ -379,7 +365,7 @@ def double_period_check(doc):
     return len(incorrect_sentences) == 0, incorrect_sentences  # Return True if no double periods are found, along with any incorrect sentences
-def spacing_check(doc):
     """
     Check for correct spacing in US federal regulatory documents.
     Checks for:
@@ -421,7 +407,7 @@ def spacing_check(doc):
     return len(incorrect_spacing) == 0, incorrect_spacing
-def check_prohibited_phrases(doc):
     """Check for prohibited words or phrases."""
     prohibited_phrases = [
         r'\babove\b',
@@ -430,17 +416,17 @@ def check_prohibited_phrases(doc):
         r'\bthere are\b'
     ]
     issues = []
-    for paragraph in doc:
         for phrase in prohibited_phrases:
             if re.search(phrase, paragraph, re.IGNORECASE):
                 issues.append((phrase.strip(r'\b'), paragraph.strip()))
     return issues
-def check_abbreviation_usage(doc):
     """Check for abbreviation consistency after first definition."""
     abbreviations = {}
     issues = []
-    for paragraph in doc:
         # Find definitions like "Federal Aviation Administration (FAA)"
         defined_matches = re.findall(r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)', paragraph)
         for full_term, acronym in defined_matches:
@@ -460,12 +446,12 @@ def check_abbreviation_usage(doc):
     return issues
-def check_date_formats(doc):
     """Check for inconsistent date formats."""
     date_issues = []
     correct_date_pattern = re.compile(r'\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b')
     date_pattern = re.compile(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b')  # MM/DD/YYYY
-    for paragraph in doc:
         if date_pattern.search(paragraph):
             dates = date_pattern.findall(paragraph)
             for date in dates:
@@ -473,7 +459,7 @@ def check_date_formats(doc):
                     date_issues.append((date, paragraph.strip()))
     return date_issues
-def check_placeholders(doc):
     """Check for placeholders that should be removed."""
     placeholder_phrases = [
         r'\bTBD\b',
@@ -481,48 +467,12 @@ def check_placeholders(doc):
         r'\bTo be added\b'
     ]
     issues = []
-    for paragraph in doc:
         for phrase in placeholder_phrases:
             if re.search(phrase, paragraph, re.IGNORECASE):
                 issues.append((phrase.strip(r'\b'), paragraph.strip()))
     return issues
-def process_file(file_obj, doc_type, template_type):
-    """
-    Process the uploaded file and return results with error handling
-    """
-    if file_obj is None:
-        return "Please upload a document first."
-    try:
-        # Convert bytes to BytesIO object that Document can read
-        if isinstance(file_obj, bytes):
-            doc_bytes = io.BytesIO(file_obj)
-        else:
-            doc_bytes = io.BytesIO(file_obj.read())
-        # Process the document and get results
-        results = process_document(doc_bytes, doc_type, template_type)
-        return results
-    except Exception as e:
-        error_trace = traceback.format_exc()
-        print(f"Error processing file: {str(e)}")
-        print(f"Full traceback: {error_trace}")
-        error_message = f"""An error occurred while processing the document:
-Error: {str(e)}
-Please ensure:
-1. The file is a valid Word document (.docx)
-2. The file is not corrupted
-3. The file is not password protected
-Technical details: {str(e)}"""
-        return error_message
 def process_document(file_obj, doc_type, template_type):
     """Process the document and perform checks."""
     try:
@@ -530,22 +480,25 @@ def process_document(file_obj, doc_type, template_type):
         doc = Document(file_obj)
         print("Document read successfully.")
         # Get required headings based on document type
         required_headings = get_document_checks(doc_type, template_type).get("required_headings", [])
         # Perform checks
-        heading_valid, headings_found = heading_title_check(doc, required_headings)
-        acronyms_valid, undefined_acronyms = acronym_check(doc)
-        legal_valid, incorrect_legal_references = legal_check(doc)  # Replace placeholder
-        table_valid, incorrect_captions = table_caption_check(doc, doc_type)  # Replace placeholder
-        figure_valid, incorrect_fig_captions = figure_caption_check(doc, doc_type)  # Replace placeholder
-        references_valid, incorrect_table_figure_references = table_figure_reference_check(doc, doc_type)  # Replace placeholder
-        title_style_valid, incorrect_titles = document_title_check(doc, doc_type)  # Replace placeholder
-        double_period_valid, incorrect_sentences = double_period_check(doc)  # Replace placeholder
-        spacing_valid, incorrect_spacing = spacing_check(doc)  # Replace placeholder
-        abbreviation_issues = check_abbreviation_usage(doc)  # Replace placeholder
-        date_issues = check_date_formats(doc)  # Replace placeholder
-        placeholder_issues = check_placeholders(doc)  # Replace placeholder
         # Format results
         results = format_results_for_gradio(
@@ -761,8 +714,11 @@ demo = gr.Blocks(theme='JohnSmith9982/small_and_pretty')
 with demo:
     gr.Markdown("# Document Checker Tool")
     gr.Markdown("Upload a Word (docx) document to check for compliance with U.S. federal documentation standards.")
-    gr.Markdown("### This tool is still in development")
     gr.Markdown("Contact Eric Putnam if you have questions and comments.")
     document_types = [
         "Advisory Circular", "Airworthiness Criteria", "Deviation Memo", "Exemption",

 import io
 import traceback
+def heading_title_check(paragraphs, required_headings):
     headings_found = []
     required_headings_set = set(required_headings)
+    for para in paragraphs:
         para_strip = para.strip()
         if para_strip in required_headings_set:
             headings_found.append(para_strip)
     all_headings_present = set(headings_found) == required_headings_set
     return all_headings_present, headings_found
+def acronym_check(paragraphs):
     """Check if all acronyms are defined at first use and return undefined acronyms."""
     defined_acronyms = set()  # Set to store defined acronyms
     undefined_acronyms = set()  # Set to store undefined acronyms
     acronym_pattern = re.compile(r'(\b[A-Z]{2,}\b)')  # Regex to find acronyms (2 or more uppercase letters)
     defined_pattern = re.compile(r'(\b\w+\b) \((\b[A-Z]{2,}\b)\)')  # Regex to find definitions like "Federal Aviation Administration (FAA)"
+    for paragraph in paragraphs:  # Use paragraphs here
         # Check for defined acronyms
         defined_matches = defined_pattern.findall(paragraph)
         for full_term, acronym in defined_matches:
             if acronym not in defined_acronyms:
                 undefined_acronyms.add(acronym)  # Add to undefined acronyms if not defined
+    return len(undefined_acronyms) == 0, undefined_acronyms
+def legal_check(paragraphs):
     """Check for correct legal references in the document and suggest corrections.
     Args:
     # List to store tuples of incorrect terms and their correct versions
     incorrect_legal_references = []
+    for paragraph in paragraphs:
         # Special handling for "Title 14" / "title 14"
         title_14_pattern = r"(?P<prefix>^|[.!?\s])\s*(?P<title>title 14|Title 14)\b"
         matches = re.finditer(title_14_pattern, paragraph)
     return len(incorrect_legal_references) == 0, incorrect_legal_references
+def table_caption_check(paragraphs, doc_type):
     """
     Check for correctly formatted table captions in the document.
     Supports both numeric (Table 1-2) and alphanumeric (Table C-1) formats.
     incorrect_captions = []
+    for paragraph in paragraphs:
         paragraph_strip = paragraph.strip()
         if paragraph_strip.lower().startswith("table"):
             if not table_caption_pattern.match(paragraph_strip):
     return len(incorrect_captions) == 0, incorrect_captions
+def figure_caption_check(paragraphs, doc_type):
     """
     Check for correctly formatted figure captions in the document.
     Supports both numeric (Figure 1-2) and alphanumeric (Figure C-1) formats.
         figure_caption_pattern = re.compile(r'^Figure\s+([A-Z0-9]+)[\.\s]', re.IGNORECASE)
     incorrect_fig_captions = []
+    for paragraph in paragraphs:
         paragraph_strip = paragraph.strip()
         if paragraph_strip.lower().startswith("figure"):
             if not figure_caption_pattern.match(paragraph_strip):
     return len(incorrect_fig_captions) == 0, incorrect_fig_captions
+def table_figure_reference_check(paragraphs, doc_type):
     """Check for incorrect references to tables and figures in the document."""
     incorrect_table_figure_references = []
         incorrect_table_ref_pattern = re.compile(r'\bTable\s+\d+(-\d+)?\b', re.IGNORECASE)
         incorrect_figure_ref_pattern = re.compile(r'\bFigure\s+\d+(-\d+)?\b', re.IGNORECASE)
+    for paragraph in paragraphs:
         paragraph_strip = paragraph.strip()
         # Exclude captions
         starts_with_table_or_figure = paragraph_strip.lower().startswith('table') or paragraph_strip.lower().startswith('figure')
     logger.info(f"Retrieved checks: {checks}")
     return checks
+def double_period_check(paragraphs):
     """Check for sentences that end with two periods."""
     incorrect_sentences = []
+    for paragraph in paragraphs:
         # Split the paragraph into sentences based on common sentence-ending punctuation
         sentences = re.split(r'(?<=[.!?]) +', paragraph)
         for sentence in sentences:
     return len(incorrect_sentences) == 0, incorrect_sentences  # Return True if no double periods are found, along with any incorrect sentences
+def spacing_check(paragraphs):
     """
     Check for correct spacing in US federal regulatory documents.
     Checks for:
     return len(incorrect_spacing) == 0, incorrect_spacing
+def check_prohibited_phrases(paragraphs):
     """Check for prohibited words or phrases."""
     prohibited_phrases = [
         r'\babove\b',
         r'\bthere are\b'
     ]
     issues = []
+    for paragraph in paragraphs:
         for phrase in prohibited_phrases:
             if re.search(phrase, paragraph, re.IGNORECASE):
                 issues.append((phrase.strip(r'\b'), paragraph.strip()))
     return issues
+def check_abbreviation_usage(paragraphs):
     """Check for abbreviation consistency after first definition."""
     abbreviations = {}
     issues = []
+    for paragraph in paragraphs:
         # Find definitions like "Federal Aviation Administration (FAA)"
         defined_matches = re.findall(r'\b([A-Za-z &]+)\s+\((\b[A-Z]{2,}\b)\)', paragraph)
         for full_term, acronym in defined_matches:
     return issues
+def check_date_formats(paragraphs):
     """Check for inconsistent date formats."""
     date_issues = []
     correct_date_pattern = re.compile(r'\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b')
     date_pattern = re.compile(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b')  # MM/DD/YYYY
+    for paragraph in paragraphs:
         if date_pattern.search(paragraph):
             dates = date_pattern.findall(paragraph)
             for date in dates:
                     date_issues.append((date, paragraph.strip()))
     return date_issues
+def check_placeholders(paragraphs):
     """Check for placeholders that should be removed."""
     placeholder_phrases = [
         r'\bTBD\b',
         r'\bTo be added\b'
     ]
     issues = []
+    for paragraph in paragraphs:
         for phrase in placeholder_phrases:
             if re.search(phrase, paragraph, re.IGNORECASE):
                 issues.append((phrase.strip(r'\b'), paragraph.strip()))
     return issues
 def process_document(file_obj, doc_type, template_type):
     """Process the document and perform checks."""
     try:
         doc = Document(file_obj)
         print("Document read successfully.")
+        # Extract text from each paragraph to make it iterable
+        paragraphs = [para.text for para in doc.paragraphs]
         # Get required headings based on document type
         required_headings = get_document_checks(doc_type, template_type).get("required_headings", [])
         # Perform checks
+        heading_valid, headings_found = heading_title_check(paragraphs, required_headings)
+        acronyms_valid, undefined_acronyms = acronym_check(paragraphs)
+        legal_valid, incorrect_legal_references = legal_check(paragraphs)
+        table_valid, incorrect_captions = table_caption_check(paragraphs, doc_type)
+        figure_valid, incorrect_fig_captions = figure_caption_check(paragraphs, doc_type)
+        references_valid, incorrect_table_figure_references = table_figure_reference_check(paragraphs, doc_type)
+        title_style_valid, incorrect_titles = document_title_check(file_obj, doc_type)
+        double_period_valid, incorrect_sentences = double_period_check(paragraphs)
+        spacing_valid, incorrect_spacing = spacing_check(paragraphs)
+        abbreviation_issues = check_abbreviation_usage(paragraphs)
+        date_issues = check_date_formats(paragraphs)
+        placeholder_issues = check_placeholders(paragraphs)
         # Format results
         results = format_results_for_gradio(
 with demo:
     gr.Markdown("# Document Checker Tool")
     gr.Markdown("Upload a Word (docx) document to check for compliance with U.S. federal documentation standards.")
+    gr.Markdown("*This tool is still in development and you might get false positives in your results*")
     gr.Markdown("Contact Eric Putnam if you have questions and comments.")
+    gr.Markdown("""
+    1. Upload a clean (no track changes or comments) Word file.
+    2. Choose **Check Document**.""")
     document_types = [
         "Advisory Circular", "Airworthiness Criteria", "Deviation Memo", "Exemption",