Spaces:

Hoctar77
/

DocumentCheckerTool

Sleeping

App Files Files Community

Hoctar77 committed on Nov 7, 2024

Commit

736fba7

verified ·

1 Parent(s): dc51583

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -87

app.py CHANGED Viewed

@@ -627,96 +627,100 @@ class FAADocumentChecker(DocumentChecker):
         return DocumentCheckResult(success=success, issues=incorrect_references)
     @profile_performance
-    def document_title_check(self, doc_path: str, doc_type: str) -> DocumentCheckResult:
         """Check for correct formatting of document titles."""
         try:
-            doc = Document(doc_path)
         except Exception as e:
-            self.logger.error(f"Error reading the document in title check: {e}")
-            return DocumentCheckResult(success=False, issues=[{'error': str(e)}])
-        incorrect_titles = []
-        # Define formatting rules for different document types
-        formatting_rules = {
-            "Advisory Circular": {"italics": True, "quotes": False},
-            "Airworthiness Criteria": {"italics": False, "quotes": True},
-            "Deviation Memo": {"italics": False, "quotes": True},
-            "Exemption": {"italics": False, "quotes": True},
-            "Federal Register Notice": {"italics": False, "quotes": True},
-            "Order": {"italics": False, "quotes": True},
-            "Policy Statement": {"italics": False, "quotes": False},
-            "Rule": {"italics": False, "quotes": True},
-            "Special Condition": {"italics": False, "quotes": True},
-            "Technical Standard Order": {"italics": False, "quotes": True},
-            "Other": {"italics": False, "quotes": False}
-        }
-        if doc_type not in formatting_rules:
-            self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
-            return DocumentCheckResult(success=True, issues=[])
-        required_format = formatting_rules[doc_type]
-        ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')
-        for paragraph in doc.paragraphs:
-            text = paragraph.text
-            matches = ac_pattern.finditer(text)
-            for match in matches:
-                full_match = match.group(0)
-                title_text = match.group(2).strip()
-                # Get the position where the title starts
-                title_start = match.start(2)
-                title_end = match.end(2)
-                # Check for any type of quotation marks, including smart quotes
-                title_in_quotes = any(q in title_text for q in ['"', "'", '“', '”', '‘', '’'])
-                # Check the formatting of the title
-                title_is_italicized = False
-                current_pos = 0
-                for run in paragraph.runs:
-                    run_length = len(run.text)
-                    run_start = current_pos
-                    run_end = current_pos + run_length
-                    if run_start <= title_start < run_end:
-                        title_is_italicized = run.italic
-                        break
-                    current_pos += run_length
-                # Check if formatting matches the required format
-                formatting_incorrect = False
-                issue_message = []
-                # Check italics requirement
-                if required_format["italics"] and not title_is_italicized:
-                    formatting_incorrect = True
-                    issue_message.append("should be italicized")
-                elif not required_format["italics"] and title_is_italicized:
-                    formatting_incorrect = True
-                    issue_message.append("should not be italicized")
-                # Check quotes requirement
-                if required_format["quotes"] and not title_in_quotes:
-                    formatting_incorrect = True
-                    issue_message.append("should be in quotes")
-                elif not required_format["quotes"] and title_in_quotes:
-                    formatting_incorrect = True
-                    issue_message.append("should not be in quotes")
-                if formatting_incorrect:
-                    incorrect_titles.append({
-                        'text': title_text,
-                        'issue': ', '.join(issue_message),
-                        'sentence': text.strip()
-                    })
-        success = len(incorrect_titles) == 0
-        return DocumentCheckResult(success=success, issues=incorrect_titles)
     @profile_performance
     def double_period_check(self, doc: List[str]) -> DocumentCheckResult:
@@ -933,10 +937,17 @@ class FAADocumentChecker(DocumentChecker):
 def process_document(file_obj, doc_type, template_type):
     """Process the document and run all checks."""
     try:
         checker = FAADocumentChecker()
         doc = Document(file_obj)
         paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
         # Run all checks
         results = {}
         results['heading_check'] = checker.heading_title_check(paragraphs, doc_type)
@@ -957,6 +968,7 @@ def process_document(file_obj, doc_type, template_type):
         return format_results_for_gradio(results, doc_type)
     except Exception as e:
         print(f"Error in process_document: {str(e)}")
         return f"An error occurred while processing the document: {str(e)}"
 def format_results_for_gradio(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:

         return DocumentCheckResult(success=success, issues=incorrect_references)
     @profile_performance
+    def document_title_check(self, doc_path, doc_type: str) -> DocumentCheckResult:
         """Check for correct formatting of document titles."""
         try:
+            # Handle both file paths and BytesIO objects
+            if isinstance(doc_path, (str, bytes, io.BytesIO)):
+                doc = Document(doc_path)
+            else:
+                return DocumentCheckResult(
+                    success=False,
+                    issues=[{'error': 'Invalid document input type'}]
+                )
+            # Rest of the method remains the same
+            incorrect_titles = []
+            # Define formatting rules for different document types
+            formatting_rules = {
+                "Advisory Circular": {"italics": True, "quotes": False},
+                "Airworthiness Criteria": {"italics": False, "quotes": True},
+                "Deviation Memo": {"italics": False, "quotes": True},
+                "Exemption": {"italics": False, "quotes": True},
+                "Federal Register Notice": {"italics": False, "quotes": True},
+                "Order": {"italics": False, "quotes": True},
+                "Policy Statement": {"italics": False, "quotes": False},
+                "Rule": {"italics": False, "quotes": True},
+                "Special Condition": {"italics": False, "quotes": True},
+                "Technical Standard Order": {"italics": False, "quotes": True},
+                "Other": {"italics": False, "quotes": False}
+            }
+            if doc_type not in formatting_rules:
+                self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
+                return DocumentCheckResult(success=True, issues=[])
+            required_format = formatting_rules[doc_type]
+            ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')
+            for paragraph in doc.paragraphs:
+                text = paragraph.text
+                matches = ac_pattern.finditer(text)
+                for match in matches:
+                    full_match = match.group(0)
+                    title_text = match.group(2).strip()
+                    title_start = match.start(2)
+                    title_end = match.end(2)
+                    title_in_quotes = any(q in title_text for q in ['"', "'", '“', '”', '‘', '’'])
+                    title_is_italicized = False
+                    current_pos = 0
+                    for run in paragraph.runs:
+                        run_length = len(run.text)
+                        run_start = current_pos
+                        run_end = current_pos + run_length
+                        if run_start <= title_start < run_end:
+                            title_is_italicized = run.italic
+                            break
+                        current_pos += run_length
+                    formatting_incorrect = False
+                    issue_message = []
+                    if required_format["italics"] and not title_is_italicized:
+                        formatting_incorrect = True
+                        issue_message.append("should be italicized")
+                    elif not required_format["italics"] and title_is_italicized:
+                        formatting_incorrect = True
+                        issue_message.append("should not be italicized")
+                    if required_format["quotes"] and not title_in_quotes:
+                        formatting_incorrect = True
+                        issue_message.append("should be in quotes")
+                    elif not required_format["quotes"] and title_in_quotes:
+                        formatting_incorrect = True
+                        issue_message.append("should not be in quotes")
+                    if formatting_incorrect:
+                        incorrect_titles.append({
+                            'text': title_text,
+                            'issue': ', '.join(issue_message),
+                            'sentence': text.strip()
+                        })
+            return DocumentCheckResult(
+                success=len(incorrect_titles) == 0,
+                issues=incorrect_titles
+            )
         except Exception as e:
+            self.logger.error(f"Error in document_title_check: {e}")
+            return DocumentCheckResult(
+                success=False,
+                issues=[{'error': str(e)}]
+            )
     @profile_performance
     def double_period_check(self, doc: List[str]) -> DocumentCheckResult:
 def process_document(file_obj, doc_type, template_type):
     """Process the document and run all checks."""
     try:
+        # Convert file object to BytesIO
+        if isinstance(file_obj, bytes):
+            file_obj = io.BytesIO(file_obj)
         checker = FAADocumentChecker()
         doc = Document(file_obj)
         paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
+        # Rewind the file object for additional processing
+        file_obj.seek(0)
         # Run all checks
         results = {}
         results['heading_check'] = checker.heading_title_check(paragraphs, doc_type)
         return format_results_for_gradio(results, doc_type)
     except Exception as e:
         print(f"Error in process_document: {str(e)}")
+        traceback.print_exc()  # This will print the full traceback
         return f"An error occurred while processing the document: {str(e)}"
 def format_results_for_gradio(results: Dict[str, DocumentCheckResult], doc_type: str) -> str: