Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -627,96 +627,100 @@ class FAADocumentChecker(DocumentChecker):
|
|
627 |
return DocumentCheckResult(success=success, issues=incorrect_references)
|
628 |
|
629 |
@profile_performance
|
630 |
-
def document_title_check(self, doc_path
|
631 |
"""Check for correct formatting of document titles."""
|
632 |
try:
|
633 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
634 |
except Exception as e:
|
635 |
-
self.logger.error(f"Error
|
636 |
-
return DocumentCheckResult(
|
637 |
-
|
638 |
-
|
639 |
-
|
640 |
-
# Define formatting rules for different document types
|
641 |
-
formatting_rules = {
|
642 |
-
"Advisory Circular": {"italics": True, "quotes": False},
|
643 |
-
"Airworthiness Criteria": {"italics": False, "quotes": True},
|
644 |
-
"Deviation Memo": {"italics": False, "quotes": True},
|
645 |
-
"Exemption": {"italics": False, "quotes": True},
|
646 |
-
"Federal Register Notice": {"italics": False, "quotes": True},
|
647 |
-
"Order": {"italics": False, "quotes": True},
|
648 |
-
"Policy Statement": {"italics": False, "quotes": False},
|
649 |
-
"Rule": {"italics": False, "quotes": True},
|
650 |
-
"Special Condition": {"italics": False, "quotes": True},
|
651 |
-
"Technical Standard Order": {"italics": False, "quotes": True},
|
652 |
-
"Other": {"italics": False, "quotes": False}
|
653 |
-
}
|
654 |
-
|
655 |
-
if doc_type not in formatting_rules:
|
656 |
-
self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
|
657 |
-
return DocumentCheckResult(success=True, issues=[])
|
658 |
-
|
659 |
-
required_format = formatting_rules[doc_type]
|
660 |
-
|
661 |
-
ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')
|
662 |
-
|
663 |
-
for paragraph in doc.paragraphs:
|
664 |
-
text = paragraph.text
|
665 |
-
matches = ac_pattern.finditer(text)
|
666 |
-
|
667 |
-
for match in matches:
|
668 |
-
full_match = match.group(0)
|
669 |
-
title_text = match.group(2).strip()
|
670 |
-
|
671 |
-
# Get the position where the title starts
|
672 |
-
title_start = match.start(2)
|
673 |
-
title_end = match.end(2)
|
674 |
-
|
675 |
-
# Check for any type of quotation marks, including smart quotes
|
676 |
-
title_in_quotes = any(q in title_text for q in ['"', "'", '“', '”', '‘', '’'])
|
677 |
-
|
678 |
-
# Check the formatting of the title
|
679 |
-
title_is_italicized = False
|
680 |
-
current_pos = 0
|
681 |
-
for run in paragraph.runs:
|
682 |
-
run_length = len(run.text)
|
683 |
-
run_start = current_pos
|
684 |
-
run_end = current_pos + run_length
|
685 |
-
if run_start <= title_start < run_end:
|
686 |
-
title_is_italicized = run.italic
|
687 |
-
break
|
688 |
-
current_pos += run_length
|
689 |
-
|
690 |
-
# Check if formatting matches the required format
|
691 |
-
formatting_incorrect = False
|
692 |
-
issue_message = []
|
693 |
-
|
694 |
-
# Check italics requirement
|
695 |
-
if required_format["italics"] and not title_is_italicized:
|
696 |
-
formatting_incorrect = True
|
697 |
-
issue_message.append("should be italicized")
|
698 |
-
elif not required_format["italics"] and title_is_italicized:
|
699 |
-
formatting_incorrect = True
|
700 |
-
issue_message.append("should not be italicized")
|
701 |
-
|
702 |
-
# Check quotes requirement
|
703 |
-
if required_format["quotes"] and not title_in_quotes:
|
704 |
-
formatting_incorrect = True
|
705 |
-
issue_message.append("should be in quotes")
|
706 |
-
elif not required_format["quotes"] and title_in_quotes:
|
707 |
-
formatting_incorrect = True
|
708 |
-
issue_message.append("should not be in quotes")
|
709 |
-
|
710 |
-
if formatting_incorrect:
|
711 |
-
incorrect_titles.append({
|
712 |
-
'text': title_text,
|
713 |
-
'issue': ', '.join(issue_message),
|
714 |
-
'sentence': text.strip()
|
715 |
-
})
|
716 |
-
|
717 |
-
success = len(incorrect_titles) == 0
|
718 |
-
|
719 |
-
return DocumentCheckResult(success=success, issues=incorrect_titles)
|
720 |
|
721 |
@profile_performance
|
722 |
def double_period_check(self, doc: List[str]) -> DocumentCheckResult:
|
@@ -933,10 +937,17 @@ class FAADocumentChecker(DocumentChecker):
|
|
933 |
def process_document(file_obj, doc_type, template_type):
|
934 |
"""Process the document and run all checks."""
|
935 |
try:
|
|
|
|
|
|
|
|
|
936 |
checker = FAADocumentChecker()
|
937 |
doc = Document(file_obj)
|
938 |
paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
|
939 |
|
|
|
|
|
|
|
940 |
# Run all checks
|
941 |
results = {}
|
942 |
results['heading_check'] = checker.heading_title_check(paragraphs, doc_type)
|
@@ -957,6 +968,7 @@ def process_document(file_obj, doc_type, template_type):
|
|
957 |
return format_results_for_gradio(results, doc_type)
|
958 |
except Exception as e:
|
959 |
print(f"Error in process_document: {str(e)}")
|
|
|
960 |
return f"An error occurred while processing the document: {str(e)}"
|
961 |
|
962 |
def format_results_for_gradio(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
|
|
|
627 |
return DocumentCheckResult(success=success, issues=incorrect_references)
|
628 |
|
629 |
@profile_performance
|
630 |
+
def document_title_check(self, doc_path, doc_type: str) -> DocumentCheckResult:
    """Check the formatting of titles that follow "AC <number>" references.

    Every paragraph is scanned for an "AC <number>" reference; the text
    captured after it (up to the next period, comma, or end of paragraph)
    is treated as the title and compared with the italics/quotes rule for
    ``doc_type``.

    Args:
        doc_path: Document source handed to ``Document``: a file path
            (``str``), the raw file content (``bytes``), or a binary
            file-like object such as ``io.BytesIO``.
        doc_type: Document type name; selects the rule from the
            ``formatting_rules`` table below.

    Returns:
        A ``DocumentCheckResult`` whose ``success`` is True when no title
        violates the rule. Each issue is a dict with ``text``, ``issue``
        and ``sentence`` keys. An unsupported ``doc_type`` is logged and
        reported as success with no issues. An unsupported ``doc_path``
        type, or any exception raised while checking, yields
        ``success=False`` with a single ``{'error': ...}`` issue.
    """
    try:
        if isinstance(doc_path, bytes):
            # Raw bytes are neither a path nor a file object, so wrap them
            # before handing them to Document.
            doc_source = io.BytesIO(doc_path)
        elif isinstance(doc_path, str) or hasattr(doc_path, "read"):
            doc_source = doc_path
        else:
            return DocumentCheckResult(
                success=False,
                issues=[{'error': 'Invalid document input type'}]
            )
        doc = Document(doc_source)

        # Define formatting rules for different document types
        formatting_rules = {
            "Advisory Circular": {"italics": True, "quotes": False},
            "Airworthiness Criteria": {"italics": False, "quotes": True},
            "Deviation Memo": {"italics": False, "quotes": True},
            "Exemption": {"italics": False, "quotes": True},
            "Federal Register Notice": {"italics": False, "quotes": True},
            "Order": {"italics": False, "quotes": True},
            "Policy Statement": {"italics": False, "quotes": False},
            "Rule": {"italics": False, "quotes": True},
            "Special Condition": {"italics": False, "quotes": True},
            "Technical Standard Order": {"italics": False, "quotes": True},
            "Other": {"italics": False, "quotes": False}
        }

        if doc_type not in formatting_rules:
            self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
            return DocumentCheckResult(success=True, issues=[])

        required_format = formatting_rules[doc_type]
        ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')

        # Straight quotes plus the typographic ones, written as escapes so
        # they cannot be flattened to ASCII by an editor or a copy/paste.
        quote_marks = ('"', "'", '\u201c', '\u201d', '\u2018', '\u2019')

        incorrect_titles = []

        for paragraph in doc.paragraphs:
            text = paragraph.text

            for match in ac_pattern.finditer(text):
                title_text = match.group(2).strip()
                title_start = match.start(2)
                title_in_quotes = any(q in title_text for q in quote_marks)

                # Only the run holding the first character of the title is
                # inspected. NOTE(review): this assumes the run texts
                # concatenate to paragraph.text - confirm for paragraphs
                # containing hyperlinks or other non-run content.
                title_is_italicized = False
                current_pos = 0
                for run in paragraph.runs:
                    run_end = current_pos + len(run.text)
                    if current_pos <= title_start < run_end:
                        # run.italic may be None; treat that as not italic.
                        title_is_italicized = bool(run.italic)
                        break
                    current_pos = run_end

                issue_message = []

                if required_format["italics"] != title_is_italicized:
                    issue_message.append(
                        "should be italicized"
                        if required_format["italics"]
                        else "should not be italicized"
                    )

                if required_format["quotes"] != title_in_quotes:
                    issue_message.append(
                        "should be in quotes"
                        if required_format["quotes"]
                        else "should not be in quotes"
                    )

                if issue_message:
                    incorrect_titles.append({
                        'text': title_text,
                        'issue': ', '.join(issue_message),
                        'sentence': text.strip()
                    })

        return DocumentCheckResult(
            success=not incorrect_titles,
            issues=incorrect_titles
        )

    except Exception as e:
        # Report the failure as a check result instead of raising.
        self.logger.error(f"Error in document_title_check: {e}")
        return DocumentCheckResult(
            success=False,
            issues=[{'error': str(e)}]
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
724 |
|
725 |
@profile_performance
|
726 |
def double_period_check(self, doc: List[str]) -> DocumentCheckResult:
|
|
|
937 |
def process_document(file_obj, doc_type, template_type):
|
938 |
"""Process the document and run all checks."""
|
939 |
try:
|
940 |
+
# Convert file object to BytesIO
|
941 |
+
if isinstance(file_obj, bytes):
|
942 |
+
file_obj = io.BytesIO(file_obj)
|
943 |
+
|
944 |
checker = FAADocumentChecker()
|
945 |
doc = Document(file_obj)
|
946 |
paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
|
947 |
|
948 |
+
# Rewind the file object for additional processing
|
949 |
+
file_obj.seek(0)
|
950 |
+
|
951 |
# Run all checks
|
952 |
results = {}
|
953 |
results['heading_check'] = checker.heading_title_check(paragraphs, doc_type)
|
|
|
968 |
return format_results_for_gradio(results, doc_type)
|
969 |
except Exception as e:
|
970 |
print(f"Error in process_document: {str(e)}")
|
971 |
+
traceback.print_exc() # This will print the full traceback
|
972 |
return f"An error occurred while processing the document: {str(e)}"
|
973 |
|
974 |
def format_results_for_gradio(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
|