Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -722,16 +722,6 @@ class FAADocumentChecker(DocumentChecker):
|
|
722 |
# Core Check Methods
|
723 |
@profile_performance
|
724 |
def heading_title_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
|
725 |
-
"""
|
726 |
-
Check headings for a specific document type.
|
727 |
-
|
728 |
-
Args:
|
729 |
-
doc (List[str]): List of document paragraphs
|
730 |
-
doc_type (str): Type of document being checked
|
731 |
-
|
732 |
-
Returns:
|
733 |
-
DocumentCheckResult: Result of heading check including found and missing headings
|
734 |
-
"""
|
735 |
if not self.validate_input(doc):
|
736 |
self.logger.error("Invalid document input for heading check")
|
737 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
@@ -758,13 +748,29 @@ class FAADocumentChecker(DocumentChecker):
|
|
758 |
headings_found = []
|
759 |
required_headings_set = set(required_headings)
|
760 |
|
761 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
762 |
for para in doc:
|
763 |
para_strip = para.strip()
|
764 |
-
#
|
765 |
-
|
766 |
-
if
|
767 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
768 |
|
769 |
# Check if all required headings are found
|
770 |
found_headings_set = set(headings_found)
|
@@ -796,6 +802,7 @@ class FAADocumentChecker(DocumentChecker):
|
|
796 |
|
797 |
return DocumentCheckResult(success=success, issues=issues, details=details)
|
798 |
|
|
|
799 |
@profile_performance
|
800 |
def heading_title_period_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
|
801 |
"""
|
@@ -907,39 +914,22 @@ class FAADocumentChecker(DocumentChecker):
|
|
907 |
|
908 |
@profile_performance
|
909 |
def acronym_check(self, doc: List[str]) -> DocumentCheckResult:
|
910 |
-
"""
|
911 |
-
Check if acronyms are defined at their first use, ignoring uppercase headings
|
912 |
-
and common exceptions.
|
913 |
-
"""
|
914 |
if not self.validate_input(doc):
|
915 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
916 |
|
917 |
-
defined_acronyms = set()
|
918 |
-
first_occurrences = {} # Track first occurrence of each acronym
|
919 |
-
undefined_acronyms = []
|
920 |
-
|
921 |
# Common words that might appear in uppercase but aren't acronyms
|
922 |
-
heading_words = {
|
923 |
-
'INFORMATION', 'GENERAL', 'SUMMARY', 'INTRODUCTION', 'BACKGROUND',
|
924 |
-
'DISCUSSION', 'CONCLUSION', 'APPENDIX', 'CHAPTER', 'SECTION',
|
925 |
-
'PURPOSE', 'APPLICABILITY', 'CANCELLATION', 'DEFINITION', 'REQUIREMENTS',
|
926 |
-
'AUTHORITY', 'POLICY', 'SCOPE', 'RELATED', 'MATERIAL', 'DISTRIBUTION',
|
927 |
-
'EXPLANATION', 'PROCEDURES', 'NOTE', 'WARNING', 'CAUTION', 'EXCEPTION',
|
928 |
-
'GROUPS', 'PARTS', 'TABLE', 'FIGURE', 'REFERENCES', 'DEFINITIONS'
|
929 |
-
}
|
930 |
|
931 |
# Standard acronyms that don't need to be defined
|
932 |
-
predefined_acronyms = {
|
933 |
-
'CFR', 'U.S.', 'USA', 'US', 'U.S.C', 'e.g.', 'i.e.', 'FAQ', 'No.', 'ZIP', 'PDF', 'SSN',
|
934 |
-
'DC', 'MA', 'WA', 'TX', 'MO'
|
935 |
-
}
|
936 |
|
937 |
-
|
|
|
|
|
|
|
938 |
|
939 |
-
#
|
940 |
defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
|
941 |
-
|
942 |
-
# Modified acronym pattern to exclude common heading patterns
|
943 |
acronym_pattern = re.compile(r'\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
|
944 |
|
945 |
for paragraph in doc:
|
@@ -948,48 +938,68 @@ class FAADocumentChecker(DocumentChecker):
|
|
948 |
if all(word.isupper() for word in words) and any(word in heading_words for word in words):
|
949 |
continue
|
950 |
|
951 |
-
# Check for definitions first
|
952 |
defined_matches = defined_pattern.findall(paragraph)
|
953 |
for full_term, acronym in defined_matches:
|
954 |
-
|
955 |
-
|
956 |
-
|
957 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
958 |
|
959 |
# Check for acronym usage
|
960 |
usage_matches = acronym_pattern.finditer(paragraph)
|
961 |
for match in usage_matches:
|
962 |
acronym = match.group()
|
963 |
-
|
|
|
|
|
|
|
|
|
964 |
# Skip if it's part of a heading or contains non-letter characters
|
965 |
-
if (acronym in heading_words or
|
966 |
any(not c.isalpha() for c in acronym) or
|
967 |
len(acronym) > 10): # Usually acronyms aren't this long
|
968 |
continue
|
969 |
|
970 |
if acronym not in defined_acronyms:
|
971 |
-
#
|
972 |
-
|
973 |
-
|
974 |
-
|
975 |
-
|
976 |
-
|
977 |
-
|
978 |
-
|
979 |
-
|
980 |
-
|
981 |
-
|
982 |
-
|
983 |
-
|
984 |
-
|
985 |
-
|
986 |
-
|
987 |
-
|
988 |
-
|
989 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
990 |
|
991 |
return DocumentCheckResult(success=success, issues=issues)
|
992 |
|
|
|
993 |
@profile_performance
|
994 |
def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
|
995 |
"""
|
@@ -1942,6 +1952,14 @@ class DocumentCheckResultsFormatter:
|
|
1942 |
subsequent_indent=' '
|
1943 |
)
|
1944 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1945 |
# Handle issues with direct sentence reference
|
1946 |
elif 'sentence' in issue:
|
1947 |
return textwrap.fill(
|
@@ -2405,7 +2423,7 @@ def create_interface():
|
|
2405 |
"""
|
2406 |
|
2407 |
# Extract issues
|
2408 |
-
issues_match = re.findall(r'•\s*(
|
2409 |
issues_html_section = ""
|
2410 |
if issues_match:
|
2411 |
issues_html_section = """
|
|
|
722 |
# Core Check Methods
|
723 |
@profile_performance
|
724 |
def heading_title_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
725 |
if not self.validate_input(doc):
|
726 |
self.logger.error("Invalid document input for heading check")
|
727 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
|
|
748 |
headings_found = []
|
749 |
required_headings_set = set(required_headings)
|
750 |
|
751 |
+
# Precompile a regex pattern to match headings at the start of the paragraph
|
752 |
+
# Escape special characters in headings and allow for optional spaces and periods
|
753 |
+
heading_patterns = []
|
754 |
+
for heading in required_headings:
|
755 |
+
escaped_heading = re.escape(heading.rstrip('.'))
|
756 |
+
pattern = rf'^\s*{escaped_heading}\.?\s*'
|
757 |
+
heading_patterns.append(pattern)
|
758 |
+
combined_pattern = re.compile('|'.join(heading_patterns), re.IGNORECASE)
|
759 |
+
|
760 |
for para in doc:
|
761 |
para_strip = para.strip()
|
762 |
+
# Check if paragraph starts with any of the required headings
|
763 |
+
match = combined_pattern.match(para_strip)
|
764 |
+
if match:
|
765 |
+
# Extract the matched heading
|
766 |
+
matched_heading = match.group().strip()
|
767 |
+
# Normalize the matched heading to compare with required headings
|
768 |
+
matched_heading_base = matched_heading.rstrip('.').strip()
|
769 |
+
# Find the exact heading from required headings (case-insensitive)
|
770 |
+
for required_heading in required_headings:
|
771 |
+
if matched_heading_base.lower() == required_heading.rstrip('.').lower():
|
772 |
+
headings_found.append(required_heading)
|
773 |
+
break
|
774 |
|
775 |
# Check if all required headings are found
|
776 |
found_headings_set = set(headings_found)
|
|
|
802 |
|
803 |
return DocumentCheckResult(success=success, issues=issues, details=details)
|
804 |
|
805 |
+
|
806 |
@profile_performance
|
807 |
def heading_title_period_check(self, doc: List[str], doc_type: str) -> DocumentCheckResult:
|
808 |
"""
|
|
|
914 |
|
915 |
@profile_performance
|
916 |
def acronym_check(self, doc: List[str]) -> DocumentCheckResult:
|
|
|
|
|
|
|
|
|
917 |
if not self.validate_input(doc):
|
918 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
919 |
|
|
|
|
|
|
|
|
|
920 |
# Common words that might appear in uppercase but aren't acronyms
|
921 |
+
heading_words = self.config_manager.config.get('heading_words', HEADING_WORDS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
922 |
|
923 |
# Standard acronyms that don't need to be defined
|
924 |
+
predefined_acronyms = self.config_manager.config.get('predefined_acronyms', PREDEFINED_ACRONYMS)
|
|
|
|
|
|
|
925 |
|
926 |
+
# Tracking structures
|
927 |
+
defined_acronyms = {} # Stores definition info
|
928 |
+
used_acronyms = set() # Stores acronyms used after definition
|
929 |
+
issues = []
|
930 |
|
931 |
+
# Patterns
|
932 |
defined_pattern = re.compile(r'\b([\w\s&]+?)\s*\((\b[A-Z]{2,}\b)\)')
|
|
|
|
|
933 |
acronym_pattern = re.compile(r'\b[A-Z]{2,}\b(?!\s*[:.]\s*)')
|
934 |
|
935 |
for paragraph in doc:
|
|
|
938 |
if all(word.isupper() for word in words) and any(word in heading_words for word in words):
|
939 |
continue
|
940 |
|
941 |
+
# Check for acronym definitions first
|
942 |
defined_matches = defined_pattern.findall(paragraph)
|
943 |
for full_term, acronym in defined_matches:
|
944 |
+
if acronym not in predefined_acronyms:
|
945 |
+
if acronym not in defined_acronyms:
|
946 |
+
defined_acronyms[acronym] = {
|
947 |
+
'full_term': full_term.strip(),
|
948 |
+
'defined_at': paragraph.strip(),
|
949 |
+
'used': False # Initially not used
|
950 |
+
}
|
951 |
+
else:
|
952 |
+
# Handle duplicate definitions if necessary
|
953 |
+
pass # You may add logic for duplicate definitions
|
954 |
|
955 |
# Check for acronym usage
|
956 |
usage_matches = acronym_pattern.finditer(paragraph)
|
957 |
for match in usage_matches:
|
958 |
acronym = match.group()
|
959 |
+
|
960 |
+
# Skip predefined acronyms
|
961 |
+
if acronym in predefined_acronyms:
|
962 |
+
continue
|
963 |
+
|
964 |
# Skip if it's part of a heading or contains non-letter characters
|
965 |
+
if (acronym in heading_words or
|
966 |
any(not c.isalpha() for c in acronym) or
|
967 |
len(acronym) > 10): # Usually acronyms aren't this long
|
968 |
continue
|
969 |
|
970 |
if acronym not in defined_acronyms:
|
971 |
+
# Undefined acronym used
|
972 |
+
issues.append({
|
973 |
+
'type': 'undefined_acronym',
|
974 |
+
'acronym': acronym,
|
975 |
+
'sentence': paragraph.strip()
|
976 |
+
})
|
977 |
+
else:
|
978 |
+
# Mark as used
|
979 |
+
defined_acronyms[acronym]['used'] = True
|
980 |
+
used_acronyms.add(acronym)
|
981 |
+
|
982 |
+
# Check for defined but unused acronyms
|
983 |
+
unused_acronyms = [
|
984 |
+
{
|
985 |
+
'type': 'unused_acronym',
|
986 |
+
'acronym': acronym,
|
987 |
+
'full_term': data['full_term'],
|
988 |
+
'defined_at': data['defined_at']
|
989 |
+
}
|
990 |
+
for acronym, data in defined_acronyms.items()
|
991 |
+
if not data['used']
|
992 |
+
]
|
993 |
+
|
994 |
+
# Combine issues
|
995 |
+
if unused_acronyms:
|
996 |
+
issues.extend(unused_acronyms)
|
997 |
+
|
998 |
+
success = len(issues) == 0
|
999 |
|
1000 |
return DocumentCheckResult(success=success, issues=issues)
|
1001 |
|
1002 |
+
|
1003 |
@profile_performance
|
1004 |
def check_terminology(self, doc: List[str]) -> DocumentCheckResult:
|
1005 |
"""
|
|
|
1952 |
subsequent_indent=' '
|
1953 |
)
|
1954 |
|
1955 |
+
# Handle unused acronym issues
|
1956 |
+
if issue.get('type') == 'unused_acronym':
|
1957 |
+
return textwrap.fill(
|
1958 |
+
f" • Acronym '{issue['acronym']}' defined but not used again after definition.",
|
1959 |
+
width=76,
|
1960 |
+
subsequent_indent=' '
|
1961 |
+
)
|
1962 |
+
|
1963 |
# Handle issues with direct sentence reference
|
1964 |
elif 'sentence' in issue:
|
1965 |
return textwrap.fill(
|
|
|
2423 |
"""
|
2424 |
|
2425 |
# Extract issues
|
2426 |
+
issues_match = re.findall(r'•\s*(.+?)(?=\n•|\Z)', content, re.DOTALL)
|
2427 |
issues_html_section = ""
|
2428 |
if issues_match:
|
2429 |
issues_html_section = """
|