Hoctar77 committed on
Commit
736fba7
·
verified ·
1 Parent(s): dc51583

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -87
app.py CHANGED
@@ -627,96 +627,100 @@ class FAADocumentChecker(DocumentChecker):
627
  return DocumentCheckResult(success=success, issues=incorrect_references)
628
 
629
  @profile_performance
630
- def document_title_check(self, doc_path: str, doc_type: str) -> DocumentCheckResult:
631
  """Check for correct formatting of document titles."""
632
  try:
633
- doc = Document(doc_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634
  except Exception as e:
635
- self.logger.error(f"Error reading the document in title check: {e}")
636
- return DocumentCheckResult(success=False, issues=[{'error': str(e)}])
637
-
638
- incorrect_titles = []
639
-
640
- # Define formatting rules for different document types
641
- formatting_rules = {
642
- "Advisory Circular": {"italics": True, "quotes": False},
643
- "Airworthiness Criteria": {"italics": False, "quotes": True},
644
- "Deviation Memo": {"italics": False, "quotes": True},
645
- "Exemption": {"italics": False, "quotes": True},
646
- "Federal Register Notice": {"italics": False, "quotes": True},
647
- "Order": {"italics": False, "quotes": True},
648
- "Policy Statement": {"italics": False, "quotes": False},
649
- "Rule": {"italics": False, "quotes": True},
650
- "Special Condition": {"italics": False, "quotes": True},
651
- "Technical Standard Order": {"italics": False, "quotes": True},
652
- "Other": {"italics": False, "quotes": False}
653
- }
654
-
655
- if doc_type not in formatting_rules:
656
- self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
657
- return DocumentCheckResult(success=True, issues=[])
658
-
659
- required_format = formatting_rules[doc_type]
660
-
661
- ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')
662
-
663
- for paragraph in doc.paragraphs:
664
- text = paragraph.text
665
- matches = ac_pattern.finditer(text)
666
-
667
- for match in matches:
668
- full_match = match.group(0)
669
- title_text = match.group(2).strip()
670
-
671
- # Get the position where the title starts
672
- title_start = match.start(2)
673
- title_end = match.end(2)
674
-
675
- # Check for any type of quotation marks, including smart quotes
676
- title_in_quotes = any(q in title_text for q in ['"', "'", '“', '”', '‘', '’'])
677
-
678
- # Check the formatting of the title
679
- title_is_italicized = False
680
- current_pos = 0
681
- for run in paragraph.runs:
682
- run_length = len(run.text)
683
- run_start = current_pos
684
- run_end = current_pos + run_length
685
- if run_start <= title_start < run_end:
686
- title_is_italicized = run.italic
687
- break
688
- current_pos += run_length
689
-
690
- # Check if formatting matches the required format
691
- formatting_incorrect = False
692
- issue_message = []
693
-
694
- # Check italics requirement
695
- if required_format["italics"] and not title_is_italicized:
696
- formatting_incorrect = True
697
- issue_message.append("should be italicized")
698
- elif not required_format["italics"] and title_is_italicized:
699
- formatting_incorrect = True
700
- issue_message.append("should not be italicized")
701
-
702
- # Check quotes requirement
703
- if required_format["quotes"] and not title_in_quotes:
704
- formatting_incorrect = True
705
- issue_message.append("should be in quotes")
706
- elif not required_format["quotes"] and title_in_quotes:
707
- formatting_incorrect = True
708
- issue_message.append("should not be in quotes")
709
-
710
- if formatting_incorrect:
711
- incorrect_titles.append({
712
- 'text': title_text,
713
- 'issue': ', '.join(issue_message),
714
- 'sentence': text.strip()
715
- })
716
-
717
- success = len(incorrect_titles) == 0
718
-
719
- return DocumentCheckResult(success=success, issues=incorrect_titles)
720
 
721
  @profile_performance
722
  def double_period_check(self, doc: List[str]) -> DocumentCheckResult:
@@ -933,10 +937,17 @@ class FAADocumentChecker(DocumentChecker):
933
  def process_document(file_obj, doc_type, template_type):
934
  """Process the document and run all checks."""
935
  try:
 
 
 
 
936
  checker = FAADocumentChecker()
937
  doc = Document(file_obj)
938
  paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
939
 
 
 
 
940
  # Run all checks
941
  results = {}
942
  results['heading_check'] = checker.heading_title_check(paragraphs, doc_type)
@@ -957,6 +968,7 @@ def process_document(file_obj, doc_type, template_type):
957
  return format_results_for_gradio(results, doc_type)
958
  except Exception as e:
959
  print(f"Error in process_document: {str(e)}")
 
960
  return f"An error occurred while processing the document: {str(e)}"
961
 
962
  def format_results_for_gradio(results: Dict[str, DocumentCheckResult], doc_type: str) -> str:
 
627
  return DocumentCheckResult(success=success, issues=incorrect_references)
628
 
629
  @profile_performance
630
def document_title_check(self, doc_path, doc_type: str) -> DocumentCheckResult:
    """Check that referenced AC titles are formatted as ``doc_type`` requires.

    Every paragraph is scanned for references of the form
    ``AC <number>[-<number>]`` followed by a title. The title runs up to the
    next period, comma, or end of the paragraph text. Each title is then
    compared with the italics/quotes rule configured for ``doc_type``.

    Args:
        doc_path: Source of the .docx document. May be a path string, raw
            document bytes (wrapped in ``io.BytesIO`` here), or any
            file-like object exposing ``read``.
        doc_type: Document type name used to select the formatting rule,
            e.g. ``"Advisory Circular"`` or ``"Order"``.

    Returns:
        A ``DocumentCheckResult``. ``success`` is True when no title violates
        the rule, or when ``doc_type`` has no rule and the check is skipped.
        Each issue is a dict with ``text``, ``issue`` and ``sentence`` keys.
        If the input type is unsupported or any exception is raised,
        ``success`` is False and ``issues`` holds a single ``{'error': ...}``
        entry.
    """
    try:
        # Raw bytes are wrapped so the loader always receives a path string
        # or a file-like object.
        if isinstance(doc_path, (bytes, bytearray)):
            doc_source = io.BytesIO(doc_path)
        elif isinstance(doc_path, str) or hasattr(doc_path, 'read'):
            doc_source = doc_path
        else:
            return DocumentCheckResult(
                success=False,
                issues=[{'error': 'Invalid document input type'}]
            )

        doc = Document(doc_source)

        # Required title formatting for each supported document type.
        formatting_rules = {
            "Advisory Circular": {"italics": True, "quotes": False},
            "Airworthiness Criteria": {"italics": False, "quotes": True},
            "Deviation Memo": {"italics": False, "quotes": True},
            "Exemption": {"italics": False, "quotes": True},
            "Federal Register Notice": {"italics": False, "quotes": True},
            "Order": {"italics": False, "quotes": True},
            "Policy Statement": {"italics": False, "quotes": False},
            "Rule": {"italics": False, "quotes": True},
            "Special Condition": {"italics": False, "quotes": True},
            "Technical Standard Order": {"italics": False, "quotes": True},
            "Other": {"italics": False, "quotes": False}
        }

        if doc_type not in formatting_rules:
            self.logger.warning(f"Unsupported document type: {doc_type}. Skipping title check.")
            return DocumentCheckResult(success=True, issues=[])

        required_format = formatting_rules[doc_type]

        # Group 1 is the "AC nn-nn" prefix plus separators; group 2 is the title.
        ac_pattern = re.compile(r'(AC\s+\d+(?:-\d+)?(?:,|\s)+)(.+?)(?=\.|,|$)')

        # Straight and curly quotation marks. The curly ones are written as
        # escapes so editors or copy/paste cannot flatten them to ASCII.
        quote_chars = ('"', "'", '\u201c', '\u201d', '\u2018', '\u2019')

        incorrect_titles = []

        for paragraph in doc.paragraphs:
            text = paragraph.text

            for match in ac_pattern.finditer(text):
                title_text = match.group(2).strip()
                title_start = match.start(2)
                title_in_quotes = any(q in title_text for q in quote_chars)

                # Find the run holding the first character of the title; only
                # that run's italic setting is inspected.
                # NOTE(review): assumes paragraph.text equals the run texts
                # concatenated in order - confirm for paragraphs with hyperlinks.
                title_is_italicized = False
                current_pos = 0
                for run in paragraph.runs:
                    run_end = current_pos + len(run.text)
                    if current_pos <= title_start < run_end:
                        # run.italic may be None (inherited); treat as not italic.
                        title_is_italicized = bool(run.italic)
                        break
                    current_pos = run_end

                issue_message = []

                if required_format["italics"] and not title_is_italicized:
                    issue_message.append("should be italicized")
                elif not required_format["italics"] and title_is_italicized:
                    issue_message.append("should not be italicized")

                if required_format["quotes"] and not title_in_quotes:
                    issue_message.append("should be in quotes")
                elif not required_format["quotes"] and title_in_quotes:
                    issue_message.append("should not be in quotes")

                if issue_message:
                    incorrect_titles.append({
                        'text': title_text,
                        'issue': ', '.join(issue_message),
                        'sentence': text.strip()
                    })

        return DocumentCheckResult(
            success=not incorrect_titles,
            issues=incorrect_titles
        )

    except Exception as e:
        # Top-level boundary for this check: report the failure as a result
        # rather than letting one check abort the whole document run.
        self.logger.error(f"Error in document_title_check: {e}")
        return DocumentCheckResult(
            success=False,
            issues=[{'error': str(e)}]
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
724
 
725
  @profile_performance
726
  def double_period_check(self, doc: List[str]) -> DocumentCheckResult:
 
937
  def process_document(file_obj, doc_type, template_type):
938
  """Process the document and run all checks."""
939
  try:
940
+ # Convert file object to BytesIO
941
+ if isinstance(file_obj, bytes):
942
+ file_obj = io.BytesIO(file_obj)
943
+
944
  checker = FAADocumentChecker()
945
  doc = Document(file_obj)
946
  paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
947
 
948
+ # Rewind the file object for additional processing
949
+ file_obj.seek(0)
950
+
951
  # Run all checks
952
  results = {}
953
  results['heading_check'] = checker.heading_title_check(paragraphs, doc_type)
 
968
  return format_results_for_gradio(results, doc_type)
969
  except Exception as e:
970
  print(f"Error in process_document: {str(e)}")
971
+ traceback.print_exc() # This will print the full traceback
972
  return f"An error occurred while processing the document: {str(e)}"
973
 
974
  def format_results_for_gradio(results: Dict[str, DocumentCheckResult], doc_type: str) -> str: