samyak152002 committed on
Commit
d07ab72
·
verified ·
1 Parent(s): 966e948

Create main_analyzer.py

Browse files
Files changed (1) hide show
  1. main_analyzer.py +130 -0
main_analyzer.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main_analyzer.py
2
+ import fitz # PyMuPDF
3
+ import os
4
+ import tempfile
5
+ import re
6
+ import traceback
7
+ from typing import Tuple, Dict, Any, List, Optional
8
+ from collections import defaultdict
9
+
10
+ # Import functions from our refactored modules
11
+ from pdf_processing import extract_pdf_text, try_map_issues_to_page_rects # convert_rect_to_dict is used by try_map_issues
12
+ from text_utils import convert_markdown_to_plain_text
13
+ from content_analysis import (
14
+ check_metadata, check_disclosures, check_figures_and_tables,
15
+ check_references_summary, check_structure, check_language_issues_and_regex,
16
+ check_figure_order, check_reference_order
17
+ )
18
+
19
+
20
def _map_issues_to_pdf_coordinates(pdf_path: str, issues: List[Dict[str, Any]]) -> None:
    """Best-effort mapping of language/regex issues onto rectangles in a PDF.

    Pages of the document at ``pdf_path`` are scanned in order. On each page,
    every issue whose ``is_mapped_to_pdf`` flag is still falsy is grouped by
    its ``context_text``; each non-blank context is searched on the page and
    any hits are passed to ``try_map_issues_to_page_rects`` together with the
    1-based page number (presumably that helper updates the issue dicts in
    place -- confirm against ``pdf_processing``). Scanning stops as soon as no
    unmapped issue remains.

    Any failure is printed and swallowed so the caller can still report the
    issues, just without coordinates.

    Args:
        pdf_path: Filesystem path of the PDF to open with PyMuPDF.
        issues: Non-empty list of issue dicts to map.
    """
    doc = None
    try:
        doc = fitz.open(pdf_path)
        if doc.page_count > 0:
            print(f"\n--- Mapping {len(issues)} Issues (filtered) to PDF Coordinates ---")
            for page_idx in range(doc.page_count):
                # Regroup on every page: issues mapped on earlier pages must
                # not be searched for again.
                unmapped_by_context = defaultdict(list)
                for issue in issues:
                    if not issue['is_mapped_to_pdf']:
                        unmapped_by_context[issue['context_text']].append(issue)

                # An empty grouping means every issue is already mapped.
                if not unmapped_by_context:
                    break

                page = doc[page_idx]
                page_number = page_idx + 1

                for ctx_str, issues_for_ctx in unmapped_by_context.items():
                    if not ctx_str.strip():
                        continue
                    try:
                        pdf_rects = page.search_for(
                            ctx_str,
                            flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE,
                        )
                        if pdf_rects:
                            try_map_issues_to_page_rects(issues_for_ctx, pdf_rects, page_number)
                    except Exception as search_exc:
                        # One bad search must not abort the remaining contexts.
                        print(f"Warning: Error searching for context '{ctx_str[:30]}' on page {page_number}: {search_exc}")

            total_mapped = sum(1 for issue in issues if issue['is_mapped_to_pdf'])
            print(f"Finished coordinate mapping. Mapped issues: {total_mapped}/{len(issues)}.")
    except Exception as e_map:
        print(f"Error during PDF coordinate mapping: {e_map}")
        traceback.print_exc()
    finally:
        # Compare against None instead of relying on truthiness: a document
        # object that supports len() is falsy when it has no pages, which
        # would skip the close.
        if doc is not None:
            doc.close()


def _format_issue_for_output(issue_data: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one internal issue dict into the JSON-friendly output shape.

    Unmapped issues get ``page`` 0 and empty ``coordinates``; mapped issues
    use the first entry of ``pdf_coordinates_list`` as ``[x0, y0, x1, y1]``.
    """
    page_num_for_json = 0
    coords_for_json: List[Any] = []
    if issue_data['is_mapped_to_pdf'] and issue_data['pdf_coordinates_list']:
        first_rect = issue_data['pdf_coordinates_list'][0]
        coords_for_json = [first_rect['x0'], first_rect['y0'], first_rect['x1'], first_rect['y1']]
        page_num_for_json = issue_data['mapped_page_number']

    return {
        "message": issue_data['message'],
        "context": issue_data['context_text'],
        "suggestions": issue_data['replacements_suggestion'],
        "category": issue_data['category_name'],
        "rule_id": issue_data['ruleId'],
        "offset": issue_data['offset_in_text'],
        "length": issue_data['error_length'],
        "coordinates": coords_for_json,
        "page": page_num_for_json,
    }


def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
    """Run all document checks on a PDF and return a combined report.

    The PDF text is extracted as Markdown, language/regex issues are computed
    from it and (best-effort) mapped to page coordinates, and the document
    level checks are run on a whitespace-normalised plain-text version.

    Args:
        filepath_or_stream: Either a ``str`` path to the PDF or an object with
            callable ``read`` (and ``seek``) from which the PDF bytes can be
            read. A stream is copied to a temporary ``.pdf`` file for
            coordinate mapping; the file is removed before returning.

    Returns:
        A ``(results, None)`` tuple. On success ``results`` has the keys
        ``"issues"`` and ``"document_checks"``; on any failure it is
        ``{"error": <message>}``. The second element is always ``None``.
    """
    temp_fitz_file_path: Optional[str] = None

    try:
        markdown_text = extract_pdf_text(filepath_or_stream)
        if not markdown_text:
            return {"error": "Failed to extract text (Markdown) from PDF."}, None

        plain_text_for_general_checks = convert_markdown_to_plain_text(markdown_text)
        cleaned_plain_text_for_regex = re.sub(r'\s+', ' ', plain_text_for_general_checks.replace('\n', ' ')).strip()

        language_and_regex_issue_report = check_language_issues_and_regex(markdown_text)
        if "error" in language_and_regex_issue_report:
            return {"error": f"Language/Regex check error: {language_and_regex_issue_report['error']}"}, None

        detailed_issues_for_mapping = language_and_regex_issue_report.get("issues_list", [])

        if detailed_issues_for_mapping:
            if isinstance(filepath_or_stream, str):
                pdf_path_for_fitz = filepath_or_stream
            elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
                filepath_or_stream.seek(0)
                # delete=False: the file must outlive the handle so it can be
                # reopened by path; it is removed in the ``finally`` below.
                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_fitz_file:
                    # Record the path before writing so a failed write still
                    # gets cleaned up.
                    temp_fitz_file_path = temp_fitz_file.name
                    temp_fitz_file.write(filepath_or_stream.read())
                pdf_path_for_fitz = temp_fitz_file_path
            else:
                return {"error": "Invalid PDF input for coordinate mapping."}, None

            _map_issues_to_pdf_coordinates(pdf_path_for_fitz, detailed_issues_for_mapping)

        final_formatted_issues_list = [
            _format_issue_for_output(issue_data) for issue_data in detailed_issues_for_mapping
        ]

        results = {
            "issues": final_formatted_issues_list,
            "document_checks": {
                "metadata": check_metadata(cleaned_plain_text_for_regex),
                "disclosures": check_disclosures(cleaned_plain_text_for_regex),
                "figures_and_tables": check_figures_and_tables(cleaned_plain_text_for_regex),
                "references_summary": check_references_summary(cleaned_plain_text_for_regex),
                "structure": check_structure(cleaned_plain_text_for_regex),
                "figure_order_analysis": check_figure_order(cleaned_plain_text_for_regex),
                "reference_order_analysis": check_reference_order(cleaned_plain_text_for_regex),
                "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_plain_text_for_regex, re.IGNORECASE)),
                "readability_issues_detected": False,
            }
        }

        return results, None

    except Exception as e:
        print(f"Overall analysis error in analyze_pdf: {e}")
        traceback.print_exc()
        return {"error": str(e)}, None
    finally:
        # Single cleanup point for the temporary copy; best-effort so that a
        # failed delete cannot replace the value being returned.
        if temp_fitz_file_path and os.path.exists(temp_fitz_file_path):
            try:
                os.unlink(temp_fitz_file_path)
            except OSError as cleanup_exc:
                print(f"Warning: could not remove temporary file {temp_fitz_file_path}: {cleanup_exc}")