SpencerCPurdy committed (verified) · Commit e8538b0 · Parent: c755587

Create app.py

Files changed (1): app.py (+723, -0)

app.py (added)
# Multi-Modal Document Intelligence System
# Author: Spencer Purdy
# Description: An advanced document analysis tool that combines LayoutLMv3 for document understanding
# with efficient language models to extract information, summarize, and answer questions about documents.
# Optimized for Google Colab Pro performance.

import subprocess
import sys
import os
import io
from typing import List, Dict, Tuple, Optional, Any
import json
import re
import hashlib
import time

# Install required packages function
def install_packages():
    """Install all required packages for the document intelligence system"""
    packages = [
        'gradio',
        'transformers',
        'torch',
        'torchvision',
        'Pillow',
        'pytesseract',
        'pdf2image',
        'opencv-python',
        'sentencepiece',
        'accelerate'
    ]

    print("Installing required packages...")
    for package in packages:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package, '-q'])

    # Install system dependencies for PDF processing and OCR
    print("Installing system dependencies...")
    subprocess.check_call(['apt-get', 'update', '-qq'])
    subprocess.check_call(['apt-get', 'install', '-y', '-qq', 'poppler-utils', 'tesseract-ocr'])

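# Note: the apt-get calls above assume a root environment such as Google
# Colab. poppler-utils provides the pdftoppm backend used by pdf2image, and
# tesseract-ocr is the engine wrapped by pytesseract; on a local machine they
# would typically be installed manually, e.g.
#   sudo apt-get install -y poppler-utils tesseract-ocr
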
# Try importing, install if needed
try:
    import gradio as gr
    from transformers import (
        AutoProcessor, AutoModelForTokenClassification,
        AutoTokenizer, AutoModelForSeq2SeqLM,
        pipeline
    )
    import torch
    from PIL import Image
    import pytesseract
    from pdf2image import convert_from_path
    import cv2
    import numpy as np
except ImportError:
    print("Installing required packages...")
    install_packages()
    # Re-import after installation
    import gradio as gr
    from transformers import (
        AutoProcessor, AutoModelForTokenClassification,
        AutoTokenizer, AutoModelForSeq2SeqLM,
        pipeline
    )
    import torch
    from PIL import Image
    import pytesseract
    from pdf2image import convert_from_path
    import cv2
    import numpy as np

# Configure device for optimal performance
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Model initialization with optimized settings
print("Loading models...")

# Load LayoutLMv3 for document structure understanding
print("Loading LayoutLMv3...")
layoutlm_processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
layoutlm_model = AutoModelForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
).to(device)
layoutlm_model.eval()  # Set to evaluation mode for faster inference

# Load efficient T5 model for text generation (much faster than Phi-2)
print("Loading T5 model for summarization and Q&A...")
t5_model_name = "google/flan-t5-base"  # 250M parameters, efficient and effective
t5_tokenizer = AutoTokenizer.from_pretrained(t5_model_name)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(
    t5_model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
).to(device)
t5_model.eval()  # Set to evaluation mode

print("Models loaded successfully!")

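# Optional sanity check (illustrative only, not executed here): once loading
# finishes, the T5 pipeline can be exercised directly, e.g.
#   ids = t5_tokenizer("Summarize: The quick brown fox jumps over the lazy dog.",
#                      return_tensors="pt").to(device)
#   out = t5_model.generate(ids.input_ids, max_length=20)
#   print(t5_tokenizer.decode(out[0], skip_special_tokens=True))
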
class DocumentProcessor:
    """
    Main document processing class that handles OCR, text extraction,
    summarization, and question answering for various document types.
    """

    def __init__(self):
        """Initialize the document processor with empty state"""
        self.extracted_text = ""
        self.document_metadata = {}
        self.page_contents = []
        self.processing_cache = {}  # Cache for processed documents

    def _get_file_hash(self, file_path: str) -> str:
        """Generate a hash for the file to use as cache key"""
        with open(file_path, 'rb') as f:
            return hashlib.md5(f.read()).hexdigest()

    def process_pdf(self, pdf_path: str, max_pages: int = 20) -> List[Image.Image]:
        """
        Convert PDF pages to images for OCR processing

        Args:
            pdf_path: Path to the PDF file
            max_pages: Maximum number of pages to process (for memory management)

        Returns:
            List of PIL Images representing PDF pages
        """
        try:
            # Convert PDF to images with resolution optimization
            images = convert_from_path(
                pdf_path,
                dpi=150,  # Balance between quality and performance
                first_page=1,
                last_page=min(max_pages, 100)  # Limit pages for memory
            )
            return images
        except Exception as e:
            print(f"Error processing PDF: {e}")
            return []

    def extract_text_from_image(self, image: Image.Image) -> Dict[str, Any]:
        """
        Extract text and layout information from an image using OCR

        Args:
            image: PIL Image to process

        Returns:
            Dictionary containing extracted text and metadata
        """
        try:
            # Resize image if too large to improve performance
            max_dimension = 2000
            if max(image.size) > max_dimension:
                ratio = max_dimension / max(image.size)
                new_size = tuple(int(dim * ratio) for dim in image.size)
                image = image.resize(new_size, Image.Resampling.LANCZOS)

            # Convert to numpy array for OCR
            image_np = np.array(image)

            # Perform OCR with Tesseract
            ocr_config = '--oem 3 --psm 6'  # Use LSTM engine with uniform block detection
            ocr_data = pytesseract.image_to_data(
                image_np,
                output_type=pytesseract.Output.DICT,
                config=ocr_config
            )

            # Extract words and bounding boxes
            words = []
            boxes = []
            confidences = []

            for i in range(len(ocr_data['text'])):
                # Filter by confidence (conf may be str or int depending on version)
                if ocr_data['text'][i].strip() and float(ocr_data['conf'][i]) > 30:
                    words.append(ocr_data['text'][i])
                    boxes.append([
                        ocr_data['left'][i],
                        ocr_data['top'][i],
                        ocr_data['left'][i] + ocr_data['width'][i],
                        ocr_data['top'][i] + ocr_data['height'][i]
                    ])
                    confidences.append(float(ocr_data['conf'][i]))

            # Join words to form complete text
            text = ' '.join(words)

            # Process with LayoutLMv3 for structure understanding (if text found)
            structured_text = text
            if words and len(words) < 400:  # Limit for performance
                try:
                    # Prepare inputs for LayoutLMv3
                    encoding = layoutlm_processor(
                        image,
                        words[:400],  # Limit words
                        boxes=boxes[:400],
                        return_tensors="pt",
                        truncation=True,
                        padding="max_length",
                        max_length=512
                    )

                    # Move to device and run inference
                    encoding = {k: v.to(device) for k, v in encoding.items()}

                    with torch.no_grad():
                        outputs = layoutlm_model(**encoding)

                    # Get predictions
                    predictions = outputs.logits.argmax(-1).squeeze().tolist()
                    if isinstance(predictions, int):
                        predictions = [predictions]

                    # Structure text based on layout
                    structured_text = self._structure_text(words[:len(predictions)], boxes[:len(predictions)])
                except Exception as e:
                    print(f"LayoutLM processing skipped: {e}")
                    structured_text = self._simple_structure_text(words, boxes)
            else:
                structured_text = self._simple_structure_text(words, boxes)

            return {
                'raw_text': text,
                'words': words,
                'boxes': boxes,
                'structured_text': structured_text,
                'num_words': len(words),
                'avg_confidence': sum(confidences) / len(confidences) if confidences else 0
            }

        except Exception as e:
            print(f"Error extracting text: {e}")
            return {
                'raw_text': "",
                'words': [],
                'boxes': [],
                'structured_text': "",
                'num_words': 0,
                'avg_confidence': 0
            }

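    # For reference, a successful call to extract_text_from_image returns a
    # dict shaped like the following (values are illustrative only):
    #   {'raw_text': 'Invoice 2024', 'words': ['Invoice', '2024'],
    #    'boxes': [[10, 5, 80, 20], [90, 5, 140, 20]],
    #    'structured_text': 'Invoice 2024', 'num_words': 2, 'avg_confidence': 91.5}
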
    def _simple_structure_text(self, words: List[str], boxes: List[List[int]]) -> str:
        """
        Simple text structuring based on spatial layout
        Groups words into lines based on vertical position
        """
        if not words:
            return ""

        # Group words by lines
        lines = []
        current_line = []
        last_y = None

        for word, box in zip(words, boxes):
            y_pos = box[1]  # Top position

            if last_y is None or abs(y_pos - last_y) < 15:  # Same line threshold
                current_line.append(word)
            else:
                if current_line:
                    lines.append(' '.join(current_line))
                current_line = [word]

            last_y = y_pos

        if current_line:
            lines.append(' '.join(current_line))

        return '\n'.join(lines)

    def _structure_text(self, words: List[str], boxes: List[List[int]]) -> str:
        """Enhanced text structuring with better line detection"""
        return self._simple_structure_text(words, boxes)

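    # Worked example of the grouping heuristic above (hypothetical boxes,
    # each [left, top, right, bottom]): consecutive words whose top
    # coordinates differ by less than 15px are joined into one line, so
    #   words = ['Total:', '$50.00', 'Due', 'Jan', '1']
    #   boxes = [[10, 100, 60, 115], [70, 101, 130, 116],
    #            [10, 140, 40, 155], [50, 141, 80, 156], [90, 140, 100, 155]]
    # yields "Total: $50.00\nDue Jan 1".
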
    def process_document(self, file_path: str) -> str:
        """
        Process any document type (PDF or image) and extract text

        Args:
            file_path: Path to the document file

        Returns:
            Status message indicating success or failure
        """
        # Reset state
        self.extracted_text = ""
        self.page_contents = []
        self.document_metadata = {
            'filename': os.path.basename(file_path),
            'pages': 0,
            'total_words': 0
        }

        # Check cache
        file_hash = self._get_file_hash(file_path)
        if file_hash in self.processing_cache:
            cached_data = self.processing_cache[file_hash]
            self.extracted_text = cached_data['text']
            self.page_contents = cached_data['pages']
            self.document_metadata = cached_data['metadata']
            return f"✅ Loaded from cache: {self.document_metadata['filename']}\n" \
                   f"📄 Pages: {self.document_metadata['pages']}\n" \
                   f"📝 Words: {self.document_metadata['total_words']}"

        try:
            start_time = time.time()

            if file_path.lower().endswith('.pdf'):
                # Process PDF document
                images = self.process_pdf(file_path)
                self.document_metadata['pages'] = len(images)

                for i, image in enumerate(images):
                    print(f"Processing page {i+1}/{len(images)}...")
                    page_data = self.extract_text_from_image(image)
                    self.page_contents.append(page_data)
                    self.extracted_text += f"\n\n--- Page {i+1} ---\n\n"
                    self.extracted_text += page_data['structured_text']
                    self.document_metadata['total_words'] += page_data['num_words']

            else:
                # Process single image
                image = Image.open(file_path).convert('RGB')
                page_data = self.extract_text_from_image(image)
                self.page_contents.append(page_data)
                self.extracted_text = page_data['structured_text']
                self.document_metadata['pages'] = 1
                self.document_metadata['total_words'] = page_data['num_words']

            # Cache the results
            self.processing_cache[file_hash] = {
                'text': self.extracted_text,
                'pages': self.page_contents,
                'metadata': self.document_metadata
            }

            processing_time = time.time() - start_time

            if self.document_metadata['total_words'] == 0:
                return f"⚠️ No text found in {self.document_metadata['filename']}. Please ensure the document contains readable text."

            return f"✅ Successfully processed {self.document_metadata['filename']}\n" \
                   f"📄 Pages: {self.document_metadata['pages']}\n" \
                   f"📝 Words extracted: {self.document_metadata['total_words']}\n" \
                   f"⏱️ Processing time: {processing_time:.1f}s"

        except Exception as e:
            return f"❌ Error processing document: {str(e)}"

    def summarize_document(self) -> str:
        """
        Generate a concise summary of the document using T5 model

        Returns:
            Document summary or error message
        """
        if not self.extracted_text:
            return "No document has been processed yet. Please upload and process a document first."

        try:
            start_time = time.time()

            # Prepare text for summarization (limit to manage tokens)
            text_to_summarize = self.extracted_text[:2048]

            # Create prompt for T5
            prompt = f"Summarize the following document:\n\n{text_to_summarize}"

            # Tokenize input
            inputs = t5_tokenizer(
                prompt,
                return_tensors="pt",
                max_length=1024,
                truncation=True
            ).to(device)

            # Generate summary
            with torch.no_grad():
                summary_ids = t5_model.generate(
                    inputs.input_ids,
                    max_length=150,
                    min_length=30,
                    num_beams=4,
                    length_penalty=2.0,
                    early_stopping=True
                )

            # Decode summary
            summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

            generation_time = time.time() - start_time

            return f"{summary}\n\n⏱️ Generated in {generation_time:.1f}s"

        except Exception as e:
            return f"Error generating summary: {str(e)}"

    def answer_question(self, question: str) -> str:
        """
        Answer questions about the document using T5 model

        Args:
            question: User's question about the document

        Returns:
            Answer to the question
        """
        if not self.extracted_text:
            return "Please upload and process a document first."

        if not question.strip():
            return "Please enter a question."

        try:
            start_time = time.time()

            # Prepare context and question
            context = self.extracted_text[:1536]  # Limit context

            # Format prompt for T5
            prompt = f"Answer the question based on the context.\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer:"

            # Tokenize
            inputs = t5_tokenizer(
                prompt,
                return_tensors="pt",
                max_length=1024,
                truncation=True
            ).to(device)

            # Generate answer
            with torch.no_grad():
                answer_ids = t5_model.generate(
                    inputs.input_ids,
                    max_length=100,
                    min_length=5,
                    num_beams=3,
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9
                )

            # Decode answer
            answer = t5_tokenizer.decode(answer_ids[0], skip_special_tokens=True)

            generation_time = time.time() - start_time

            return f"{answer}\n\n⏱️ Generated in {generation_time:.1f}s"

        except Exception as e:
            return f"Error answering question: {str(e)}"

    def extract_key_information(self) -> Dict[str, List[str]]:
        """
        Extract key entities from the document using regex patterns

        Returns:
            Dictionary of extracted entities organized by type
        """
        if not self.extracted_text:
            return {"message": ["No document has been processed yet."]}

        try:
            entities = {
                'dates': [],
                'emails': [],
                'phone_numbers': [],
                'monetary_amounts': [],
                'percentages': [],
                'urls': []
            }

            # Date extraction patterns
            date_patterns = [
                r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
                r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
                r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}\b',
                r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{4}\b'
            ]

            for pattern in date_patterns:
                matches = re.findall(pattern, self.extracted_text, re.IGNORECASE)
                entities['dates'].extend(matches)

            # Email extraction
            email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
            entities['emails'] = re.findall(email_pattern, self.extracted_text)

            # Phone number extraction (various formats)
            phone_patterns = [
                r'\b\+?1?\s*\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b',
                r'\b\d{3}[-.\s]\d{3}[-.\s]\d{4}\b'
            ]

            for pattern in phone_patterns:
                matches = re.findall(pattern, self.extracted_text)
                # Patterns with capture groups return tuples of groups
                if matches and isinstance(matches[0], tuple):
                    entities['phone_numbers'].extend(['-'.join(match) for match in matches])
                else:
                    entities['phone_numbers'].extend(matches)

            # Monetary amount extraction
            money_patterns = [
                r'\$\s*[\d,]+\.?\d*',
                r'USD\s*[\d,]+\.?\d*',
                r'\b\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s*(?:dollars?|USD)\b'
            ]

            for pattern in money_patterns:
                matches = re.findall(pattern, self.extracted_text, re.IGNORECASE)
                entities['monetary_amounts'].extend(matches)

            # Percentage extraction
            percent_pattern = r'\b\d+\.?\d*\s*%'
            entities['percentages'] = re.findall(percent_pattern, self.extracted_text)

            # URL extraction
            url_pattern = r'https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&/=]*)'
            entities['urls'] = re.findall(url_pattern, self.extracted_text)

            # Clean up and deduplicate
            for key in entities:
                # Remove duplicates and limit to 10 items
                unique_items = list(dict.fromkeys(entities[key]))  # Preserves order
                entities[key] = unique_items[:10]

            # Remove empty categories
            entities = {k: v for k, v in entities.items() if v}

            if not entities:
                entities = {"info": ["No specific entities found. The document may need better quality or contain different types of information."]}

            return entities

        except Exception as e:
            return {"error": [f"Error extracting information: {str(e)}"]}

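# Example of the end-to-end flow (hypothetical file path, for illustration
# only; the Gradio handlers below wire these same calls to the UI):
#   doc = DocumentProcessor()
#   print(doc.process_document("sample_invoice.pdf"))            # OCR + layout
#   print(doc.summarize_document())                              # T5 summary
#   print(doc.answer_question("What is the total amount due?"))  # T5 Q&A
#   print(doc.extract_key_information())                         # regex entities
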
# Initialize global processor
processor = DocumentProcessor()

# Gradio interface handlers
def process_document_handler(file):
    """Handle document upload and processing"""
    if file is None:
        return "Please upload a document.", "", {}

    # Process the document
    status = processor.process_document(file)

    # Get text preview
    text_preview = processor.extracted_text[:1000] + "..." if len(processor.extracted_text) > 1000 else processor.extracted_text

    # Extract key information
    key_info = processor.extract_key_information()

    return status, text_preview, key_info

def summarize_handler():
    """Handle document summarization request"""
    return processor.summarize_document()

def qa_handler(question):
    """Handle question answering request"""
    if not question:
        return "Please enter a question."
    return processor.answer_question(question)

def create_interface():
    """
    Create the Gradio interface for the document intelligence system
    """

    with gr.Blocks(title="Multi-Modal Document Intelligence System", theme=gr.themes.Soft()) as interface:
        # Header
        gr.Markdown("""
        # 🧠 Multi-Modal Document Intelligence System

        **Upload any document (PDF or image) and unlock its insights with AI!**

        This advanced system combines:
        - 📄 **LayoutLMv3** for understanding document structure and layout
        - 🤖 **Flan-T5** for intelligent summarization and question answering
        - 🔍 **OCR Technology** for accurate text extraction from any document

        ### ✨ Features
        - Upload PDFs or images (JPG, PNG, etc.)
        - Automatic text extraction with layout understanding
        - Intelligent document summarization
        - Natural language Q&A about your documents
        - Key information extraction (dates, emails, amounts, etc.)
        """)

        # Main interface layout
        with gr.Row():
            # Left column - Upload and processing
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="📁 Upload Document",
                    file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"],
                    type="filepath"
                )

                process_btn = gr.Button("🔄 Process Document", variant="primary", size="lg")

                status_output = gr.Textbox(
                    label="📊 Processing Status",
                    lines=4,
                    interactive=False
                )

                gr.Markdown("### 🔑 Key Information Extracted")
                key_info_output = gr.JSON(label="Extracted Entities", elem_id="key_info")

            # Right column - Results and interaction
            with gr.Column(scale=2):
                text_preview = gr.Textbox(
                    label="📄 Document Text Preview",
                    lines=10,
                    max_lines=15,
                    interactive=False
                )

                with gr.Tab("📝 Summary"):
                    summary_btn = gr.Button("Generate Summary", variant="secondary")
                    summary_output = gr.Textbox(
                        label="Document Summary",
                        lines=8,
                        interactive=False
                    )

                with gr.Tab("❓ Q&A"):
                    question_input = gr.Textbox(
                        label="Ask a question about the document",
                        placeholder="e.g., What are the main points? What dates are mentioned? What is the total amount?",
                        lines=2
                    )
                    qa_btn = gr.Button("Get Answer", variant="secondary")
                    answer_output = gr.Textbox(
                        label="Answer",
                        lines=6,
                        interactive=False
                    )

                    # Example questions
                    gr.Markdown("### 📚 Example Questions")
                    gr.Examples(
                        examples=[
                            "What is the main topic of this document?",
                            "What dates are mentioned?",
                            "What is the total amount due?",
                            "Who are the key people mentioned?",
                            "What are the main findings?",
                            "Summarize the key points."
                        ],
                        inputs=question_input
                    )

        # Footer with instructions
        gr.Markdown("""
        ---
        ### 🎯 How to Use
        1. **Upload** a PDF or image document
        2. **Process** the document to extract text
        3. **Review** the extracted text and key information
        4. **Generate** a summary or ask questions

        ### 💡 Tips for Best Results
        - Use clear, high-quality documents
        - For images, ensure good lighting and contrast
        - The system can work with multiple languages when the corresponding Tesseract language packs are installed
        - Processing time depends on document size and complexity

        ---
        👨‍💻 **Created by Spencer Purdy** | Computer Science @ Auburn University
        [GitHub](https://github.com/spencercpurdy) | [LinkedIn](https://linkedin.com/in/spencerpurdy) | [Hugging Face](https://huggingface.co/spencercpurdy)
        """)

        # Connect event handlers
        process_btn.click(
            fn=process_document_handler,
            inputs=file_input,
            outputs=[status_output, text_preview, key_info_output]
        )

        summary_btn.click(
            fn=summarize_handler,
            inputs=[],
            outputs=summary_output
        )

        qa_btn.click(
            fn=qa_handler,
            inputs=question_input,
            outputs=answer_output
        )

        # Allow Enter key to submit questions
        question_input.submit(
            fn=qa_handler,
            inputs=question_input,
            outputs=answer_output
        )

    return interface

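# Launch note: share=True asks Gradio for a temporary public gradio.live URL,
# which suits Colab notebooks; when run locally or hosted on Hugging Face
# Spaces, share=False with the same server_name/server_port would typically
# be used instead.
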
# Main execution
if __name__ == "__main__":
    print("Starting Multi-Modal Document Intelligence System...")

    # Create and launch the interface
    interface = create_interface()

    # Launch with public link
    interface.launch(
        debug=True,
        share=True,
        server_name="0.0.0.0",
        server_port=7860
    )