File size: 4,264 Bytes
8ba2238 17c6e9f 8ba2238 17c6e9f 8ba2238 843ddbc 8ba2238 843ddbc 8ba2238 843ddbc 8ba2238 843ddbc 1d8c673 843ddbc 8ba2238 17c6e9f 8ba2238 843ddbc 1d8c673 8ba2238 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
"""
File: module_ocr2.py
Description: module to interact with OCR deep learning models.
Author: Didier Guillevic
Date: 2025-04-07
"""
import gradio as gr
import os
import magic
import pdf2image
import tempfile
import ocr2 # OCR with software 2.0 models
#
# Get file type: PDF or Image or something else
#
def get_file_type(file_path):
    """Classify a file as 'PDF', 'Image', 'PowerPoint' or 'Other'.

    Two signals are consulted: the (lower-cased) filename extension and the
    MIME type reported by the ``magic`` module. A match on either signal is
    enough, and the categories are tested in the order PDF, image,
    PowerPoint.

    Args:
        file_path: Path of the file to inspect. It is passed to
            ``magic.Magic.from_file`` for MIME detection.

    Returns:
        One of the strings 'PDF', 'Image', 'PowerPoint' or 'Other'.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif')
    pdf_mime = 'application/pdf'
    pptx_mime = 'application/vnd.openxmlformats-officedocument.presentationml.presentation'

    # Signal 1: filename extension, case-insensitive.
    suffix = os.path.splitext(file_path)[1].lower()
    # Signal 2: MIME type as detected by the ``magic`` module.
    detected_mime = magic.Magic(mime=True).from_file(file_path)

    if suffix == '.pdf' or detected_mime == pdf_mime:
        return 'PDF'
    if suffix in image_suffixes or detected_mime.startswith('image/'):
        return 'Image'
    if suffix == '.pptx' or detected_mime == pptx_mime:
        return 'PowerPoint'
    return 'Other'
#
# Process one file
#
def process(input_file: str):
    """Run OCR on a single file.

    The file is classified with ``get_file_type`` and handed to the matching
    ``ocr2`` routine.

    Args:
        input_file: Path of the file to process. ``None`` or an empty string
            is tolerated and reported as "no file provided".

    Returns:
        The result of ``ocr2.process_pdf`` for PDFs or ``ocr2.process_image``
        for images; otherwise a user-facing message explaining why nothing
        was processed.
    """
    # The UI's Clear handler resets the file input to None, so the OCR
    # handler can be invoked without a file; answer with a message instead
    # of letting os.path.splitext(None) raise a TypeError.
    if not input_file:
        return "No file provided. Please upload a PDF, or an image file."

    file_type = get_file_type(input_file)
    if file_type == 'PDF':
        return ocr2.process_pdf(input_file)
    if file_type == 'Image':
        return ocr2.process_image(input_file)
    # Anything else (including 'PowerPoint' and 'Other') is not handled.
    return "Unsupported file type. Please upload a PDF, or an image file."
#
# Preview the document (image or PDF)
#
def preview_file(file):
    """Return a preview image path and a status message for an upload.

    Args:
        file: The uploaded file, given either as a filesystem path
            (``str`` / ``os.PathLike``) or as an object exposing the path
            through a ``name`` attribute. ``None`` means nothing is uploaded.

    Returns:
        A ``(image_path, status)`` tuple:

        * ``(None, None)`` when ``file`` is ``None``;
        * ``(path, None)`` for jpg / jpeg / png / gif / bmp files, which are
          returned as-is;
        * ``(temp_png_path, "PDF Preview: <basename>")`` for PDFs, where the
          PNG is a 150 dpi rendering of the first page. The temporary file
          is created with ``delete=False`` and is not removed here;
        * ``(None, "<p>...</p>")`` when no preview can be produced.
    """
    if file is None:
        return None, None

    # Accept a plain path as well as a file-like wrapper carrying ``.name``,
    # so the function works with either form of upload value.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    # splitext only looks at the final path component, so dots in directory
    # names are not mistaken for an extension.
    file_extension = os.path.splitext(file_path)[1].lower().lstrip('.')

    if file_extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp']:
        # For images, return the image directly
        return file_path, None

    if file_extension == 'pdf':
        # For PDFs, convert the first page to an image using pdf2image
        try:
            # Convert only the first page for preview
            pages = pdf2image.convert_from_path(
                file_path,
                first_page=1,
                last_page=1,
                dpi=150,  # Good quality for preview
            )
            if not pages:
                return None, "<p>Could not convert PDF to image</p>"

            # Save the first page as a temporary image that outlives this call
            with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_file:
                pages[0].save(tmp_file.name, 'PNG')
                return tmp_file.name, f"PDF Preview: {os.path.basename(file_path)}"
        except Exception as e:
            # UI boundary: report the failure as a status message rather
            # than propagating it.
            return None, f"<p>Error previewing PDF: {str(e)}</p>"

    return None, f"<p>Preview not available for {file_extension} files</p>"
#
# User interface
#
with gr.Blocks() as demo:
    # File upload and OCR output.
    # NOTE(review): the nesting below is reconstructed; confirm that
    # output_text is meant to sit in the Row beside the upload Column
    # (rather than inside the Column).
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                label="Upload a PDF or an image file",
                file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp"],
                scale=1,
            )
            # Preview widgets, currently disabled.
            #preview_image = gr.Image(label="Preview", show_label=True)
            #preview_text = gr.HTML(label="Status")
        output_text = gr.Textbox(label="OCR output", scale=2)

    # Action buttons
    with gr.Row():
        ocr_btn = gr.Button("OCR", variant="primary")
        clear_btn = gr.Button(value="Clear", variant="secondary")

    # Sample documents, collapsed by default; results are not cached.
    with gr.Accordion("Examples", open=False):
        examples = gr.Examples(
            examples=[
                ["./scanned_doc.pdf"],
                ["./passport_jp.png"],
            ],
            inputs=[input_file],
            outputs=[output_text],
            fn=process,
            cache_examples=False,
            label="Examples",
        )

    # Preview refresh on upload, currently disabled.
    #input_file.change(
    #    fn=preview_file,
    #    inputs=[input_file],
    #    outputs=[preview_image, preview_text]
    #)

    # Event wiring: run OCR on the uploaded file.
    ocr_btn.click(fn=process, inputs=[input_file], outputs=[output_text])

    # Event wiring: reset the file input to None and the output to ''.
    clear_btn.click(
        fn=lambda: (None, ""),
        inputs=[],
        outputs=[input_file, output_text],
    )

# Launch the app only when this module is executed as a script.
if __name__ == "__main__":
    demo.launch()
|