Didier's picture
Update module_ocr.py
b599655 verified
"""
File: module_ocr.py
Description: Use a vision language model for Optical Character Recognition (OCR) tasks.
Author: Didier Guillevic
Date: 2025-04-06
"""
import html
import os
import tempfile

import gradio as gr
import pdf2image

import ocr
#
# Process one file
#
def process(input_file: str):
    """Run OCR on the given file and return the recognized text.

    Args:
        input_file: Path of the file to process. The UI may pass ``None``
            (or an empty value) when no file is selected, e.g. after the
            "Clear" button has reset the file input.

    Returns:
        Whatever ``ocr.process_file`` returns for the file, or an empty
        string when there is no file to process.
    """
    # Nothing uploaded: mirror the guard in preview_file instead of handing
    # None to the OCR backend.
    if not input_file:
        return ""
    return ocr.process_file(input_file)
#
# Preview the document (image or PDF)
#
def preview_file(file):
    """Build a preview for an uploaded image or PDF.

    Args:
        file: The uploaded file: either a path (``str`` / ``os.PathLike``) or
            an object exposing the path as ``.name``. ``None`` when nothing
            is uploaded.

    Returns:
        A ``(image_path, status_html)`` tuple:

        * ``(None, None)`` when ``file`` is ``None``;
        * ``(path, None)`` for jpg/jpeg/png/gif/bmp files;
        * ``(png_path, caption)`` for a PDF whose first page was rendered to
          a temporary PNG (the temporary file is created with
          ``delete=False`` and is not removed by this function);
        * ``(None, message)`` when no preview could be produced.
    """
    if file is None:
        return None, None

    # Accept a plain path as well as an upload object carrying it in `.name`.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    # splitext only looks at the last path component, so dots in directory
    # names and extension-less files are handled correctly.
    file_extension = os.path.splitext(file_path)[1].lstrip('.').lower()

    if file_extension in ('jpg', 'jpeg', 'png', 'gif', 'bmp'):
        # Images can be displayed as they are.
        return file_path, None

    if file_extension != 'pdf':
        # The extension comes from the uploaded file name and the message is
        # rendered as HTML, so it must be escaped.
        shown = html.escape(file_extension) if file_extension else "extensionless"
        return None, f"<p>Preview not available for {shown} files</p>"

    # PDF: render only the first page for the preview.
    try:
        pages = pdf2image.convert_from_path(
            file_path,
            first_page=1,
            last_page=1,
            dpi=150,  # Good quality for preview
        )
        if not pages:
            return None, "<p>Could not convert PDF to image</p>"

        # Persist the page as a PNG that outlives this call so it can be
        # displayed afterwards.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_file:
            pages[0].save(tmp_file.name, 'PNG')
        caption = html.escape(os.path.basename(file_path))
        return tmp_file.name, f"PDF Preview: {caption}"
    except Exception as e:
        # Best effort: a failed preview must not break the UI; report it.
        return None, f"<p>Error previewing PDF: {html.escape(str(e))}</p>"
#
# User interface
#
with gr.Blocks() as demo:

    # File upload with its preview, next to the OCR result.
    # NOTE(review): the original indentation was lost; the Row/Column nesting
    # below is the most plausible reading -- confirm against the intended layout.
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                label="Upload a PDF or image file",
                file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp"],
                scale=1,
            )
            preview_image = gr.Image(label="Preview", show_label=True)
            preview_text = gr.HTML(label="Status")

        output_text = gr.Textbox(label="OCR output", scale=2)

    # Action buttons
    with gr.Row():
        ocr_btn = gr.Button("OCR", variant="primary")
        clear_btn = gr.Button(value="Clear", variant="secondary")

    # Example files, collapsed by default
    with gr.Accordion("Examples", open=False):
        examples = gr.Examples(
            examples=[["./scanned_doc.pdf"], ["./passport_jp.png"]],
            inputs=[input_file],
            outputs=[output_text],
            fn=process,
            cache_examples=False,
            label="Examples",
        )

    # Refresh the preview image and status whenever the file input changes
    input_file.change(
        fn=preview_file,
        inputs=[input_file],
        outputs=[preview_image, preview_text],
    )

    # "OCR" sends the selected file through process() into the output box
    ocr_btn.click(fn=process, inputs=[input_file], outputs=[output_text])

    # "Clear" resets the file input to None and the output text to ''
    clear_btn.click(
        fn=lambda: (None, ""),
        inputs=[],
        outputs=[input_file, output_text],
    )


if __name__ == "__main__":
    demo.launch()