"""
File: module_ocr.py
Description: Use a vision language model for Optical Character Recognition (OCR) tasks.
Author: Didier Guillevic
Date: 2025-04-06
"""

import gradio as gr
import ocr
import pdf2image
import tempfile
import os
import html

# Extensions (lower case, without the dot) that can be previewed as-is.
_IMAGE_EXTENSIONS = frozenset({'jpg', 'jpeg', 'png', 'gif', 'bmp'})


#
# Process one file
#
def process(input_file: str):
    """Process given file with OCR.

    Args:
        input_file: Path of the file to run OCR on, as provided by the
            ``gr.File`` input. ``None`` when no file has been uploaded.

    Returns:
        The result of ``ocr.process_file(input_file)``, or an empty string
        when no file was provided.
    """
    # The OCR button can be clicked before anything is uploaded; do not
    # forward a missing file to the OCR backend.
    if input_file is None:
        return ''
    return ocr.process_file(input_file)


#
# Preview the document (image or PDF)
#
def _save_preview_png(page) -> str:
    """Save ``page`` (an object exposing ``save(path, format)``) to a new
    temporary PNG file and return the path of that file."""
    # mkstemp + close: the file is written by path below, and writing to a
    # path that is still held open by another handle fails on some platforms.
    fd, png_path = tempfile.mkstemp(suffix='.png')
    os.close(fd)
    try:
        page.save(png_path, 'PNG')
    except Exception:
        # Do not leave an empty/partial temporary file behind on failure.
        try:
            os.remove(png_path)
        except OSError:
            pass
        raise
    return png_path


def preview_file(file):
    """Build a preview for an uploaded image or PDF.

    Args:
        file: The uploaded file, either an object with a ``name`` attribute
            holding its path, or the path itself. ``None`` when the input
            has been cleared.

    Returns:
        A ``(image_path, status)`` tuple. ``image_path`` is the path of the
        image to display (``None`` when no preview is available) and
        ``status`` is a message for the status area (``None`` when there is
        nothing to report).
    """
    if file is None:
        return None, None

    # Accept both file-like objects (with .name) and plain path strings.
    file_path = getattr(file, 'name', file)

    # splitext only looks at the last path component, so a dot in a directory
    # name or a file without extension is not mistaken for an extension.
    file_extension = os.path.splitext(file_path)[1].lower().lstrip('.')

    if file_extension in _IMAGE_EXTENSIONS:
        # For images, return the image directly
        return file_path, None

    if file_extension != 'pdf':
        return None, f"Preview not available for {html.escape(file_extension)} files\n"

    # For PDFs, convert first page to image using pdf2image.
    # The status strings are rendered by a gr.HTML component, so values coming
    # from the file name or from an exception are HTML-escaped.
    try:
        # Convert only the first page for preview
        pages = pdf2image.convert_from_path(
            file_path,
            first_page=1,
            last_page=1,
            dpi=150  # Good quality for preview
        )
        if not pages:
            return None, "\nCould not convert PDF to image\n"

        # Save the first page as a temporary image
        preview_path = _save_preview_png(pages[0])
        return preview_path, f"PDF Preview: {html.escape(os.path.basename(file_path))}"
    except Exception as e:
        # UI boundary: report the problem in the status area instead of
        # failing the whole event handler.
        return None, f"Error previewing PDF: {html.escape(str(e))}\n"


#
# User interface
#
# NOTE(review): the Row/Column nesting below was reconstructed from a source
# whose indentation was lost -- confirm the intended layout.
with gr.Blocks() as demo:

    # Upload file to process
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                label="Upload a PDF or image file",
                file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp"],
                scale=1)
            preview_image = gr.Image(label="Preview", show_label=True)
            preview_text = gr.HTML(label="Status")
        output_text = gr.Textbox(label="OCR output", scale=2)

    # Buttons
    with gr.Row():
        ocr_btn = gr.Button(value="OCR", variant="primary")
        clear_btn = gr.Button("Clear", variant="secondary")

    # Examples
    with gr.Accordion("Examples", open=False):
        examples = gr.Examples(
            [
                ['./scanned_doc.pdf',],
                ['./passport_jp.png',]
            ],
            inputs=[input_file,],
            outputs=[output_text,],
            fn=process,
            cache_examples=False,
            label="Examples"
        )

    # Update preview when file is uploaded
    input_file.change(
        fn=preview_file,
        inputs=[input_file],
        outputs=[preview_image, preview_text]
    )

    # Functions
    ocr_btn.click(
        fn=process,
        inputs=[input_file,],
        outputs=[output_text,]
    )
    clear_btn.click(
        fn=lambda: (None, ''),
        inputs=[],
        outputs=[input_file, output_text]
    )


if __name__ == '__main__':
    demo.launch()