File size: 3,237 Bytes
8cc1c22
 
 
 
 
 
 
 
 
7f4dfdf
 
b599655
8cc1c22
 
 
 
 
 
 
 
 
7f4dfdf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8cc1c22
 
 
 
 
 
 
 
7f4dfdf
 
 
 
 
 
 
8cc1c22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f4dfdf
 
 
 
 
 
 
8cc1c22
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
File: module_ocr.py
Description: Use a vision language model for Optical Character Recognition (OCR) tasks.
Author: Didier Guillevic
Date: 2025-04-06
"""

import gradio as gr
import ocr
import pdf2image
import tempfile
import os

#
# Process one file
#
def process(input_file: str):
    """Run OCR on the given file and return the result.

    Args:
        input_file: Path of the file to process. May be ``None`` (or empty)
            when the handler is triggered while no file is selected.

    Returns:
        Whatever ``ocr.process_file`` returns for the file, or an empty
        string when there is no file to process.
    """
    # Nothing uploaded (e.g. the OCR button was pressed after "Clear"):
    # return the same empty value the clear handler uses for the output box
    # instead of forwarding a missing path to the OCR backend.
    if not input_file:
        return ""

    return ocr.process_file(input_file)

#
# Preview the document (image or PDF)
#
def preview_file(file):
    """Build a preview for an uploaded image or PDF.

    Args:
        file: The uploaded file. Either a path (``str`` / ``os.PathLike``)
            or an object exposing the path through a ``name`` attribute.
            ``None`` means that no file is selected.

    Returns:
        A ``(image_path, status_html)`` tuple:

        * ``(None, None)`` when ``file`` is ``None``;
        * ``(path, None)`` for image files (the file itself is the preview);
        * ``(png_path, caption)`` for PDFs, where ``png_path`` is a temporary
          PNG rendering of the first page;
        * ``(None, message)`` when no preview can be produced.
    """
    import html  # local import: only used to escape text embedded in HTML

    if file is None:
        return None, None

    # Accept plain paths as well as wrapper objects carrying the path in
    # ``.name`` (a pathlib.Path also has ``.name``, but that is only the
    # base name, hence the explicit type check).
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    # splitext only inspects the last path component, so a dot in a
    # directory name or a file without extension is not misread as one.
    file_extension = os.path.splitext(file_path)[1].lower().lstrip('.')

    if file_extension in {'jpg', 'jpeg', 'png', 'gif', 'bmp'}:
        # For images, return the image directly
        return file_path, None

    if file_extension == 'pdf':
        # For PDFs, convert first page to image using pdf2image
        try:
            # Convert only the first page for preview
            pages = pdf2image.convert_from_path(
                file_path,
                first_page=1,
                last_page=1,
                dpi=150,  # Good quality for preview
            )

            if not pages:
                return None, "<p>Could not convert PDF to image</p>"

            # Reserve a temporary file name and close the handle before
            # writing the image to it by path.
            # NOTE: created with delete=False, so this function never
            # removes the file; it must outlive the call to be displayed.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_file:
                preview_path = tmp_file.name
            pages[0].save(preview_path, 'PNG')

            # The file name comes from the upload: escape it before it is
            # rendered as HTML.
            caption = html.escape(os.path.basename(file_path))
            return preview_path, f"PDF Preview: {caption}"

        except Exception as e:
            # UI boundary: report any conversion failure as a status message
            # rather than letting the event handler crash.
            return None, f"<p>Error previewing PDF: {html.escape(str(e))}</p>"

    if not file_extension:
        return None, "<p>Preview not available for files without an extension</p>"

    return None, f"<p>Preview not available for {html.escape(file_extension)} files</p>"


#
# User interface
#
with gr.Blocks() as demo:

    # Left: upload widget with its preview; right: OCR result
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp"],
                label="Upload a PDF or image file",
                scale=1,
            )
            preview_image = gr.Image(show_label=True, label="Preview")
            preview_text = gr.HTML(label="Status")
        output_text = gr.Textbox(scale=2, label="OCR output")

    # Action buttons
    with gr.Row():
        ocr_btn = gr.Button(variant="primary", value="OCR")
        clear_btn = gr.Button(variant="secondary", value="Clear")

    # Sample documents, collapsed by default
    with gr.Accordion(label="Examples", open=False):
        examples = gr.Examples(
            examples=[
                ["./scanned_doc.pdf"],
                ["./passport_jp.png"],
            ],
            fn=process,
            inputs=[input_file],
            outputs=[output_text],
            cache_examples=False,
            label="Examples",
        )

    # Refresh the preview image and status whenever the file input changes
    input_file.change(
        fn=preview_file,
        outputs=[preview_image, preview_text],
        inputs=[input_file],
    )

    # "OCR" runs the recognition on the selected file
    ocr_btn.click(
        fn=process,
        outputs=[output_text],
        inputs=[input_file],
    )

    # "Clear" resets the file input and the OCR output
    clear_btn.click(
        fn=lambda: (None, ""),
        outputs=[input_file, output_text],
        inputs=[],
    )

if __name__ == "__main__":
    demo.launch()