Spaces:

Didier
/

Optical_character_recognition

Running

File size: 7,283 Bytes

"""
File: module_ocr.py

Description: Gradio module to interact with the tesseract OCR code.

Author: Didier Guillevic
Date: 2024-11-23
"""

import os
import pathlib
import shutil
import tempfile
import threading
import time
import uuid

import gradio as gr
import pdf2image

import lang_codes
import ocr


# Directory to save the (temporary) OCR'ed PDF files (whose path is returned to user).
# Created at import time if it does not exist yet.
output_dir = "tmp_results"
os.makedirs(output_dir, exist_ok=True)

# Age limit (in seconds) beyond which generated PDF files are deleted by the
# cleanup loop: 3600 seconds = 1 hour.
AGE_LIMIT = 3600

# Function to clean up old PDF files
def cleanup_old_files():
    """Periodically delete expired OCR result PDFs from ``output_dir``.

    Runs forever, so it is meant to be executed in a background (daemon)
    thread. On each pass, every ``.pdf`` file in ``output_dir`` whose
    modification time is more than ``AGE_LIMIT`` seconds in the past is
    removed; the loop then sleeps for one hour before the next pass.

    Filesystem errors (directory not listable, file already removed, file
    not deletable) are reported and skipped so that a single failure does
    not terminate the cleanup thread.
    """
    while True:
        current_time = time.time()

        try:
            filenames = os.listdir(output_dir)
        except OSError as err:
            print(f"Could not list directory {output_dir}: {err}")
            filenames = []

        for filename in filenames:
            if not filename.endswith(".pdf"):
                continue

            file_path = os.path.join(output_dir, filename)
            try:
                # Check if the file is older than the age limit
                file_age = current_time - os.path.getmtime(file_path)
                if file_age > AGE_LIMIT:
                    print(f"Removing old file: {file_path}")
                    os.remove(file_path)
            except OSError as err:
                # The file may have disappeared or be locked; an uncaught
                # exception here would silently end the cleanup thread.
                print(f"Could not remove file {file_path}: {err}")

        # Sleep for an hour before checking again
        time.sleep(3600)

# Start the cleanup loop in a background thread as soon as the module is
# imported. daemon=True so that the never-ending loop does not prevent the
# interpreter from exiting.
cleanup_thread = threading.Thread(target=cleanup_old_files, daemon=True)
cleanup_thread.start()

#
# Process one file
#
def process(
        input_file: str,
        src_langs: list[str], # list of ISO 639-3 language codes
        output_type: str
    ):
    """Process given file with OCR using given languages.

    Args:
        input_file: Path of the image or PDF file to process. A PDF is
            recognized by its ``.pdf`` suffix (case-insensitive); anything
            else is handed to the image OCR functions. ``None`` or an empty
            string (no file uploaded) yields the default result.
        src_langs: Language codes, joined with ``+`` to build the language
            string passed to the OCR functions (e.g. ``'eng+fra'``).
        output_type: One of ``'text'``, ``'pdf'`` or ``'text+pdf'``. Any
            other value produces neither output.

    Returns:
        A ``(output_text, output_pdf)`` tuple. ``output_text`` is the
        recognized text (pages of a PDF separated by a blank line), or ``''``
        when no text output was requested. ``output_pdf`` is the value
        returned by the searchable-PDF helper of ``ocr.pdf_scanner``
        (presumably the path of the generated PDF - confirm in ``ocr``), or
        ``None`` when no PDF output was requested.
    """
    # default result
    output_text = ''
    output_pdf = None

    # Nothing to do when no file has been uploaded
    if not input_file:
        return output_text, output_pdf

    # format language as expected by tesseract package, e.g. 'eng+fra'
    language = '+'.join(src_langs)

    # PDF file or image file?
    is_pdf = pathlib.Path(input_file).suffix.lower() == '.pdf'

    # output text?
    if output_type in ('text', 'text+pdf'):
        if is_pdf:
            # one text per page; input_file is already a path string
            texts = ocr.pdf_scanner.pdf_to_text(
                pdf_path=input_file,
                language=language
            )
            output_text = '\n\n'.join(texts)
        else:
            output_text = ocr.pdf_scanner.image_to_text(
                image_path=input_file,
                language=language,
                psm=3
            )

    # output pdf?
    if output_type in ('pdf', 'text+pdf'):
        # Create a unique path for the output PDF file inside output_dir
        base_filename, _ = os.path.splitext(os.path.basename(input_file))
        output_path = os.path.join(
            output_dir, f"{base_filename}_OCR_{uuid.uuid4()}.pdf"
        )

        if is_pdf:
            output_pdf = ocr.pdf_scanner.pdf_to_searchable_pdf(
                pdf_path=input_file,
                output_path=output_path,
                language=language,
                attempt_repair=True
            )
        else:
            output_pdf = ocr.pdf_scanner.image_to_searchable_pdf(
                image_path=input_file,
                output_path=output_path,
                language=language,
                psm=3
            )

    return output_text, output_pdf

#
# Preview the document (image or PDF)
#
def preview_file(file):
    """Build a preview for an uploaded image or PDF file.

    Args:
        file: The uploaded file, given either as a path (``str`` or
            ``os.PathLike``) or as an object exposing its path through a
            ``name`` attribute. ``None`` or ``''`` means no file.

    Returns:
        A ``(image_path, message)`` tuple:

        - no file: ``(None, None)``
        - image (jpg, jpeg, png, gif, bmp): ``(file path, None)``
        - PDF: ``(path of a temporary PNG of the first page,
          "PDF Preview: <file name>")``, or ``(None, <HTML error message>)``
          when the conversion fails
        - anything else: ``(None, <HTML message>)``
    """
    if file is None or file == '':
        return None, None

    # Accept both plain paths and file-like wrappers carrying the path in .name
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name
    file_extension = pathlib.Path(file_path).suffix.lower().lstrip('.')

    if file_extension in ('jpg', 'jpeg', 'png', 'gif', 'bmp'):
        # For images, return the image directly
        return file_path, None

    if file_extension != 'pdf':
        return None, f"<p>Preview not available for {file_extension} files</p>"

    # For PDFs, convert first page to image using pdf2image
    try:
        # Convert only the first page for preview
        pages = pdf2image.convert_from_path(
            file_path,
            first_page=1,
            last_page=1,
            dpi=150  # Good quality for preview
        )
        if not pages:
            return None, "<p>Could not convert PDF to image</p>"

        # Reserve a temporary file name, and close the handle before writing
        # to it by name (re-opening an open temporary file fails on some
        # platforms). delete=False: the file must outlive this call so that
        # it can be displayed.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_file:
            preview_path = tmp_file.name
        pages[0].save(preview_path, 'PNG')
        return preview_path, f"PDF Preview: {os.path.basename(file_path)}"

    except Exception as e:
        # UI boundary: report any conversion failure as a status message
        return None, f"<p>Error previewing PDF: {str(e)}</p>"


#
# User interface
#
# Components are laid out in the order in which they are created below.
with gr.Blocks() as demo:

    def update_visibility(file):
        """Return a Gradio update making a component visible only if `file` is truthy."""
        return gr.update(visible=True) if file else gr.update(visible=False)
    
    # Upload file to process
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                label="Upload an image or a PDF file of a scanned document",
                height=160
            )
            # Preview components, currently disabled (see the disabled
            # input_file.change hook further down)
            #preview_image = gr.Image(label="Preview", show_label=True)
            #preview_text = gr.HTML(label="Status")
            output_file = gr.File(
                label="Download OCR'ed PDF",
                visible=False # Initially not visible
            )
        with gr.Column():
            output_text = gr.Textbox(label="OCR output")

    # Input: language(s) used in document, output types
    with gr.Row():
        src_langs = gr.Dropdown(
            label='Language(s) of document',
            choices=lang_codes.tesseract_lang_codes.items(),
            multiselect=True,
            value=['eng', 'fra'],
            scale=4
        )
        # The choices are the output_type values tested by process()
        output_type = gr.Dropdown(
            label='Output type',
            choices=['text', 'pdf', 'text+pdf'],
            multiselect=False,
            value='text+pdf',
            scale=1
        )

    # Buttons
    with gr.Row():
        ocr_btn = gr.Button(value="OCR", variant="primary")
        clear_btn = gr.Button("Clear", variant="secondary")
    
    # Examples
    # NOTE(review): each example row provides two values (file, languages)
    # while three input components are listed - confirm that leaving out the
    # output_type value is handled as intended by gr.Examples.
    with gr.Accordion("Examples", open=False):
        examples = gr.Examples(
            [
                ['./Non-text-searchable.pdf', ['eng','fra']],
                ['./sample_ID.jpeg', ['eng','fra']],
            ],
            inputs=[input_file, src_langs, output_type],
            outputs=[output_text, output_file],
            fn=process,
            cache_examples=False,
            label="Examples"
        )

    # Documentation
    # NOTE(review): the f-string prefix below has no placeholders.
    with gr.Accordion("Documentation", open=False):
        gr.Markdown(f"""
            - Model: using the tesseract package for OCR 1.0 (traditional)
        """)

    # Update preview when file is uploaded (currently disabled)
    #input_file.change(
    #    fn=preview_file,
    #    inputs=[input_file],
    #    outputs=[preview_image, preview_text]
    #)
    
    # Functions
    # OCR button: run process(), then show the download component only if a
    # PDF was produced.
    ocr_btn.click(
        fn=process,
        inputs=[input_file, src_langs, output_type],
        outputs=[output_text, output_file]
    ).then(
        update_visibility,
        inputs=output_file,
        outputs=output_file
    )
    # Clear button: reset the uploaded file, the text and the PDF, then
    # re-evaluate the visibility of the (now empty) download component.
    clear_btn.click(
        fn=lambda : (None, '', None),
        inputs=[],
        outputs=[input_file, output_text, output_file] # input_file, output_text, output_file
    ).then(
        update_visibility,
        inputs=output_file,
        outputs=output_file
    )

# Launch the Gradio app only when this file is executed as a script
if __name__ == '__main__':
    demo.launch()