# Didier's picture
# Update module_ocr.py
# ba754a7 verified
"""
File: module_ocr.py
Description: Gradio module to interact with the tesseract OCR code.
Author: Didier Guillevic
Date: 2024-11-23
"""
import os
import pathlib
import shutil
import tempfile
import threading
import time
import uuid

import gradio as gr
import pdf2image

import lang_codes
import ocr
# Directory to save the (temporary) OCR'ed PDF files (whose path is returned to user)
output_dir = "tmp_results"
os.makedirs(output_dir, exist_ok=True)
# Age limit (in seconds) beyond which a generated PDF file is removed by the
# cleanup loop: 3600 seconds = 1 hour.
AGE_LIMIT = 3600
# Function to clean up old PDF files
def cleanup_old_files():
    """Periodically delete expired PDF files from ``output_dir``.

    Runs forever: on each pass, every ``.pdf`` file in ``output_dir`` whose
    modification time is more than ``AGE_LIMIT`` seconds in the past is
    removed, then the loop sleeps for one hour.

    Filesystem errors are reported and skipped rather than propagated, so a
    file that disappears between listing and removal (or a temporarily
    unavailable directory) does not terminate the thread running this loop.
    """
    while True:
        current_time = time.time()
        try:
            filenames = os.listdir(output_dir)
        except OSError as exc:
            # Keep the loop alive; try again on the next pass.
            print(f"Could not list directory {output_dir}: {exc}")
            filenames = []

        for filename in filenames:
            if not filename.endswith(".pdf"):
                continue
            file_path = os.path.join(output_dir, filename)
            try:
                # Check if the file is older than the age limit
                file_age = current_time - os.path.getmtime(file_path)
                if file_age > AGE_LIMIT:
                    print(f"Removing old file: {file_path}")
                    os.remove(file_path)
            except OSError as exc:
                # The file may have been removed concurrently or be locked.
                print(f"Could not remove file {file_path}: {exc}")

        # Sleep for an hour before checking again
        time.sleep(3600)
# Start the cleanup loop at import time in a background thread. The thread is
# created with daemon=True, so it does not keep the process alive at exit.
cleanup_thread = threading.Thread(target=cleanup_old_files, daemon=True)
cleanup_thread.start()
#
# Process one file
#
def process(
    input_file: str,
    src_langs: list[str],  # list of ISO 639-3 language codes
    output_type: str
):
    """Process given file with OCR using given languages.

    Args:
        input_file: Path of the image or PDF file to process. An upload
            object exposing its path through a ``name`` attribute is accepted
            as well. ``None`` (no file selected) yields the default result.
        src_langs: Language codes of the document; they are joined with '+'
            as expected by the tesseract package, e.g. 'eng+fra'.
        output_type: One of 'text', 'pdf' or 'text+pdf'.

    Returns:
        A tuple ``(output_text, output_pdf)``. ``output_text`` is '' when no
        text output was requested; ``output_pdf`` is the value returned by
        the searchable-PDF helper, or ``None`` when no PDF was requested.
    """
    # default result
    output_text = ''
    output_pdf = None

    # Nothing to do when no file was provided (e.g. OCR clicked before upload)
    if input_file is None:
        return output_text, output_pdf

    # Normalize to a plain path string: every call below expects a path, so
    # resolve upload objects (which expose `.name`) once, up front.
    if isinstance(input_file, (str, os.PathLike)):
        input_path = os.fspath(input_file)
    else:
        input_path = input_file.name

    # format language as expected by tesseract package, e.g. 'eng+fra'
    language = '+'.join(src_langs)

    # PDF file or image file?
    input_file_suffix = pathlib.Path(input_path).suffix.lower()
    is_pdf = input_file_suffix == '.pdf'

    # output text?
    if output_type in ['text', 'text+pdf']:
        if is_pdf:
            texts = ocr.pdf_scanner.pdf_to_text(  # one text per page
                pdf_path=input_path,
                language=language
            )
            output_text = '\n\n'.join(texts)
        else:
            output_text = ocr.pdf_scanner.image_to_text(
                image_path=input_path,
                language=language,
                psm=3
            )

    # output pdf?
    if output_type in ['pdf', 'text+pdf']:
        # Create a path for output PDF file; the UUID keeps names unique
        # across requests that upload files with the same base name.
        base_filename = os.path.basename(input_path)
        base_filename, _ = os.path.splitext(base_filename)
        output_path = f"{base_filename}_OCR_{uuid.uuid4()}.pdf"
        output_path = os.path.join(output_dir, output_path)

        if is_pdf:
            output_pdf = ocr.pdf_scanner.pdf_to_searchable_pdf(
                pdf_path=input_path,
                output_path=output_path,
                language=language,
                attempt_repair=True
            )
        else:
            output_pdf = ocr.pdf_scanner.image_to_searchable_pdf(
                image_path=input_path,
                output_path=output_path,
                language=language,
                psm=3
            )

    return output_text, output_pdf
#
# Preview the document (image or PDF)
#
def preview_file(file):
    """Build a preview for an uploaded image or PDF file.

    Args:
        file: Path of the uploaded file, or an upload object exposing its
            path through a ``name`` attribute. ``None`` means no file.

    Returns:
        A tuple ``(image_path, status)``. ``image_path`` is the path of an
        image to display (the file itself for images, a PNG rendering of the
        first page for PDFs) or ``None`` when no preview is available.
        ``status`` is a message describing the preview or the failure, or
        ``None``.
    """
    if file is None:
        return None, None

    # Accept a plain path as well as an upload object exposing `.name`
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    # splitext only inspects the last path component, so dots in directory
    # names and extensionless files are not mistaken for an extension.
    file_extension = os.path.splitext(file_path)[1].lower().lstrip('.')

    if file_extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp']:
        # For images, return the image directly
        return file_path, None

    if file_extension == 'pdf':
        # For PDFs, convert first page to image using pdf2image
        try:
            # Convert only the first page for preview
            pages = pdf2image.convert_from_path(
                file_path,
                first_page=1,
                last_page=1,
                dpi=150  # Good quality for preview
            )
            if not pages:
                return None, "<p>Could not convert PDF to image</p>"

            # Reserve a temporary file name, then write the image once the
            # handle is closed (an open handle can block writing by name).
            with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_file:
                preview_path = tmp_file.name
            pages[0].save(preview_path, 'PNG')
            return preview_path, f"PDF Preview: {os.path.basename(file_path)}"
        except Exception as e:
            # UI boundary: report any conversion failure to the user
            return None, f"<p>Error previewing PDF: {str(e)}</p>"

    return None, f"<p>Preview not available for {file_extension} files</p>"
#
# User interface
#
# Build the Gradio interface. Components are laid out in the order they are
# created inside the nested context managers, so statement order matters.
with gr.Blocks() as demo:

    def update_visibility(file):
        """Return a visibility update: visible if `file` is truthy, hidden otherwise."""
        return gr.update(visible=True) if file else gr.update(visible=False)

    # Upload file to process
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                label="Upload an image or a PDF file of a scanned document",
                height=160
            )
            #preview_image = gr.Image(label="Preview", show_label=True)
            #preview_text = gr.HTML(label="Status")
            output_file = gr.File(
                label="Download OCR'ed PDF",
                visible=False # Initially not visible
            )
        with gr.Column():
            output_text = gr.Textbox(label="OCR output")

    # Input: language(s) used in document, output types
    with gr.Row():
        src_langs = gr.Dropdown(
            label='Language(s) of document',
            choices=lang_codes.tesseract_lang_codes.items(),
            multiselect=True,
            value=['eng', 'fra'],
            scale=4
        )
        output_type = gr.Dropdown(
            label='Output type',
            choices=['text', 'pdf', 'text+pdf'],
            multiselect=False,
            value='text+pdf',
            scale=1
        )

    # Buttons
    with gr.Row():
        ocr_btn = gr.Button(value="OCR", variant="primary")
        clear_btn = gr.Button("Clear", variant="secondary")

    # Examples
    # NOTE(review): each example row provides two values (file, languages)
    # while three input components are listed; confirm that leaving
    # output_type unspecified in the examples is intended.
    with gr.Accordion("Examples", open=False):
        examples = gr.Examples(
            [
                ['./Non-text-searchable.pdf', ['eng','fra']],
                ['./sample_ID.jpeg', ['eng','fra']],
            ],
            inputs=[input_file, src_langs, output_type],
            outputs=[output_text, output_file],
            fn=process,
            cache_examples=False,
            label="Examples"
        )

    # Documentation
    with gr.Accordion("Documentation", open=False):
        gr.Markdown(f"""
- Model: using the tesseract package for OCR 1.0 (traditional)
""")

    # Update preview when file is uploaded
    #input_file.change(
    # fn=preview_file,
    # inputs=[input_file],
    # outputs=[preview_image, preview_text]
    #)

    # Functions
    # Run the OCR, then show the download component only if a PDF was produced
    ocr_btn.click(
        fn=process,
        inputs=[input_file, src_langs, output_type],
        outputs=[output_text, output_file]
    ).then(
        update_visibility,
        inputs=output_file,
        outputs=output_file
    )
    # Reset the upload, the text output and the PDF output, then re-evaluate
    # the download component's visibility from its (now cleared) value
    clear_btn.click(
        fn=lambda : (None, '', None),
        inputs=[],
        outputs=[input_file, output_text, output_file] # input_file, output_text, output_file
    ).then(
        update_visibility,
        inputs=output_file,
        outputs=output_file
    )
# Launch the Gradio app only when this file is executed as a script
if __name__ == '__main__':
    demo.launch()