Spaces:

Didier
/

Optical_character_recognition

Running

App Files Files Community

Optical_character_recognition / module_ocr.py

Didier

Update module_ocr.py

ba754a7 verified 17 days ago

raw

history blame contribute delete

7.28 kB

	"""
	File: module_ocr.py

	Description: Gradio module to interact with the tesseract OCR code.

	Author: Didier Guillevic
	Date: 2024-11-23
	"""

	import gradio as gr
	import os
	import uuid
	import shutil
	import threading
	import time
	import pathlib
	import pdf2image

	import ocr
	import lang_codes


	# Directory to save the (temporary) OCR'ed PDF files (whose path is returned to user)
	output_dir = "tmp_results"
	os.makedirs(output_dir, exist_ok=True)

	# Age limit (in seconds) beyond which PDF files in output_dir are eligible
	# for cleanup: 3600 seconds = 1 hour
	AGE_LIMIT = 3600

	# Function to clean up old PDF files
	def cleanup_old_files():
	    """Periodically delete expired OCR result PDFs from ``output_dir``.

	    Runs forever, so it is meant to be the target of a background thread.
	    Each pass removes every ``.pdf`` file in ``output_dir`` whose
	    modification time is more than ``AGE_LIMIT`` seconds in the past, then
	    sleeps for one hour.

	    Filesystem errors are reported and skipped instead of being raised: an
	    uncaught exception would end the thread and silently stop all further
	    cleanup.
	    """
	    while True:
	        current_time = time.time()

	        try:
	            filenames = os.listdir(output_dir)
	        except OSError as err:
	            print(f"Could not list directory {output_dir}: {err}")
	            filenames = []

	        for filename in filenames:
	            if not filename.endswith(".pdf"):
	                continue
	            file_path = os.path.join(output_dir, filename)
	            try:
	                # Check if the file is older than the age limit
	                file_age = current_time - os.path.getmtime(file_path)
	                if file_age > AGE_LIMIT:
	                    print(f"Removing old file: {file_path}")
	                    os.remove(file_path)
	            except OSError as err:
	                # The file may have vanished or may not be removable;
	                # leave it for the next pass rather than ending the thread.
	                print(f"Could not remove file {file_path}: {err}")

	        # Sleep for an hour before checking again
	        time.sleep(3600)

	# Start the cleanup thread when the module is loaded. It is a daemon thread,
	# so the endless cleanup loop does not keep the interpreter alive at exit.
	cleanup_thread = threading.Thread(target=cleanup_old_files, daemon=True)
	cleanup_thread.start()

	#
	# Process one file
	#
	def process(
	    input_file: str,
	    src_langs: list[str], # list of ISO 639-3 language codes
	    output_type: str
	):
	    """Process given file with OCR using given languages.

	    Args:
	        input_file: Path of the uploaded image or PDF file. An object that
	            exposes the path through a ``.name`` attribute is accepted too.
	            When empty or ``None`` the default (empty) result is returned.
	        src_langs: Language codes of the document; they are joined with
	            '+' as expected by the tesseract package, e.g. 'eng+fra'.
	        output_type: 'text', 'pdf' or 'text+pdf'. Any other value produces
	            neither output.

	    Returns:
	        A tuple ``(output_text, output_pdf)``. ``output_text`` is the
	        recognised text ('' when text output was not requested);
	        ``output_pdf`` is whatever the ``ocr.pdf_scanner`` searchable-PDF
	        helper returns (presumably the path of the generated PDF -- confirm
	        against the ``ocr`` module), or ``None`` when PDF output was not
	        requested.
	    """
	    # default result
	    output_text = ''
	    output_pdf = None

	    # Nothing uploaded: return the default result instead of failing on the path
	    if not input_file:
	        return output_text, output_pdf

	    # Work with a plain path string everywhere. Previously the PDF-to-text
	    # branch used `input_file.name`, which fails when the input is a str.
	    if isinstance(input_file, (str, os.PathLike)):
	        input_path = os.fspath(input_file)
	    else:
	        input_path = input_file.name

	    # format language as expected by tesseract package, e.g. 'eng+fra'
	    language = '+'.join(src_langs)

	    # PDF file or image file?
	    input_is_pdf = pathlib.Path(input_path).suffix.lower() == '.pdf'

	    # output text?
	    if output_type in ['text', 'text+pdf']:
	        if input_is_pdf:
	            # one text per page (per the original author's note)
	            texts = ocr.pdf_scanner.pdf_to_text(
	                pdf_path=input_path,
	                language=language
	            )
	            output_text = '\n\n'.join(texts)
	        else:
	            output_text = ocr.pdf_scanner.image_to_text(
	                image_path=input_path,
	                language=language,
	                psm=3
	            )

	    # output pdf?
	    if output_type in ['pdf', 'text+pdf']:
	        # Create a unique path for the output PDF file inside output_dir
	        base_filename, _ = os.path.splitext(os.path.basename(input_path))
	        output_path = os.path.join(
	            output_dir, f"{base_filename}_OCR_{uuid.uuid4()}.pdf"
	        )

	        if input_is_pdf:
	            output_pdf = ocr.pdf_scanner.pdf_to_searchable_pdf(
	                pdf_path=input_path,
	                output_path=output_path,
	                language=language,
	                attempt_repair=True
	            )
	        else:
	            output_pdf = ocr.pdf_scanner.image_to_searchable_pdf(
	                image_path=input_path,
	                output_path=output_path,
	                language=language,
	                psm=3
	            )

	    return output_text, output_pdf

	#
	# Preview the document (image or PDF)
	#
	def preview_file(file):
	    """Build a preview for an uploaded image or PDF file.

	    Args:
	        file: The uploaded file, given either as a path string or as an
	            object whose ``.name`` attribute holds the path; ``None`` when
	            nothing is uploaded.

	    Returns:
	        A tuple ``(image_path, message)``:
	        - ``(None, None)`` when ``file`` is ``None``;
	        - ``(file_path, None)`` for jpg/jpeg/png/gif/bmp files;
	        - for PDFs, the path of a temporary PNG rendering of the first
	          page plus a caption, or ``(None, <error html>)`` on failure;
	        - ``(None, <html message>)`` for any other extension.
	    """
	    # Imported here because the module's top-level imports do not include
	    # tempfile; without it the PDF branch always failed with a NameError.
	    import tempfile

	    if file is None:
	        return None, None

	    # Accept a plain path as well as an object exposing the path as `.name`
	    file_path = file if isinstance(file, str) else file.name
	    file_extension = file_path.lower().split('.')[-1]

	    if file_extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp']:
	        # For images, return the image directly
	        return file_path, None

	    if file_extension == 'pdf':
	        # For PDFs, convert first page to image using pdf2image
	        try:
	            # Convert only the first page for preview
	            pages = pdf2image.convert_from_path(
	                file_path,
	                first_page=1,
	                last_page=1,
	                dpi=150 # Good quality for preview
	            )

	            if not pages:
	                return None, "<p>Could not convert PDF to image</p>"

	            # Save the first page as a temporary image. delete=False keeps
	            # the file on disk after the handle is closed so the returned
	            # path stays usable by the caller.
	            with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_file:
	                pages[0].save(tmp_file.name, 'PNG')
	                return tmp_file.name, f"PDF Preview: {os.path.basename(file_path)}"

	        except Exception as e:
	            # UI boundary: report the failure as a message instead of raising
	            return None, f"<p>Error previewing PDF: {str(e)}</p>"

	    return None, f"<p>Preview not available for {file_extension} files</p>"


	#
	# User interface
	#
	with gr.Blocks() as demo:

	    def update_visibility(file):
	        """Return an update that shows the component only when it holds a file."""
	        return gr.update(visible=bool(file))

	    def _reset_components():
	        """Values that clear the upload, the OCR text and the download file."""
	        return None, '', None

	    # Top row: upload and download on one side, recognised text on the other
	    with gr.Row():
	        with gr.Column():
	            input_file = gr.File(
	                label="Upload an image or a PDF file of a scanned document",
	                height=160
	            )
	            # NOTE: the preview widgets (a gr.Image labelled "Preview" and a
	            # gr.HTML labelled "Status") are currently disabled.
	            output_file = gr.File(
	                label="Download OCR'ed PDF",
	                visible=False # hidden until a result is available
	            )
	        with gr.Column():
	            output_text = gr.Textbox(label="OCR output")

	    # Second row: document language(s) and requested output type
	    with gr.Row():
	        src_langs = gr.Dropdown(
	            label='Language(s) of document',
	            choices=lang_codes.tesseract_lang_codes.items(),
	            multiselect=True,
	            value=['eng', 'fra'],
	            scale=4
	        )
	        output_type = gr.Dropdown(
	            label='Output type',
	            choices=['text', 'pdf', 'text+pdf'],
	            multiselect=False,
	            value='text+pdf',
	            scale=1
	        )

	    # Action buttons
	    with gr.Row():
	        ocr_btn = gr.Button(value="OCR", variant="primary")
	        clear_btn = gr.Button("Clear", variant="secondary")

	    # Components read by, and written by, an OCR run
	    ocr_inputs = [input_file, src_langs, output_type]
	    ocr_outputs = [output_text, output_file]

	    # Sample documents
	    with gr.Accordion("Examples", open=False):
	        examples = gr.Examples(
	            [
	                ['./Non-text-searchable.pdf', ['eng','fra']],
	                ['./sample_ID.jpeg', ['eng','fra']],
	            ],
	            inputs=ocr_inputs,
	            outputs=ocr_outputs,
	            fn=process,
	            cache_examples=False,
	            label="Examples"
	        )

	    # Short documentation
	    with gr.Accordion("Documentation", open=False):
	        gr.Markdown(f"""
	        - Model: using the tesseract package for OCR 1.0 (traditional)
	        """)

	    # NOTE: updating a preview on upload (input_file.change -> preview_file)
	    # is currently disabled, together with the preview widgets.

	    # Run OCR, then reveal the download component if a PDF was produced
	    ocr_btn.click(
	        fn=process,
	        inputs=ocr_inputs,
	        outputs=ocr_outputs
	    ).then(
	        update_visibility,
	        inputs=output_file,
	        outputs=output_file
	    )

	    # Reset everything, then hide the (now empty) download component
	    clear_btn.click(
	        fn=_reset_components,
	        inputs=[],
	        outputs=[input_file, output_text, output_file]
	    ).then(
	        update_visibility,
	        inputs=output_file,
	        outputs=output_file
	    )

	if __name__ == '__main__':
	    demo.launch()