# Source: Hugging Face Space "CultriX/Easy-OCR" (running on ZeroGPU).
# File size: 8,393 bytes.

import os
import tempfile
import uuid
import concurrent.futures
from typing import List, Tuple

import fitz  # PyMuPDF for PDF operations
import torch
import gradio as gr
import spaces                # HuggingFace Spaces helper (ZeroGPU)
import easyocr
import warnings

# Suppress benign CuDNN LSTM warning (the pattern is matched against the
# start of the warning message).
warnings.filterwarnings("ignore", "RNN module weights are not part")

# ----------------------------------------------------------------------
# Configuration constants
# ----------------------------------------------------------------------
# File extensions accepted by the upload widget.
SUPPORTED_FILE_TYPES = [
    ".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"
]
# EasyOCR language codes offered in the UI. Simplified Chinese is "ch_sim"
# in EasyOCR; "zh_cn" is not a recognised code and makes Reader() fail.
LANGUAGES = ["en", "nl", "de", "fr", "es", "it", "pt", "ru", "ch_sim", "ja", "ar"]

# Cap parallel OCR threads to avoid GPU OOM. The value is clamped to the
# range 1..2 so that a zero, negative, or non-numeric OCR_THREADS setting
# cannot produce an invalid (empty) thread pool or crash at import time.
try:
    _requested_threads = int(os.getenv("OCR_THREADS", "2"))
except ValueError:
    _requested_threads = 2
OCR_THREADS = max(1, min(_requested_threads, 2))

# ----------------------------------------------------------------------
# EasyOCR reader cache
# ----------------------------------------------------------------------
# Maps a sorted tuple of language codes to the Reader built for it.
_READERS = {}


def _gpu_available() -> bool:
    """
    Decide whether a new EasyOCR Reader should be asked to use the GPU.

    Uses spaces.is_gpu_enabled() when the installed `spaces` package
    provides it, and otherwise falls back to torch.cuda.is_available().
    """
    # NOTE(review): is_gpu_enabled does not appear in the documented
    # `spaces` API; probing for it avoids an AttributeError on first use.
    probe = getattr(spaces, "is_gpu_enabled", None)
    if callable(probe):
        return bool(probe())
    return torch.cuda.is_available()


def get_reader(lang_codes: Tuple[str, ...]):
    """
    Lazily initialize or retrieve an EasyOCR Reader for the given languages.

    Args:
        lang_codes: EasyOCR language codes. Order does not matter; the
            codes are sorted to form the cache key.

    Returns:
        The cached easyocr.Reader for that language set, created on the
        first request.
    """
    key = tuple(sorted(lang_codes))
    reader = _READERS.get(key)
    if reader is None:
        gpu_flag = _gpu_available()
        reader = easyocr.Reader(list(key), gpu=gpu_flag)
        _READERS[key] = reader
        print(f"[Init] EasyOCR reader for {key} (GPU={'yes' if gpu_flag else 'no'})")
    return reader

# ----------------------------------------------------------------------
# OCR helpers
# ----------------------------------------------------------------------
# Minimum EasyOCR confidence for a line to be kept in single-language mode.
_MIN_CONFIDENCE = 0.2


def _read_lines(reader, image, lang_codes: Tuple[str, ...]) -> List[str]:
    """
    Run EasyOCR on one image and return the recognised text lines.

    With exactly one language, detail mode is used and lines whose
    confidence is not above _MIN_CONFIDENCE are dropped; with several
    languages the plain text list is returned unfiltered.
    """
    if len(lang_codes) == 1:
        items = reader.readtext(image, detail=1)
        return [text for _, text, conf in items if conf > _MIN_CONFIDENCE]
    return reader.readtext(image, detail=0)


@spaces.GPU(duration=600)
def run_ocr_pages(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]) -> List[Tuple[int, str]]:
    """
    OCR the specified pages of a PDF.

    Pages are rendered to temporary PNG files and recognised in a small
    thread pool. A failure on one page is reported in that page's text
    instead of aborting the whole batch.

    Args:
        pdf_path: Path of the PDF to read.
        page_ids: 1-based page numbers to OCR.
        lang_codes: EasyOCR language codes for the reader.

    Returns:
        A list of (page number, text) tuples sorted by page number. The
        text is a warning message for pages that failed.
    """
    # Local import: the module's import block does not provide threading.
    import threading

    if not page_ids:
        # Nothing to do; also avoids a zero-worker thread pool.
        return []

    reader = get_reader(lang_codes)
    results = []
    # PyMuPDF does not support concurrent access to a document, so page
    # rendering is serialised; only the OCR step runs in parallel.
    render_lock = threading.Lock()

    with fitz.open(pdf_path) as doc:
        def ocr_page(idx: int) -> Tuple[int, str]:
            img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
            try:
                with render_lock:
                    page = doc[idx - 1]
                    # Adaptive zoom over PyMuPDF's 72 dpi base: 2x (144 dpi)
                    # for pages up to 600 pt on the long side, else 1.5x.
                    scale = 2 if max(page.rect.width, page.rect.height) <= 600 else 1.5
                    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
                    pix.save(img_path)
                return idx, "\n".join(_read_lines(reader, img_path, lang_codes))
            except Exception as e:
                # Emit a warning instead of halting the entire batch
                msg = f"⚠️ OCR error on page {idx}: {e}"
                print(msg)
                return idx, msg
            finally:
                # Remove the rendered image on success and on failure alike.
                try:
                    os.remove(img_path)
                except OSError:
                    pass  # never created, or already gone

        # Cap threadpool size to avoid overloading GPU; always at least one.
        workers = max(1, min(OCR_THREADS, len(page_ids)))
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
            futures = [pool.submit(ocr_page, pid) for pid in page_ids]
            for fut in concurrent.futures.as_completed(futures):
                results.append(fut.result())

    results.sort(key=lambda item: item[0])
    return results


# Decorated like run_ocr_pages so image OCR also requests a ZeroGPU slot.
@spaces.GPU(duration=120)
def run_ocr_image(image_path: str, lang_codes: Tuple[str, ...]) -> str:
    """
    OCR a single image file.

    Args:
        image_path: Path of the image to read.
        lang_codes: EasyOCR language codes for the reader.

    Returns:
        The recognised lines joined with newlines, or a warning message
        if recognition raised an exception.
    """
    reader = get_reader(lang_codes)
    try:
        return "\n".join(_read_lines(reader, image_path, lang_codes))
    except Exception as e:
        msg = f"⚠️ OCR error on image: {e}"
        print(msg)
        return msg

# ----------------------------------------------------------------------
# Streamed output helper
# ----------------------------------------------------------------------
def emit_chunk(chunk: str, combined: str, tmp_file) -> Tuple[str, None]:
    """
    Record one more piece of extracted text.

    The chunk is appended to the running text and its UTF-8 encoding is
    written to tmp_file. Returns the extended text paired with None.
    """
    extended = combined + chunk
    encoded = chunk.encode("utf-8")
    tmp_file.write(encoded)
    return extended, None

# ----------------------------------------------------------------------
# Main extraction pipeline
# ----------------------------------------------------------------------
def pipeline(upload, langs, mode):
    """
    Handles PDF or image uploads, emits native and OCR text incrementally,
    and provides a downloadable .txt at the end.

    Args:
        upload: The uploaded file, either a path string or an object whose
            `.name` attribute is the path.
        langs: One EasyOCR language code or a list of them.
        mode: "native" (embedded PDF text only), "ocr" (OCR every page) or
            "auto" (native text where present, OCR for the other pages).

    Yields:
        (text so far, None) after each chunk, then a final
        (full text or a "no text" notice, path of the .txt file).

    Raises:
        gr.Error: if no file was uploaded, the file exceeds 200 MB, or OCR
            is required but no language is selected.
    """
    if upload is None:
        raise gr.Error("Please upload a file.")
    # Accept both a plain path string and a file object exposing `.name`.
    path = getattr(upload, "name", upload)
    # File-size guard (200MB)
    if os.path.getsize(path) > 200 * 1024 * 1024:
        raise gr.Error("File larger than 200 MB; please split it.")

    ext = os.path.splitext(path)[1].lower()
    want_native = mode in ("native", "auto")
    want_ocr = mode in ("ocr", "auto")

    # Prepare languages; an empty selection is only acceptable when no OCR
    # can run (native-only extraction of a PDF).
    if isinstance(langs, str):
        langs = [langs]
    lang_tuple = tuple(langs or ())
    if not lang_tuple and (ext != ".pdf" or want_ocr):
        raise gr.Error("Please select at least one OCR language.")

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
    combined = ""
    try:
        # PDF flow
        if ext == ".pdf":
            # NOTE(review): Gradio documents receiving gr.Progress through a
            # default-valued parameter; confirm this in-body instance drives
            # the UI progress bar.
            progress = gr.Progress(track_tqdm=False)

            # Phase 1: Native-text extraction & OCR scheduling
            ocr_pages = []
            with fitz.open(path) as doc:
                total_pages = doc.page_count
                for i, page in enumerate(doc, start=1):
                    text = page.get_text("text") if want_native else ""
                    if text.strip():
                        chunk = f"--- Page {i} (native) ---\n{text}\n"
                        combined, _ = emit_chunk(chunk, combined, tmp)
                        yield combined, None
                    elif want_ocr:
                        ocr_pages.append(i)
                    progress(i / total_pages)

            # Phase 2: OCR pass on scheduled pages
            if ocr_pages:
                ocr_results = run_ocr_pages(path, ocr_pages, lang_tuple)
                for idx, txt in sorted(ocr_results, key=lambda x: x[0]):
                    chunk = f"--- Page {idx} (OCR) ---\n{txt}\n"
                    combined, _ = emit_chunk(chunk, combined, tmp)
                    yield combined, None

        # Image flow
        else:
            txt = run_ocr_image(path, lang_tuple)
            chunk = f"--- Image OCR ---\n{txt}\n"
            combined, _ = emit_chunk(chunk, combined, tmp)
            yield combined, None
    except BaseException:
        # Failure or cancellation: no download will be offered, so close
        # and delete the partial output instead of leaving it behind.
        tmp.close()
        try:
            os.remove(tmp.name)
        except OSError:
            pass  # best effort; the original error is what matters
        raise

    tmp.close()
    # Final step: offer download link
    yield combined or "⚠️ No text detected.", tmp.name

# ----------------------------------------------------------------------
# Gradio UI (Blocks + streaming)
# ----------------------------------------------------------------------
theme = gr.themes.Base(primary_hue="purple")

with gr.Blocks(theme=theme, title="ZeroGPU OCR PDF & Image Extractor") as demo:
    gr.Markdown(value="## 📚 ZeroGPU Multilingual OCR Extractor")

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column(scale=1):
            file_in = gr.File(
                label="Upload PDF or image",
                file_types=SUPPORTED_FILE_TYPES,
            )
            lang_in = gr.Dropdown(
                choices=LANGUAGES,
                multiselect=True,
                value=["en"],
                label="OCR language(s)",
            )
            mode_in = gr.Radio(
                choices=["native", "ocr", "auto"],
                value="auto",
                label="Mode",
                info="native=text · ocr=image · auto=mix",
            )
            btn = gr.Button(value="Extract", variant="primary")

        # Right column: streamed text and the final download.
        with gr.Column(scale=2):
            out_txt = gr.Textbox(
                label="Extracted Text",
                lines=18,
                show_copy_button=True,
            )
            dl = gr.File(label="Download .txt")

    # pipeline yields (text, file) pairs, matching the two outputs.
    btn.click(
        fn=pipeline,
        inputs=[file_in, lang_in, mode_in],
        outputs=[out_txt, dl],
    )
    demo.queue()

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()