First Commit
Browse files- README.md +18 -14
- app.py +79 -0
- requirements.txt +6 -0
README.md
CHANGED
@@ -1,14 +1,18 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
1 |
+
# ZeroGPU OCR PDF Extractor
|
2 |
+
|
3 |
+
**Key features**
|
4 |
+
|
5 |
+
* ⚡️ *On‑demand GPU* — the `@spaces.GPU` decorator requests a GPU only for the duration of each extraction call. Perfect for Hugging Face **ZeroGPU** Spaces.
|
6 |
+
* 📝 Combines native PDF text (via **pdfplumber**) with OCR from images (via **EasyOCR**).
|
7 |
+
* 🌍 Multilingual: add language codes to the `LANGS` list in `app.py`.
|
8 |
+
|
9 |
+
## Deploy
|
10 |
+
|
11 |
+
1. Create a *Gradio* Space and pick **ZeroGPU** in the **Hardware** dropdown (requires a PRO subscription).
|
12 |
+
2. Upload these files or the ZIP bundle.
|
13 |
+
3. Commit — the Space will build automatically. The first call downloads EasyOCR model weights (~200 MB).
|
14 |
+
|
15 |
+
## Usage Tips
|
16 |
+
|
17 |
+
* Large PDFs can take several minutes, so the `@spaces.GPU` decorator in `app.py` is set to `duration=600` (seconds). Adjust if needed.
|
18 |
+
* If your documents are small, lower the duration to spend less time in the queue.
|
app.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
"""
|
3 |
+
ZeroGPU‑ready OCR PDF extractor for HuggingFace Spaces
|
4 |
+
-----------------------------------------------------
|
5 |
+
• Uses @spaces.GPU to request a GPU only while needed (ZeroGPU compatible)
|
6 |
+
• Extracts native text with `pdfplumber`
|
7 |
+
• Runs GPU‑accelerated OCR on page images with `EasyOCR`
|
8 |
+
"""
|
9 |
+
|
10 |
+
import gradio as gr
|
11 |
+
import fitz # PyMuPDF
|
12 |
+
import pdfplumber
|
13 |
+
import easyocr
|
14 |
+
import torch
|
15 |
+
import tempfile
|
16 |
+
import os
|
17 |
+
import spaces # <-- ZeroGPU decorator
|
18 |
+
|
19 |
+
# Global reader object (lazy-loaded after GPU is allocated)
READER = None
LANGS = ['en']  # add more language codes as desired


def _resolve_pdf_path(pdf_file):
    """Return the filesystem path for the value received from ``gr.File``.

    Accepts either a path (``str`` / ``os.PathLike``) or an object exposing
    the path through a ``.name`` attribute.

    Raises:
        gr.Error: if nothing was uploaded (``pdf_file`` is ``None``).
    """
    if pdf_file is None:
        # Submitting the form without a file would otherwise crash with an
        # AttributeError on ``None.name``.
        raise gr.Error("Please upload a PDF file first.")
    if isinstance(pdf_file, os.PathLike):
        return os.fspath(pdf_file)
    # Prefer ``.name`` when present (the original contract); fall back to the
    # value itself for plain string paths.
    return getattr(pdf_file, "name", pdf_file)


@spaces.GPU(duration=600)  # request a GPU for up to 10 min per call
def extract_text(pdf_file):
    """Extract text (native + OCR) from an uploaded PDF.

    Two passes are made over the document:

    1. ``pdfplumber`` collects each page's embedded (native) text.
    2. Every page is rendered to a PNG with PyMuPDF and read by EasyOCR.

    Args:
        pdf_file: The uploaded file: a path, or an object whose ``.name``
            attribute is the path of the PDF on disk.

    Returns:
        A string with one ``--- Page N (native) ---`` / ``--- Page N (OCR) ---``
        section per page that produced text (all native sections first, then
        all OCR sections), or a warning message when no text was found.

    Raises:
        gr.Error: if no file was uploaded.
    """
    global READER

    pdf_path = _resolve_pdf_path(pdf_file)

    # Initialise EasyOCR reader after GPU becomes available
    if READER is None:
        READER = easyocr.Reader(LANGS, gpu=torch.cuda.is_available())

    # Pass 1 - native text via pdfplumber
    native_chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for idx, page in enumerate(pdf.pages, start=1):
            txt = page.extract_text() or ""
            if txt.strip():
                native_chunks.append(f"--- Page {idx} (native) ---\n{txt}\n")

    # Pass 2 - OCR each rendered page image with PyMuPDF + EasyOCR.
    # A private temporary directory per call keeps concurrent requests from
    # overwriting each other's ``page_N.png`` and guarantees cleanup even if
    # OCR raises; the ``with`` also closes the PyMuPDF document.
    ocr_chunks = []
    with tempfile.TemporaryDirectory() as tmp_dir, fitz.open(pdf_path) as doc:
        for idx, page in enumerate(doc, start=1):
            # 2x zoom over the 72 dpi PDF default, i.e. roughly 144 dpi
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            tmp_path = os.path.join(tmp_dir, f"page_{idx}.png")
            pix.save(tmp_path)

            ocr_result = READER.readtext(tmp_path, detail=0)
            # Delete eagerly so large PDFs do not accumulate page images.
            os.remove(tmp_path)

            if any(line.strip() for line in ocr_result):
                ocr_text = "\n".join(ocr_result)
                ocr_chunks.append(f"--- Page {idx} (OCR) ---\n{ocr_text}\n")

    combined = "\n".join(native_chunks + ocr_chunks)
    return combined or "⚠️ No text detected in the document."
|
59 |
+
|
60 |
+
# Markdown blurb rendered under the title of the Gradio page.
DESCRIPTION = (
    "Drop a PDF to extract **all** text. "
    "Native PDF text is captured first; any remaining text in images is "
    "recognized using EasyOCR. On ZeroGPU hardware, the app requests a "
    "GPU *only* while OCR is running."
)

# Single-function UI: one file upload in, one copyable text box out.
iface = gr.Interface(
    fn=extract_text,
    # Only offer PDFs in the picker: the extractor opens the upload with
    # pdfplumber/PyMuPDF as a PDF, so other file types would just error out.
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Extracted Text", show_copy_button=True),
    title="ZeroGPU OCR PDF Extractor",
    description=DESCRIPTION,
    allow_flagging="never",
    theme="default",
)

# Start the server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=4.0
|
2 |
+
easyocr>=1.7.1
|
3 |
+
torch>=2.0
|
4 |
+
pdfplumber>=0.10.3
|
5 |
+
PyMuPDF>=1.23.9
|
6 |
+
spaces
|