CultriX committed on
Commit
1ea570a
·
verified ·
1 Parent(s): 8a0a8c6

First Commit

Browse files
Files changed (3) hide show
  1. README.md +18 -14
  2. app.py +79 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,14 +1,18 @@
1
- ---
2
- title: Easy OCR
3
- emoji: 🔥
4
- colorFrom: green
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 5.34.1
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: GPU-Accelerated OCR
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
+ # ZeroGPU OCR PDF Extractor
2
+
3
+ **Key features**
4
+
5
+ * ⚡️ *On‑demand GPU* — the `@spaces.GPU` decorator grabs a GPU only for the duration of each extraction call. Perfect for Hugging Face **ZeroGPU** Spaces.
6
+ * 📝 Combines native PDF text (via **pdfplumber**) with OCR from images (via **EasyOCR**).
7
+ * 🌍 Multilingual: add language codes to the `LANGS` list in `app.py`.
8
+
9
+ ## Deploy
10
+
11
+ 1. Create a *Gradio* Space and pick **ZeroGPU** in the **Hardware** dropdown (requires a PRO subscription).
12
+ 2. Upload these files or the ZIP bundle.
13
+ 3. Commit — the Space will build automatically. The first call downloads EasyOCR model weights (~200 MB).
14
+
15
+ ## Usage Tips
16
+
17
+ * Large PDFs can take several minutes, so the decorator is set to `duration=600` (seconds). Raise it if extraction of your documents times out.
18
+ * For faster queues, lower the duration if your documents are small.
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ ZeroGPU‑ready OCR PDF extractor for HuggingFace Spaces
4
+ -----------------------------------------------------
5
+ • Uses @spaces.GPU to request a GPU only while needed (ZeroGPU compatible)
6
+ • Extracts native text with `pdfplumber`
7
+ • Runs GPU‑accelerated OCR on page images with `EasyOCR`
8
+ """
9
+
10
+ import gradio as gr
11
+ import fitz # PyMuPDF
12
+ import pdfplumber
13
+ import easyocr
14
+ import torch
15
+ import tempfile
16
+ import os
17
+ import spaces # <-- ZeroGPU decorator
18
+
19
# Shared EasyOCR reader. It stays None at import time and is built by the
# first extract_text() call, i.e. only once a GPU has been allocated.
READER = None

# Language codes handed to easyocr.Reader; append more codes as desired.
LANGS = ["en"]
22
+
23
def _native_text_chunks(pdf_path):
    """Return one labelled chunk per page that has an embedded text layer."""
    chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""
            if text.strip():
                chunks.append(f"--- Page {page_no} (native) ---\n{text}\n")
    return chunks


def _ocr_text_chunks(pdf_path, reader):
    """Render every page to a PNG and return one labelled chunk per page with OCR text."""
    chunks = []
    # A private scratch directory keeps concurrent requests from overwriting
    # each other's page images and guarantees cleanup even if OCR raises.
    # The document is opened as a context manager so it is always closed.
    with tempfile.TemporaryDirectory() as scratch_dir, fitz.open(pdf_path) as doc:
        for page_no, page in enumerate(doc, start=1):
            # Matrix(2, 2) is a 2x zoom over PyMuPDF's 72 dpi default (~144 dpi).
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            image_path = os.path.join(scratch_dir, f"page_{page_no}.png")
            pix.save(image_path)

            lines = reader.readtext(image_path, detail=0)
            # Drop the image right away so large PDFs do not fill the disk.
            os.remove(image_path)

            if any(line.strip() for line in lines):
                ocr_text = "\n".join(lines)
                chunks.append(f"--- Page {page_no} (OCR) ---\n{ocr_text}\n")
    return chunks


@spaces.GPU(duration=600)  # request a GPU for up to 10 min per call
def extract_text(pdf_file):
    """Extract text (native + OCR) from an uploaded PDF.

    Args:
        pdf_file: The upload handed over by Gradio. Either an object exposing
            the file path as ``.name`` or a plain path string; ``None`` when
            nothing was uploaded.

    Returns:
        A single string with all native-text chunks followed by all OCR
        chunks, each headed by ``--- Page N (native|OCR) ---``. Every page is
        OCR'd regardless of whether it has native text, so text may appear in
        both sections. A warning message is returned when no file was given
        or no text was found.
    """
    global READER

    if pdf_file is None:
        return "⚠️ Please upload a PDF file first."

    # Accept both file-like uploads (path in `.name`) and bare path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Initialise the EasyOCR reader lazily, after the GPU becomes available,
    # and reuse it across calls.
    if READER is None:
        READER = easyocr.Reader(LANGS, gpu=torch.cuda.is_available())

    # Pass 1: native text via pdfplumber. Pass 2: OCR of rendered pages.
    native_chunks = _native_text_chunks(pdf_path)
    ocr_chunks = _ocr_text_chunks(pdf_path, READER)

    combined = "\n".join(native_chunks + ocr_chunks)
    return combined or "⚠️ No text detected in the document."
59
+
60
# Markdown blurb passed to the Gradio interface as its description.
DESCRIPTION = " ".join(
    [
        "Drop a PDF to extract **all** text.",
        "Native PDF text is captured first; any remaining text in images is",
        "recognized using EasyOCR. On ZeroGPU hardware, the app requests a",
        "GPU *only* while OCR is running.",
    ]
)
66
+
67
# Single-function UI: one PDF upload in, one copyable text box out.
iface = gr.Interface(
    fn=extract_text,
    # Only offer PDFs in the file picker; extract_text cannot parse other formats.
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Extracted Text", show_copy_button=True),
    title="ZeroGPU OCR PDF Extractor",
    description=DESCRIPTION,
    # NOTE(review): newer Gradio releases appear to prefer `flagging_mode`;
    # kept as-is for compatibility with the pinned `gradio>=4.0` — confirm.
    allow_flagging="never",
    examples=None,
    theme="default",
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio>=4.0
2
+ easyocr>=1.7.1
3
+ torch>=2.0
4
+ pdfplumber>=0.10.3
5
+ PyMuPDF>=1.23.9
6
+ spaces