Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
f55c2c2
1
Parent(s):
83e370e
better description
Browse files
app.py
CHANGED
@@ -21,18 +21,26 @@ try:
|
|
21 |
MODELS["RolmOCR"] = AutoModelForImageTextToText.from_pretrained(
|
22 |
"reducto/RolmOCR", torch_dtype=torch.bfloat16, device_map="auto"
|
23 |
)
|
24 |
-
PIPELINES["RolmOCR"] = pipeline("image-text-to-text", model=MODELS["RolmOCR"], processor=PROCESSORS["RolmOCR"])
|
|
|
|
|
25 |
except Exception as e:
|
26 |
MODEL_LOAD_ERROR_MSG["RolmOCR"] = f"Failed to load RolmOCR: {str(e)}"
|
27 |
print(f"Error loading RolmOCR: {e}")
|
28 |
|
29 |
# Load Nanonets-OCR-s
|
30 |
try:
|
31 |
-
PROCESSORS["Nanonets-OCR-s"] = AutoProcessor.from_pretrained("nanonets/Nanonets-OCR-s")
|
|
|
|
|
32 |
MODELS["Nanonets-OCR-s"] = AutoModelForImageTextToText.from_pretrained(
|
33 |
"nanonets/Nanonets-OCR-s", torch_dtype=torch.bfloat16, device_map="auto"
|
34 |
)
|
35 |
-
PIPELINES["Nanonets-OCR-s"] = pipeline("image-text-to-text", model=MODELS["Nanonets-OCR-s"], processor=PROCESSORS["Nanonets-OCR-s"])
|
|
|
|
|
|
|
|
|
36 |
except Exception as e:
|
37 |
MODEL_LOAD_ERROR_MSG["Nanonets-OCR-s"] = f"Failed to load Nanonets-OCR-s: {str(e)}"
|
38 |
print(f"Error loading Nanonets-OCR-s: {e}")
|
@@ -165,7 +173,7 @@ def predict(pil_image, model_name="RolmOCR"):
|
|
165 |
if model_name not in PIPELINES:
|
166 |
error_to_report = MODEL_LOAD_ERROR_MSG.get(
|
167 |
model_name,
|
168 |
-
f"Model {model_name} could not be initialized or is not available."
|
169 |
)
|
170 |
raise RuntimeError(error_to_report)
|
171 |
|
@@ -214,7 +222,9 @@ def run_hf_ocr(image_path, model_name="RolmOCR"):
|
|
214 |
|
215 |
try:
|
216 |
pil_image = Image.open(image_path).convert("RGB")
|
217 |
-
ocr_results = predict(pil_image, model_name)  # predict handles model loading and inference
|
|
|
|
|
218 |
|
219 |
# Parse the output based on the user's example structure
|
220 |
if (
|
@@ -289,16 +299,14 @@ def process_files(image_path, xml_path, model_name):
|
|
289 |
try:
|
290 |
img_to_display = Image.open(image_path).convert("RGB")
|
291 |
hf_ocr_text_output = run_hf_ocr(image_path, model_name)
|
292 |
-
|
293 |
# Create download file for OCR output
|
294 |
if hf_ocr_text_output and not hf_ocr_text_output.startswith("Error"):
|
295 |
ocr_filename = f"vlm_ocr_output_{model_name}.txt"
|
296 |
with open(ocr_filename, "w", encoding="utf-8") as f:
|
297 |
f.write(hf_ocr_text_output)
|
298 |
ocr_download = gr.DownloadButton(
|
299 |
-
label="Download VLM OCR",
|
300 |
-
value=ocr_filename,
|
301 |
-
visible=True
|
302 |
)
|
303 |
except Exception as e:
|
304 |
img_to_display = None # Clear image if it failed to load
|
@@ -308,16 +316,14 @@ def process_files(image_path, xml_path, model_name):
|
|
308 |
|
309 |
if xml_path:
|
310 |
xml_text_output = parse_xml_for_text(xml_path)
|
311 |
-
|
312 |
# Create download file for XML text
|
313 |
if xml_text_output and not xml_text_output.startswith("Error"):
|
314 |
xml_filename = "traditional_ocr_output.txt"
|
315 |
with open(xml_filename, "w", encoding="utf-8") as f:
|
316 |
f.write(xml_text_output)
|
317 |
xml_download = gr.DownloadButton(
|
318 |
-
label="Download XML Text",
|
319 |
-
value=xml_filename,
|
320 |
-
visible=True
|
321 |
)
|
322 |
else:
|
323 |
xml_text_output = "No XML file uploaded."
|
@@ -327,16 +333,28 @@ def process_files(image_path, xml_path, model_name):
|
|
327 |
img_to_display = None # No image to display
|
328 |
hf_ocr_text_output = "Upload an image to perform OCR."
|
329 |
|
330 |
-
return img_to_display, xml_text_output, hf_ocr_text_output, ocr_download, xml_download
|
|
|
|
|
|
|
|
|
|
|
|
|
331 |
|
332 |
|
333 |
# --- Create Gradio App ---
|
334 |
|
335 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
336 |
-
gr.Markdown("# OCR
|
337 |
gr.Markdown(
|
338 |
-
"
|
339 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
340 |
)
|
341 |
|
342 |
with gr.Row():
|
@@ -345,7 +363,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
345 |
choices=AVAILABLE_MODELS,
|
346 |
value="RolmOCR",
|
347 |
label="Select OCR Model",
|
348 |
-
info="RolmOCR: Fast extraction, clean readable output | Nanonets-OCR-s: Detailed extraction with tables/math support, outputs structured Markdown"
|
349 |
)
|
350 |
image_input = gr.File(
|
351 |
label="Upload Image (PNG, JPG, etc.)", type="filepath"
|
@@ -366,8 +384,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
366 |
show_copy_button=True,
|
367 |
)
|
368 |
ocr_download_btn = gr.DownloadButton(
|
369 |
-
label="Download VLM OCR",
|
370 |
-
visible=False
|
371 |
)
|
372 |
xml_output_textbox = gr.Textbox(
|
373 |
label="Traditional OCR (XML Reading Order)",
|
@@ -376,14 +393,19 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
376 |
show_copy_button=True,
|
377 |
)
|
378 |
xml_download_btn = gr.DownloadButton(
|
379 |
-
label="Download XML Text",
|
380 |
-
visible=False
|
381 |
)
|
382 |
|
383 |
submit_button.click(
|
384 |
fn=process_files,
|
385 |
inputs=[image_input, xml_input, model_selector],
|
386 |
-
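outputs=[output_image_display, xml_output_textbox, hf_ocr_output_textbox, ocr_download_btn, xml_download_btn]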
|
|
|
|
|
|
|
|
|
|
|
|
|
387 |
)
|
388 |
|
389 |
gr.Markdown("---")
|
|
|
21 |
MODELS["RolmOCR"] = AutoModelForImageTextToText.from_pretrained(
|
22 |
"reducto/RolmOCR", torch_dtype=torch.bfloat16, device_map="auto"
|
23 |
)
|
24 |
+
PIPELINES["RolmOCR"] = pipeline(
|
25 |
+
"image-text-to-text", model=MODELS["RolmOCR"], processor=PROCESSORS["RolmOCR"]
|
26 |
+
)
|
27 |
except Exception as e:
|
28 |
MODEL_LOAD_ERROR_MSG["RolmOCR"] = f"Failed to load RolmOCR: {str(e)}"
|
29 |
print(f"Error loading RolmOCR: {e}")
|
30 |
|
31 |
# Load Nanonets-OCR-s
|
32 |
try:
|
33 |
+
PROCESSORS["Nanonets-OCR-s"] = AutoProcessor.from_pretrained(
|
34 |
+
"nanonets/Nanonets-OCR-s"
|
35 |
+
)
|
36 |
MODELS["Nanonets-OCR-s"] = AutoModelForImageTextToText.from_pretrained(
|
37 |
"nanonets/Nanonets-OCR-s", torch_dtype=torch.bfloat16, device_map="auto"
|
38 |
)
|
39 |
+
PIPELINES["Nanonets-OCR-s"] = pipeline(
|
40 |
+
"image-text-to-text",
|
41 |
+
model=MODELS["Nanonets-OCR-s"],
|
42 |
+
processor=PROCESSORS["Nanonets-OCR-s"],
|
43 |
+
)
|
44 |
except Exception as e:
|
45 |
MODEL_LOAD_ERROR_MSG["Nanonets-OCR-s"] = f"Failed to load Nanonets-OCR-s: {str(e)}"
|
46 |
print(f"Error loading Nanonets-OCR-s: {e}")
|
|
|
173 |
if model_name not in PIPELINES:
|
174 |
error_to_report = MODEL_LOAD_ERROR_MSG.get(
|
175 |
model_name,
|
176 |
+
f"Model {model_name} could not be initialized or is not available.",
|
177 |
)
|
178 |
raise RuntimeError(error_to_report)
|
179 |
|
|
|
222 |
|
223 |
try:
|
224 |
pil_image = Image.open(image_path).convert("RGB")
|
225 |
+
ocr_results = predict(
|
226 |
+
pil_image, model_name
|
227 |
+
) # predict handles model loading and inference
|
228 |
|
229 |
# Parse the output based on the user's example structure
|
230 |
if (
|
|
|
299 |
try:
|
300 |
img_to_display = Image.open(image_path).convert("RGB")
|
301 |
hf_ocr_text_output = run_hf_ocr(image_path, model_name)
|
302 |
+
|
303 |
# Create download file for OCR output
|
304 |
if hf_ocr_text_output and not hf_ocr_text_output.startswith("Error"):
|
305 |
ocr_filename = f"vlm_ocr_output_{model_name}.txt"
|
306 |
with open(ocr_filename, "w", encoding="utf-8") as f:
|
307 |
f.write(hf_ocr_text_output)
|
308 |
ocr_download = gr.DownloadButton(
|
309 |
+
label="Download VLM OCR", value=ocr_filename, visible=True
|
|
|
|
|
310 |
)
|
311 |
except Exception as e:
|
312 |
img_to_display = None # Clear image if it failed to load
|
|
|
316 |
|
317 |
if xml_path:
|
318 |
xml_text_output = parse_xml_for_text(xml_path)
|
319 |
+
|
320 |
# Create download file for XML text
|
321 |
if xml_text_output and not xml_text_output.startswith("Error"):
|
322 |
xml_filename = "traditional_ocr_output.txt"
|
323 |
with open(xml_filename, "w", encoding="utf-8") as f:
|
324 |
f.write(xml_text_output)
|
325 |
xml_download = gr.DownloadButton(
|
326 |
+
label="Download XML Text", value=xml_filename, visible=True
|
|
|
|
|
327 |
)
|
328 |
else:
|
329 |
xml_text_output = "No XML file uploaded."
|
|
|
333 |
img_to_display = None # No image to display
|
334 |
hf_ocr_text_output = "Upload an image to perform OCR."
|
335 |
|
336 |
+
return (
|
337 |
+
img_to_display,
|
338 |
+
xml_text_output,
|
339 |
+
hf_ocr_text_output,
|
340 |
+
ocr_download,
|
341 |
+
xml_download,
|
342 |
+
)
|
343 |
|
344 |
|
345 |
# --- Create Gradio App ---
|
346 |
|
347 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
348 |
+
gr.Markdown("# 🕰️ OCR Time Machine")
|
349 |
gr.Markdown(
|
350 |
+
"Travel through time to see how OCR technology has evolved! "
|
351 |
+
"For decades, galleries, libraries, archives, and museums (GLAMs) have used Optical Character Recognition "
|
352 |
+
"to transform digitized books, newspapers, and manuscripts into machine-readable text. Traditional OCR "
|
353 |
+
"produces complex XML formats like ALTO, packed with layout details but difficult to use. "
|
354 |
+
"Now, cutting-edge Vision-Language Models (VLMs) are revolutionizing OCR with simpler, cleaner Markdown output. "
|
355 |
+
"This Space makes it easy to compare these two approaches and see which works best for your historical documents. "
|
356 |
+
"Upload a historical document image and its XML file to compare these approaches side-by-side. "
|
357 |
+
"We'll extract the reading order from your XML for an apples-to-apples comparison of the actual text content."
|
358 |
)
|
359 |
|
360 |
with gr.Row():
|
|
|
363 |
choices=AVAILABLE_MODELS,
|
364 |
value="RolmOCR",
|
365 |
label="Select OCR Model",
|
366 |
+
info="RolmOCR: Fast extraction, clean readable output | Nanonets-OCR-s: Detailed extraction with tables/math support, outputs structured Markdown",
|
367 |
)
|
368 |
image_input = gr.File(
|
369 |
label="Upload Image (PNG, JPG, etc.)", type="filepath"
|
|
|
384 |
show_copy_button=True,
|
385 |
)
|
386 |
ocr_download_btn = gr.DownloadButton(
|
387 |
+
label="Download VLM OCR", visible=False
|
|
|
388 |
)
|
389 |
xml_output_textbox = gr.Textbox(
|
390 |
label="Traditional OCR (XML Reading Order)",
|
|
|
393 |
show_copy_button=True,
|
394 |
)
|
395 |
xml_download_btn = gr.DownloadButton(
|
396 |
+
label="Download XML Text", visible=False
|
|
|
397 |
)
|
398 |
|
399 |
submit_button.click(
|
400 |
fn=process_files,
|
401 |
inputs=[image_input, xml_input, model_selector],
|
402 |
+
outputs=[
|
403 |
+
output_image_display,
|
404 |
+
xml_output_textbox,
|
405 |
+
hf_ocr_output_textbox,
|
406 |
+
ocr_download_btn,
|
407 |
+
xml_download_btn,
|
408 |
+
],
|
409 |
)
|
410 |
|
411 |
gr.Markdown("---")
|