davanstrien (HF Staff) committed
Commit f55c2c2 · Parent: 83e370e

better description

Files changed (1)
  1. app.py +45 -23
app.py CHANGED
@@ -21,18 +21,26 @@ try:
     MODELS["RolmOCR"] = AutoModelForImageTextToText.from_pretrained(
         "reducto/RolmOCR", torch_dtype=torch.bfloat16, device_map="auto"
     )
-    PIPELINES["RolmOCR"] = pipeline("image-text-to-text", model=MODELS["RolmOCR"], processor=PROCESSORS["RolmOCR"])
+    PIPELINES["RolmOCR"] = pipeline(
+        "image-text-to-text", model=MODELS["RolmOCR"], processor=PROCESSORS["RolmOCR"]
+    )
 except Exception as e:
     MODEL_LOAD_ERROR_MSG["RolmOCR"] = f"Failed to load RolmOCR: {str(e)}"
     print(f"Error loading RolmOCR: {e}")
 
 # Load Nanonets-OCR-s
 try:
-    PROCESSORS["Nanonets-OCR-s"] = AutoProcessor.from_pretrained("nanonets/Nanonets-OCR-s")
+    PROCESSORS["Nanonets-OCR-s"] = AutoProcessor.from_pretrained(
+        "nanonets/Nanonets-OCR-s"
+    )
     MODELS["Nanonets-OCR-s"] = AutoModelForImageTextToText.from_pretrained(
         "nanonets/Nanonets-OCR-s", torch_dtype=torch.bfloat16, device_map="auto"
     )
-    PIPELINES["Nanonets-OCR-s"] = pipeline("image-text-to-text", model=MODELS["Nanonets-OCR-s"], processor=PROCESSORS["Nanonets-OCR-s"])
+    PIPELINES["Nanonets-OCR-s"] = pipeline(
+        "image-text-to-text",
+        model=MODELS["Nanonets-OCR-s"],
+        processor=PROCESSORS["Nanonets-OCR-s"],
+    )
 except Exception as e:
     MODEL_LOAD_ERROR_MSG["Nanonets-OCR-s"] = f"Failed to load Nanonets-OCR-s: {str(e)}"
     print(f"Error loading Nanonets-OCR-s: {e}")
@@ -165,7 +173,7 @@ def predict(pil_image, model_name="RolmOCR"):
     if model_name not in PIPELINES:
         error_to_report = MODEL_LOAD_ERROR_MSG.get(
             model_name,
-            f"Model {model_name} could not be initialized or is not available."
+            f"Model {model_name} could not be initialized or is not available.",
         )
         raise RuntimeError(error_to_report)
 
@@ -214,7 +222,9 @@ def run_hf_ocr(image_path, model_name="RolmOCR"):
 
     try:
         pil_image = Image.open(image_path).convert("RGB")
-        ocr_results = predict(pil_image, model_name)  # predict handles model loading and inference
+        ocr_results = predict(
+            pil_image, model_name
+        )  # predict handles model loading and inference
 
         # Parse the output based on the user's example structure
         if (
@@ -289,16 +299,14 @@ def process_files(image_path, xml_path, model_name):
     try:
         img_to_display = Image.open(image_path).convert("RGB")
         hf_ocr_text_output = run_hf_ocr(image_path, model_name)
-
+
         # Create download file for OCR output
         if hf_ocr_text_output and not hf_ocr_text_output.startswith("Error"):
            ocr_filename = f"vlm_ocr_output_{model_name}.txt"
            with open(ocr_filename, "w", encoding="utf-8") as f:
                f.write(hf_ocr_text_output)
            ocr_download = gr.DownloadButton(
-                label="Download VLM OCR",
-                value=ocr_filename,
-                visible=True
+                label="Download VLM OCR", value=ocr_filename, visible=True
            )
     except Exception as e:
         img_to_display = None  # Clear image if it failed to load
@@ -308,16 +316,14 @@ def process_files(image_path, xml_path, model_name):
 
     if xml_path:
         xml_text_output = parse_xml_for_text(xml_path)
-
+
         # Create download file for XML text
         if xml_text_output and not xml_text_output.startswith("Error"):
            xml_filename = "traditional_ocr_output.txt"
            with open(xml_filename, "w", encoding="utf-8") as f:
                f.write(xml_text_output)
            xml_download = gr.DownloadButton(
-                label="Download XML Text",
-                value=xml_filename,
-                visible=True
+                label="Download XML Text", value=xml_filename, visible=True
            )
     else:
         xml_text_output = "No XML file uploaded."
@@ -327,16 +333,28 @@ def process_files(image_path, xml_path, model_name):
         img_to_display = None  # No image to display
         hf_ocr_text_output = "Upload an image to perform OCR."
 
-    return img_to_display, xml_text_output, hf_ocr_text_output, ocr_download, xml_download
+    return (
+        img_to_display,
+        xml_text_output,
+        hf_ocr_text_output,
+        ocr_download,
+        xml_download,
+    )
 
 
 # --- Create Gradio App ---
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# OCR Comparison Tool: Traditional vs VLM-based")
+    gr.Markdown("# 🕰️ OCR Time Machine")
     gr.Markdown(
-        "Compare traditional OCR outputs (ALTO/PAGE XML) with modern Vision-Language Model OCR that produces clean Markdown. "
-        "Upload an image and its XML file to see how VLMs simplify document text extraction."
+        "Travel through time to see how OCR technology has evolved! "
+        "For decades, galleries, libraries, archives, and museums (GLAMs) have used Optical Character Recognition "
+        "to transform digitized books, newspapers, and manuscripts into machine-readable text. Traditional OCR "
+        "produces complex XML formats like ALTO, packed with layout details but difficult to use. "
+        "Now, cutting-edge Vision-Language Models (VLMs) are revolutionizing OCR with simpler, cleaner Markdown output. "
+        "This Space makes it easy to compare these two approaches and see which works best for your historical documents. "
+        "Upload a historical document image and its XML file to compare these approaches side-by-side. "
+        "We'll extract the reading order from your XML for an apples-to-apples comparison of the actual text content."
     )
 
     with gr.Row():
@@ -345,7 +363,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
            choices=AVAILABLE_MODELS,
            value="RolmOCR",
            label="Select OCR Model",
-            info="RolmOCR: Fast extraction, clean readable output | Nanonets-OCR-s: Detailed extraction with tables/math support, outputs structured Markdown"
+            info="RolmOCR: Fast extraction, clean readable output | Nanonets-OCR-s: Detailed extraction with tables/math support, outputs structured Markdown",
        )
        image_input = gr.File(
            label="Upload Image (PNG, JPG, etc.)", type="filepath"
@@ -366,8 +384,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
            show_copy_button=True,
        )
        ocr_download_btn = gr.DownloadButton(
-            label="Download VLM OCR",
-            visible=False
+            label="Download VLM OCR", visible=False
        )
        xml_output_textbox = gr.Textbox(
            label="Traditional OCR (XML Reading Order)",
@@ -376,14 +393,19 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
            show_copy_button=True,
        )
        xml_download_btn = gr.DownloadButton(
-            label="Download XML Text",
-            visible=False
+            label="Download XML Text", visible=False
        )
 
    submit_button.click(
        fn=process_files,
        inputs=[image_input, xml_input, model_selector],
-        outputs=[output_image_display, xml_output_textbox, hf_ocr_output_textbox, ocr_download_btn, xml_download_btn],
+        outputs=[
+            output_image_display,
+            xml_output_textbox,
+            hf_ocr_output_textbox,
+            ocr_download_btn,
+            xml_download_btn,
+        ],
    )
 
    gr.Markdown("---")
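For reference, a minimal sketch of how one of the "image-text-to-text" pipelines built in the first hunk can be invoked on a single image. The predict() helper that actually does this in app.py is not part of this diff, so the chat-style message format, the prompt, and the generation settings below are assumptions based on the transformers pipeline API; "page.png" is a hypothetical input file.

# Sketch only: app.py's predict() may use a different prompt and settings.
from PIL import Image

pil_image = Image.open("page.png").convert("RGB")  # hypothetical input image

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": pil_image},
            {"type": "text", "text": "Transcribe the text on this page."},
        ],
    }
]

# PIPELINES["RolmOCR"] is the pipeline created in the try block above.
outputs = PIPELINES["RolmOCR"](text=messages, max_new_tokens=1024, return_full_text=False)
print(outputs[0]["generated_text"])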
 
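The updated app description says the reading order is extracted from the uploaded XML. The parse_xml_for_text() function referenced in the diff is not shown here, so the following is a hypothetical, ALTO-only sketch of that idea; the function name and behavior are illustrative, not the app's actual implementation.

# Hypothetical sketch; the real parse_xml_for_text() in app.py is not in this diff
# and likely handles more cases (e.g. PAGE XML, error handling).
import xml.etree.ElementTree as ET


def alto_reading_order_text(xml_path: str) -> str:
    """Join the CONTENT of ALTO <String> elements line by line, in document order."""
    root = ET.parse(xml_path).getroot()
    lines = []
    for text_line in root.iter("{*}TextLine"):  # "{*}" matches any namespace (Python 3.8+)
        words = [s.attrib.get("CONTENT", "") for s in text_line.iter("{*}String")]
        lines.append(" ".join(w for w in words if w))
    return "\n".join(lines)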
 
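The diff also condenses the gr.DownloadButton(...) calls onto single lines. The underlying pattern (create the button hidden, then return an updated button with a file value from the callback) can be reproduced in isolation roughly as below; component names and the output filename are illustrative, not taken from app.py.

# Standalone sketch of the hidden-until-ready DownloadButton pattern used above.
import gradio as gr


def save_text(text):
    path = "demo_output.txt"  # illustrative filename
    with open(path, "w", encoding="utf-8") as f:
        f.write(text or "")
    # Returning a DownloadButton instance updates the hidden button in place.
    return gr.DownloadButton(label="Download result", value=path, visible=True)


with gr.Blocks() as demo:
    text_in = gr.Textbox(label="Text to save")
    save_btn = gr.Button("Save")
    download_btn = gr.DownloadButton(label="Download result", visible=False)
    save_btn.click(fn=save_text, inputs=text_in, outputs=download_btn)

demo.launch()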