davanstrien HF Staff committed on
Commit
82483a0
·
1 Parent(s): ccfce2b

try new ui

Browse files
Files changed (1) hide show
  1. app.py +136 -79
app.py CHANGED
@@ -130,8 +130,7 @@ def parse_alto_xml_for_text(xml_file_path):
130
  for text_line in root.findall(f".//{ns_prefix}TextLine"):
131
  line_text_parts = []
132
  for string_element in text_line.findall(f"{ns_prefix}String"):
133
- text = string_element.get("CONTENT")
134
- if text:
135
  line_text_parts.append(text)
136
  if line_text_parts:
137
  full_text_lines.append(" ".join(line_text_parts))
@@ -193,7 +192,6 @@ def predict(pil_image, model_name="RolmOCR"):
193
  ],
194
  }
195
  ]
196
- max_tokens = 8096
197
  else: # Nanonets-OCR-s
198
  messages = [
199
  {
@@ -207,8 +205,7 @@ def predict(pil_image, model_name="RolmOCR"):
207
  ],
208
  }
209
  ]
210
- max_tokens = 8096
211
-
212
  # Use the pipeline with the properly formatted messages
213
  return selected_pipe(messages, max_new_tokens=max_tokens)
214
 
@@ -347,7 +344,7 @@ def process_files(image_path, xml_path, model_name):
347
  with gr.Blocks() as demo:
348
  gr.Markdown("# 🕰️ OCR Time Machine")
349
  gr.Markdown(
350
- "Travel through time to see how OCR technology has evolved! "
351
  "For decades, galleries, libraries, archives, and museums (GLAMs) have used Optical Character Recognition "
352
  "to transform digitized books, newspapers, and manuscripts into machine-readable text. Traditional OCR "
353
  "produces complex XML formats like ALTO, packed with layout details but difficult to use. "
@@ -359,43 +356,80 @@ with gr.Blocks() as demo:
359
  "[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)"
360
  )
361
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  with gr.Row():
363
  with gr.Column(scale=1):
364
- model_selector = gr.Radio(
365
- choices=AVAILABLE_MODELS,
366
- value="RolmOCR",
367
- label="Select OCR Model",
368
- # info="RolmOCR: Fast extraction, clean readable output | Nanonets-OCR-s: Detailed extraction with tables/math support, outputs structured Markdown",
369
- )
370
- image_input = gr.File(
371
- label="Upload Image (PNG, JPG, etc.)", type="filepath"
372
- )
373
- xml_input = gr.File(
374
- label="Upload XML File (Optional, ALTO or PAGE format)", type="filepath"
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  )
376
- submit_button = gr.Button("Compare OCR Methods", variant="primary")
377
 
 
 
378
  with gr.Row():
379
  with gr.Column(scale=1):
380
- output_image_display = gr.Image(
381
- label="Uploaded Image", type="pil", interactive=False
382
- )
 
 
383
  with gr.Column(scale=1):
384
- hf_ocr_output_textbox = gr.Markdown(
385
- label="VLM OCR Output (Markdown)",
386
- show_copy_button=True,
387
- )
388
- ocr_download_btn = gr.DownloadButton(
389
- label="Download VLM OCR", visible=False
390
- )
391
- xml_output_textbox = gr.Markdown(
392
- label="Traditional OCR (XML Reading Order)",
393
- interactive=False,
394
- show_copy_button=True,
395
- )
396
- xml_download_btn = gr.DownloadButton(
397
- label="Download XML Text", visible=False
398
- )
 
 
 
 
 
399
 
400
  submit_button.click(
401
  fn=process_files,
@@ -410,62 +444,85 @@ with gr.Blocks() as demo:
410
  )
411
 
412
  gr.Markdown("---")
413
- gr.Markdown("### Try an Example")
414
- gr.Examples(
415
- examples=[
416
- ["examples/one/74442232.3.jpg", "examples/one/74442232.34.xml", "RolmOCR"],
417
- [
418
- "examples/one/74442232.3.jpg",
419
- "examples/one/74442232.34.xml",
420
- "Nanonets-OCR-s",
 
 
 
 
 
 
 
 
421
  ],
422
- ],
423
- inputs=[image_input, xml_input, model_selector],
424
- outputs=[
425
- output_image_display,
426
- xml_output_textbox,
427
- hf_ocr_output_textbox,
428
- ocr_download_btn,
429
- xml_download_btn,
430
- ],
431
- fn=process_files,
432
- cache_examples=False,
433
- )
434
- gr.Markdown(
435
- "*Example from ['A Medical History of British India'](https://data.nls.uk/data/digitised-collections/a-medical-history-of-british-india/) "
436
- "collection, National Library of Scotland*"
437
- )
438
 
439
  gr.Markdown("---")
440
- gr.Markdown("### Example ALTO XML Snippet (for `String` element extraction):")
441
- gr.Code(
442
- value=(
443
- """<alto xmlns="http://www.loc.gov/standards/alto/v3/alto.xsd">
444
- <Description>...</Description>
445
- <Styles>...</Styles>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
  <Layout>
447
- <Page ID="Page13" PHYSICAL_IMG_NR="13" WIDTH="2394" HEIGHT="3612">
448
  <PrintSpace>
449
- <TextLine WIDTH="684" HEIGHT="108" ID="p13_t1" HPOS="465" VPOS="196">
450
- <String ID="p13_w1" CONTENT="Introduction" HPOS="465" VPOS="196" WIDTH="684" HEIGHT="108" STYLEREFS="font0"/>
451
- </TextLine>
452
- <TextLine WIDTH="1798" HEIGHT="51" ID="p13_t2" HPOS="492" VPOS="523">
453
- <String ID="p13_w2" CONTENT="Britain" HPOS="492" VPOS="523" WIDTH="166" HEIGHT="51" STYLEREFS="font1"/>
454
- <SP WIDTH="24" VPOS="523" HPOS="658"/>
455
- <String ID="p13_w3" CONTENT="1981" HPOS="682" VPOS="523" WIDTH="117" HEIGHT="51" STYLEREFS="font1"/>
456
- <!-- ... more String and SP elements ... -->
457
  </TextLine>
458
- <!-- ... more TextLine elements ... -->
459
  </PrintSpace>
460
  </Page>
461
  </Layout>
462
  </alto>"""
463
- ),
464
- interactive=False,
 
 
 
 
 
 
 
 
 
 
465
  )
466
 
467
  if __name__ == "__main__":
468
- # Removed dummy file creation as it's less relevant for single file focus
469
  print("Attempting to launch Gradio demo...")
470
  print(
471
  "If the Hugging Face model is large, initial startup might take some time due to model download/loading (on first OCR attempt)."
 
130
  for text_line in root.findall(f".//{ns_prefix}TextLine"):
131
  line_text_parts = []
132
  for string_element in text_line.findall(f"{ns_prefix}String"):
133
+ if text := string_element.get("CONTENT"):
 
134
  line_text_parts.append(text)
135
  if line_text_parts:
136
  full_text_lines.append(" ".join(line_text_parts))
 
192
  ],
193
  }
194
  ]
 
195
  else: # Nanonets-OCR-s
196
  messages = [
197
  {
 
205
  ],
206
  }
207
  ]
208
+ max_tokens = 8096
 
209
  # Use the pipeline with the properly formatted messages
210
  return selected_pipe(messages, max_new_tokens=max_tokens)
211
 
 
344
  with gr.Blocks() as demo:
345
  gr.Markdown("# 🕰️ OCR Time Machine")
346
  gr.Markdown(
347
+ "Travel through time to see how OCR technology has evolved! \n\n "
348
  "For decades, galleries, libraries, archives, and museums (GLAMs) have used Optical Character Recognition "
349
  "to transform digitized books, newspapers, and manuscripts into machine-readable text. Traditional OCR "
350
  "produces complex XML formats like ALTO, packed with layout details but difficult to use. "
 
356
  "[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)"
357
  )
358
 
359
+ gr.Markdown("---")
360
+
361
+ # How it works section
362
+ gr.Markdown("## 🚀 How it works")
363
+ gr.Markdown(
364
+ "1. 📤 **Upload Image**: Select a historical document image (JPG, PNG, JP2)\n"
365
+ "2. 📄 **Upload XML** (Optional): Add the corresponding ALTO or PAGE XML file for comparison\n"
366
+ "3. 🤖 **Choose Model**: Select between RolmOCR (fast) or Nanonets-OCR-s (detailed)\n"
367
+ "4. 🔍 **Compare**: Click 'Compare OCR Methods' to process\n"
368
+ "5. 💾 **Download**: Save the results for further analysis"
369
+ )
370
+
371
+ gr.Markdown("---")
372
+
373
+ # Input section
374
+ gr.Markdown("## 📥 Upload Files")
375
  with gr.Row():
376
  with gr.Column(scale=1):
377
+ with gr.Group():
378
+ gr.Markdown("### 📤 Step 1: Upload your document")
379
+ image_input = gr.File(
380
+ label="Historical Document Image",
381
+ type="filepath",
382
+ file_types=["image"],
383
+ )
384
+ xml_input = gr.File(
385
+ label="XML File (Optional - ALTO or PAGE format)",
386
+ type="filepath",
387
+ file_types=[".xml"],
388
+ )
389
+
390
+ with gr.Group():
391
+ gr.Markdown("### 🤖 Step 2: Select OCR Model")
392
+ model_selector = gr.Radio(
393
+ choices=AVAILABLE_MODELS,
394
+ value="RolmOCR",
395
+ label="Choose Model",
396
+ info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support",
397
+ )
398
+
399
+ submit_button = gr.Button(
400
+ "🔍 Compare OCR Methods", variant="primary", size="lg"
401
  )
 
402
 
403
+ # Results section
404
+ gr.Markdown("## 📊 Results")
405
  with gr.Row():
406
  with gr.Column(scale=1):
407
+ with gr.Group():
408
+ gr.Markdown("### 🖼️ Document Image")
409
+ output_image_display = gr.Image(
410
+ label="Uploaded Document", type="pil", interactive=False
411
+ )
412
  with gr.Column(scale=1):
413
+ with gr.Group():
414
+ gr.Markdown("### 🤖 Modern VLM OCR Output")
415
+ hf_ocr_output_textbox = gr.Markdown(
416
+ label="Markdown Format",
417
+ show_copy_button=True,
418
+ )
419
+ ocr_download_btn = gr.DownloadButton(
420
+ label="💾 Download VLM OCR", visible=False, size="sm"
421
+ )
422
+ with gr.Group():
423
+ gr.Markdown("### 📜 Traditional OCR Output")
424
+ xml_output_textbox = gr.Textbox(
425
+ label="XML Reading Order",
426
+ lines=10,
427
+ interactive=False,
428
+ show_copy_button=True,
429
+ )
430
+ xml_download_btn = gr.DownloadButton(
431
+ label="💾 Download XML Text", visible=False, size="sm"
432
+ )
433
 
434
  submit_button.click(
435
  fn=process_files,
 
444
  )
445
 
446
  gr.Markdown("---")
447
+
448
+ # Examples section
449
+ with gr.Group():
450
+ gr.Markdown("## 🎯 Try an Example")
451
+ gr.Examples(
452
+ examples=[
453
+ [
454
+ "examples/one/74442232.3.jpg",
455
+ "examples/one/74442232.34.xml",
456
+ "RolmOCR",
457
+ ],
458
+ [
459
+ "examples/one/74442232.3.jpg",
460
+ "examples/one/74442232.34.xml",
461
+ "Nanonets-OCR-s",
462
+ ],
463
  ],
464
+ inputs=[image_input, xml_input, model_selector],
465
+ outputs=[
466
+ output_image_display,
467
+ xml_output_textbox,
468
+ hf_ocr_output_textbox,
469
+ ocr_download_btn,
470
+ xml_download_btn,
471
+ ],
472
+ fn=process_files,
473
+ cache_examples=False,
474
+ )
475
+ gr.Markdown(
476
+ "*Example from ['A Medical History of British India'](https://data.nls.uk/data/digitised-collections/a-medical-history-of-british-india/) "
477
+ "collection, National Library of Scotland*"
478
+ )
 
479
 
480
  gr.Markdown("---")
481
+
482
+ # Tips section
483
+ with gr.Accordion("💡 Tips & Information", open=False):
484
+ gr.Markdown(
485
+ "### 📚 About ALTO/PAGE XML\n"
486
+ "- **ALTO** (Analyzed Layout and Text Object) and **PAGE** are XML formats that store OCR results with detailed layout information\n"
487
+ "- These files are typically generated by traditional OCR software and include position data for each text element\n"
488
+ "- This tool extracts just the reading order text for easier comparison\n\n"
489
+ "### 🎯 Best Practices\n"
490
+ "- Use high-resolution scans (300+ DPI) for best results\n"
491
+ "- Historical documents with clear text work best\n"
492
+ "- The VLM models can handle complex layouts, tables, and mathematical notation\n\n"
493
+ "### ⏱️ Processing Time\n"
494
+ "- RolmOCR: ~5-10 seconds per page\n"
495
+ "- Nanonets-OCR-s: ~10-20 seconds per page (more detailed analysis)\n\n"
496
+ "### 📄 Example ALTO XML Structure"
497
+ )
498
+ gr.Code(
499
+ value=(
500
+ """<alto xmlns="http://www.loc.gov/standards/alto/v3/alto.xsd">
501
  <Layout>
502
+ <Page>
503
  <PrintSpace>
504
+ <TextLine>
505
+ <String CONTENT="Hello World"/>
 
 
 
 
 
 
506
  </TextLine>
 
507
  </PrintSpace>
508
  </Page>
509
  </Layout>
510
  </alto>"""
511
+ ),
512
+ interactive=False,
513
+ )
514
+
515
+ # Footer
516
+ gr.Markdown("---")
517
+ gr.Markdown(
518
+ "<center>\n\n"
519
+ "Built with ❤️ for the GLAM community | "
520
+ "[Learn more about OCR formats](https://www.loc.gov/standards/alto/) | "
521
+ "Questions? [Open an issue](https://github.com/davanstrien/ocr-playground/issues)\n\n"
522
+ "</center>"
523
  )
524
 
525
  if __name__ == "__main__":
 
526
  print("Attempting to launch Gradio demo...")
527
  print(
528
  "If the Hugging Face model is large, initial startup might take some time due to model download/loading (on first OCR attempt)."