Spaces: Running on Zero
Commit · 82483a0
1 Parent(s): ccfce2b
try new ui
app.py CHANGED
@@ -130,8 +130,7 @@ def parse_alto_xml_for_text(xml_file_path):
     for text_line in root.findall(f".//{ns_prefix}TextLine"):
         line_text_parts = []
         for string_element in text_line.findall(f"{ns_prefix}String"):
-            text = string_element.get("CONTENT")
-            if text:
                 line_text_parts.append(text)
         if line_text_parts:
             full_text_lines.append(" ".join(line_text_parts))
@@ -193,7 +192,6 @@ def predict(pil_image, model_name="RolmOCR"):
                 ],
             }
         ]
-        max_tokens = 8096
     else: # Nanonets-OCR-s
         messages = [
             {
@@ -207,8 +205,7 @@ def predict(pil_image, model_name="RolmOCR"):
                 ],
             }
         ]
-
-
     # Use the pipeline with the properly formatted messages
     return selected_pipe(messages, max_new_tokens=max_tokens)
@@ -347,7 +344,7 @@ def process_files(image_path, xml_path, model_name):
 with gr.Blocks() as demo:
     gr.Markdown("# 🕰️ OCR Time Machine")
     gr.Markdown(
-        "Travel through time to see how OCR technology has evolved! "
         "For decades, galleries, libraries, archives, and museums (GLAMs) have used Optical Character Recognition "
         "to transform digitized books, newspapers, and manuscripts into machine-readable text. Traditional OCR "
         "produces complex XML formats like ALTO, packed with layout details but difficult to use. "
@@ -359,43 +356,80 @@ with gr.Blocks() as demo:
         "[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)"
     )

     with gr.Row():
         with gr.Column(scale=1):
             )
-        submit_button = gr.Button("Compare OCR Methods", variant="primary")

     with gr.Row():
         with gr.Column(scale=1):
         with gr.Column(scale=1):

     submit_button.click(
         fn=process_files,
@@ -410,62 +444,85 @@ with gr.Blocks() as demo:
     )

     gr.Markdown("---")
             ],
-    )

     gr.Markdown("---")
   <Layout>
-    <Page
       <PrintSpace>
-        <TextLine
-          <String
-        </TextLine>
-        <TextLine WIDTH="1798" HEIGHT="51" ID="p13_t2" HPOS="492" VPOS="523">
-          <String ID="p13_w2" CONTENT="Britain" HPOS="492" VPOS="523" WIDTH="166" HEIGHT="51" STYLEREFS="font1"/>
-          <SP WIDTH="24" VPOS="523" HPOS="658"/>
-          <String ID="p13_w3" CONTENT="1981" HPOS="682" VPOS="523" WIDTH="117" HEIGHT="51" STYLEREFS="font1"/>
-          <!-- ... more String and SP elements ... -->
         </TextLine>
-      <!-- ... more TextLine elements ... -->
       </PrintSpace>
     </Page>
   </Layout>
 </alto>"""
     )

 if __name__ == "__main__":
-    # Removed dummy file creation as it's less relevant for single file focus
     print("Attempting to launch Gradio demo...")
     print(
         "If the Hugging Face model is large, initial startup might take some time due to model download/loading (on first OCR attempt)."
app.py after the change (new revision line numbers; + marks added lines):

130       for text_line in root.findall(f".//{ns_prefix}TextLine"):
131           line_text_parts = []
132           for string_element in text_line.findall(f"{ns_prefix}String"):
133 +             if text := string_element.get("CONTENT"):
134                   line_text_parts.append(text)
135           if line_text_parts:
136               full_text_lines.append(" ".join(line_text_parts))
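For readers unfamiliar with ALTO, the loop above works because each word sits in the CONTENT attribute of a String element. Below is a small self-contained sketch of the same extraction; the hard-coded namespace and the sample XML are illustrative stand-ins (the app derives ns_prefix from the parsed file):

import xml.etree.ElementTree as ET

# Illustrative ALTO snippet; real files carry many more layout attributes.
SAMPLE_ALTO = """<alto xmlns="http://www.loc.gov/standards/alto/v3/alto.xsd">
  <Layout><Page><PrintSpace>
    <TextLine>
      <String CONTENT="Britain" HPOS="492" VPOS="523" WIDTH="166" HEIGHT="51"/>
      <SP WIDTH="24"/>
      <String CONTENT="1981" HPOS="682" VPOS="523" WIDTH="117" HEIGHT="51"/>
    </TextLine>
  </PrintSpace></Page></Layout>
</alto>"""

root = ET.fromstring(SAMPLE_ALTO)
# Assumed here for the sketch; the app builds this prefix from the root element's namespace.
ns_prefix = "{http://www.loc.gov/standards/alto/v3/alto.xsd}"

full_text_lines = []
for text_line in root.findall(f".//{ns_prefix}TextLine"):
    line_text_parts = []
    for string_element in text_line.findall(f"{ns_prefix}String"):
        # The walrus operator binds CONTENT and skips missing/empty values in one step.
        if text := string_element.get("CONTENT"):
            line_text_parts.append(text)
    if line_text_parts:
        full_text_lines.append(" ".join(line_text_parts))

print(full_text_lines)  # ['Britain 1981']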
192                   ],
193               }
194           ]
195       else: # Nanonets-OCR-s
196           messages = [
197               {
205                   ],
206               }
207           ]
208 +     max_tokens = 8096
209       # Use the pipeline with the properly formatted messages
210       return selected_pipe(messages, max_new_tokens=max_tokens)
211
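The messages being closed off above follow the chat format that transformers' multimodal pipelines expect, and the commit moves max_tokens so it is set once after both model branches. A hedged sketch of what one such request looks like; the prompt wording and the image here are placeholders, not the app's exact values:

from PIL import Image

pil_image = Image.new("RGB", (64, 64), "white")  # stand-in for the uploaded page image

# Hypothetical chat-format request; the app builds a model-specific variant of this.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": pil_image},
            {"type": "text", "text": "Transcribe the text on this page."},
        ],
    }
]
max_tokens = 8096  # applied to both model branches now that it sits after the if/else
# selected_pipe(messages, max_new_tokens=max_tokens) would then run the generation.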
344   with gr.Blocks() as demo:
345       gr.Markdown("# 🕰️ OCR Time Machine")
346       gr.Markdown(
347 +         "Travel through time to see how OCR technology has evolved! \n\n "
348           "For decades, galleries, libraries, archives, and museums (GLAMs) have used Optical Character Recognition "
349           "to transform digitized books, newspapers, and manuscripts into machine-readable text. Traditional OCR "
350           "produces complex XML formats like ALTO, packed with layout details but difficult to use. "
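The only change in this hunk is the trailing "\n\n " on the first string: adjacent string literals are concatenated into a single Markdown string, and the embedded blank line makes the opening sentence render as its own paragraph. A quick illustration:

intro = (
    "Travel through time to see how OCR technology has evolved! \n\n "
    "For decades, galleries, libraries, archives, and museums (GLAMs) have used OCR."
)
# The blank line splits the Markdown into two paragraphs;
# without "\n\n" everything renders as one run-on paragraph.
print(intro)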
356           "[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)"
357       )
358
359 +     gr.Markdown("---")
360 +
361 +     # How it works section
362 +     gr.Markdown("## 🔍 How it works")
363 +     gr.Markdown(
364 +         "1. 📤 **Upload Image**: Select a historical document image (JPG, PNG, JP2)\n"
365 +         "2. 📄 **Upload XML** (Optional): Add the corresponding ALTO or PAGE XML file for comparison\n"
366 +         "3. 🤖 **Choose Model**: Select between RolmOCR (fast) or Nanonets-OCR-s (detailed)\n"
367 +         "4. 🚀 **Compare**: Click 'Compare OCR Methods' to process\n"
368 +         "5. 💾 **Download**: Save the results for further analysis"
369 +     )
370 +
371 +     gr.Markdown("---")
372 +
373 +     # Input section
374 +     gr.Markdown("## 📥 Upload Files")
375       with gr.Row():
376           with gr.Column(scale=1):
377 +             with gr.Group():
378 +                 gr.Markdown("### 📤 Step 1: Upload your document")
379 +                 image_input = gr.File(
380 +                     label="Historical Document Image",
381 +                     type="filepath",
382 +                     file_types=["image"],
383 +                 )
384 +                 xml_input = gr.File(
385 +                     label="XML File (Optional - ALTO or PAGE format)",
386 +                     type="filepath",
387 +                     file_types=[".xml"],
388 +                 )
389 +
390 +             with gr.Group():
391 +                 gr.Markdown("### 🤖 Step 2: Select OCR Model")
392 +                 model_selector = gr.Radio(
393 +                     choices=AVAILABLE_MODELS,
394 +                     value="RolmOCR",
395 +                     label="Choose Model",
396 +                     info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support",
397 +                 )
398 +
399 +             submit_button = gr.Button(
400 +                 "🚀 Compare OCR Methods", variant="primary", size="lg"
401               )
402
403 +     # Results section
404 +     gr.Markdown("## 📊 Results")
405       with gr.Row():
406           with gr.Column(scale=1):
407 +             with gr.Group():
408 +                 gr.Markdown("### 🖼️ Document Image")
409 +                 output_image_display = gr.Image(
410 +                     label="Uploaded Document", type="pil", interactive=False
411 +                 )
412           with gr.Column(scale=1):
413 +             with gr.Group():
414 +                 gr.Markdown("### 🤖 Modern VLM OCR Output")
415 +                 hf_ocr_output_textbox = gr.Markdown(
416 +                     label="Markdown Format",
417 +                     show_copy_button=True,
418 +                 )
419 +                 ocr_download_btn = gr.DownloadButton(
420 +                     label="💾 Download VLM OCR", visible=False, size="sm"
421 +                 )
422 +             with gr.Group():
423 +                 gr.Markdown("### 📜 Traditional OCR Output")
424 +                 xml_output_textbox = gr.Textbox(
425 +                     label="XML Reading Order",
426 +                     lines=10,
427 +                     interactive=False,
428 +                     show_copy_button=True,
429 +                 )
430 +                 xml_download_btn = gr.DownloadButton(
431 +                     label="💾 Download XML Text", visible=False, size="sm"
432 +                 )
433
434       submit_button.click(
435           fn=process_files,
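Both download buttons above are created with visible=False, which suggests process_files returns updated components to reveal them once output files exist. Here is a minimal sketch of that Gradio pattern under that assumption; the handler and component names below are hypothetical, and the real process_files returns five outputs:

import gradio as gr

# Hypothetical handler illustrating the hidden-DownloadButton pattern.
def save_and_reveal(text):
    path = "ocr_output.md"
    with open(path, "w") as f:
        f.write(text or "")
    # Returning a DownloadButton with visible=True un-hides it and points it at the file.
    return gr.DownloadButton(value=path, visible=True)

with gr.Blocks() as sketch:
    ocr_text = gr.Textbox(label="OCR text")
    save_btn = gr.Button("Save")
    download_btn = gr.DownloadButton("💾 Download", visible=False)
    save_btn.click(fn=save_and_reveal, inputs=ocr_text, outputs=download_btn)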
444       )
445
446       gr.Markdown("---")
447 +
448 +     # Examples section
449 +     with gr.Group():
450 +         gr.Markdown("## 🎯 Try an Example")
451 +         gr.Examples(
452 +             examples=[
453 +                 [
454 +                     "examples/one/74442232.3.jpg",
455 +                     "examples/one/74442232.34.xml",
456 +                     "RolmOCR",
457 +                 ],
458 +                 [
459 +                     "examples/one/74442232.3.jpg",
460 +                     "examples/one/74442232.34.xml",
461 +                     "Nanonets-OCR-s",
462 +                 ],
463               ],
464 +             inputs=[image_input, xml_input, model_selector],
465 +             outputs=[
466 +                 output_image_display,
467 +                 xml_output_textbox,
468 +                 hf_ocr_output_textbox,
469 +                 ocr_download_btn,
470 +                 xml_download_btn,
471 +             ],
472 +             fn=process_files,
473 +             cache_examples=False,
474 +         )
475 +         gr.Markdown(
476 +             "*Example from ['A Medical History of British India'](https://data.nls.uk/data/digitised-collections/a-medical-history-of-british-india/) "
477 +             "collection, National Library of Scotland*"
478 +         )
479
480       gr.Markdown("---")
481 +
482 +     # Tips section
483 +     with gr.Accordion("💡 Tips & Information", open=False):
484 +         gr.Markdown(
485 +             "### 📚 About ALTO/PAGE XML\n"
486 +             "- **ALTO** (Analyzed Layout and Text Object) and **PAGE** are XML formats that store OCR results with detailed layout information\n"
487 +             "- These files are typically generated by traditional OCR software and include position data for each text element\n"
488 +             "- This tool extracts just the reading order text for easier comparison\n\n"
489 +             "### 🎯 Best Practices\n"
490 +             "- Use high-resolution scans (300+ DPI) for best results\n"
491 +             "- Historical documents with clear text work best\n"
492 +             "- The VLM models can handle complex layouts, tables, and mathematical notation\n\n"
493 +             "### ⏱️ Processing Time\n"
494 +             "- RolmOCR: ~5-10 seconds per page\n"
495 +             "- Nanonets-OCR-s: ~10-20 seconds per page (more detailed analysis)\n\n"
496 +             "### 📋 Example ALTO XML Structure"
497 +         )
498 +         gr.Code(
499 +             value=(
500 +                 """<alto xmlns="http://www.loc.gov/standards/alto/v3/alto.xsd">
501     <Layout>
502 +     <Page>
503         <PrintSpace>
504 +         <TextLine>
505 +           <String CONTENT="Hello World"/>
506           </TextLine>
507         </PrintSpace>
508       </Page>
509     </Layout>
510   </alto>"""
511 +             ),
512 +             interactive=False,
513 +         )
514 +
515 +     # Footer
516 +     gr.Markdown("---")
517 +     gr.Markdown(
518 +         "<center>\n\n"
519 +         "Built with ❤️ for the GLAM community | "
520 +         "[Learn more about OCR formats](https://www.loc.gov/standards/alto/) | "
521 +         "Questions? [Open an issue](https://github.com/davanstrien/ocr-playground/issues)\n\n"
522 +         "</center>"
523       )
524
525   if __name__ == "__main__":
526       print("Attempting to launch Gradio demo...")
527       print(
528           "If the Hugging Face model is large, initial startup might take some time due to model download/loading (on first OCR attempt)."
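One note on the Examples block added above: fn and outputs are supplied together with cache_examples=False, and with Gradio's defaults (no run_on_click) selecting an example mainly pre-fills the three inputs rather than immediately running OCR. A small self-contained sketch of that wiring, using hypothetical stand-in components and a stand-in handler in place of process_files:

import gradio as gr

# Stand-in handler; in the app this role is played by process_files.
def describe(image_name, xml_name, model):
    return f"{model} would process {image_name} with {xml_name}"

with gr.Blocks() as sketch:
    img = gr.Textbox(label="Image path")
    xml = gr.Textbox(label="XML path")
    model = gr.Radio(["RolmOCR", "Nanonets-OCR-s"], value="RolmOCR", label="Model")
    result = gr.Textbox(label="Result")
    run = gr.Button("Compare")
    run.click(fn=describe, inputs=[img, xml, model], outputs=result)
    gr.Examples(
        examples=[["page.jpg", "page.xml", "RolmOCR"]],
        inputs=[img, xml, model],
        outputs=[result],
        fn=describe,
        cache_examples=False,  # nothing pre-computed; clicking an example fills the inputs
    )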