Spaces:

davanstrien
/

ocr-time-machine

Running on Zero

App Files Files Community

davanstrien HF Staff committed 2 days ago

Commit

83e370e

1 Parent(s): 003891a

improve ui

Browse files

Files changed (1) hide show

app.py +42 -10

app.py CHANGED Viewed

@@ -151,7 +151,7 @@ def parse_xml_for_text(xml_file_path):
         elif xml_format == "ALTO":
             return parse_alto_xml_for_text(xml_file_path)
         else:
-            return f"Error: Unsupported XML format. Expected ALTO or PAGE XML."
     except Exception as e:
         return f"Error determining XML format: {str(e)}"
@@ -282,11 +282,24 @@ def process_files(image_path, xml_path, model_name):
     img_to_display = None
     xml_text_output = "XML not provided or not processed."
     hf_ocr_text_output = "Image not provided or OCR not run."
     if image_path:
         try:
             img_to_display = Image.open(image_path).convert("RGB")
             hf_ocr_text_output = run_hf_ocr(image_path, model_name)
         except Exception as e:
             img_to_display = None  # Clear image if it failed to load
             hf_ocr_text_output = f"Error loading image or running {model_name} OCR: {e}"
@@ -295,6 +308,17 @@ def process_files(image_path, xml_path, model_name):
     if xml_path:
         xml_text_output = parse_xml_for_text(xml_path)
     else:
         xml_text_output = "No XML file uploaded."
@@ -303,16 +327,16 @@ def process_files(image_path, xml_path, model_name):
         img_to_display = None  # No image to display
         hf_ocr_text_output = "Upload an image to perform OCR."
-    return img_to_display, xml_text_output, hf_ocr_text_output
 # --- Create Gradio App ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# OCR Viewer and Extractor")
     gr.Markdown(
-        "Upload an image to perform OCR using a Hugging Face model. "
-        "Optionally, upload its corresponding ALTO or PAGE XML file to compare the extracted text."
     )
     with gr.Row():
@@ -321,7 +345,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 choices=AVAILABLE_MODELS,
                 value="RolmOCR",
                 label="Select OCR Model",
-                info="Choose between RolmOCR (fast, general purpose) or Nanonets-OCR-s (detailed extraction)"
             )
             image_input = gr.File(
                 label="Upload Image (PNG, JPG, etc.)", type="filepath"
@@ -329,7 +353,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             xml_input = gr.File(
                 label="Upload XML File (Optional, ALTO or PAGE format)", type="filepath"
             )
-            submit_button = gr.Button("Process Image and XML", variant="primary")
     with gr.Row():
         with gr.Column(scale=1):
@@ -338,20 +362,28 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             )
         with gr.Column(scale=1):
             hf_ocr_output_textbox = gr.Markdown(
-                label="OCR Output",
                 show_copy_button=True,
             )
             xml_output_textbox = gr.Textbox(
-                label="Text from XML",
                 lines=15,
                 interactive=False,
                 show_copy_button=True,
             )
     submit_button.click(
         fn=process_files,
         inputs=[image_input, xml_input, model_selector],
-        outputs=[output_image_display, xml_output_textbox, hf_ocr_output_textbox],
     )
     gr.Markdown("---")

         elif xml_format == "ALTO":
             return parse_alto_xml_for_text(xml_file_path)
         else:
+            return "Error: Unsupported XML format. Expected ALTO or PAGE XML."
     except Exception as e:
         return f"Error determining XML format: {str(e)}"
     img_to_display = None
     xml_text_output = "XML not provided or not processed."
     hf_ocr_text_output = "Image not provided or OCR not run."
+    ocr_download = gr.DownloadButton(visible=False)
+    xml_download = gr.DownloadButton(visible=False)
     if image_path:
         try:
             img_to_display = Image.open(image_path).convert("RGB")
             hf_ocr_text_output = run_hf_ocr(image_path, model_name)
+            # Create download file for OCR output
+            if hf_ocr_text_output and not hf_ocr_text_output.startswith("Error"):
+                ocr_filename = f"vlm_ocr_output_{model_name}.txt"
+                with open(ocr_filename, "w", encoding="utf-8") as f:
+                    f.write(hf_ocr_text_output)
+                ocr_download = gr.DownloadButton(
+                    label="Download VLM OCR",
+                    value=ocr_filename,
+                    visible=True
+                )
         except Exception as e:
             img_to_display = None  # Clear image if it failed to load
             hf_ocr_text_output = f"Error loading image or running {model_name} OCR: {e}"
     if xml_path:
         xml_text_output = parse_xml_for_text(xml_path)
+        # Create download file for XML text
+        if xml_text_output and not xml_text_output.startswith("Error"):
+            xml_filename = "traditional_ocr_output.txt"
+            with open(xml_filename, "w", encoding="utf-8") as f:
+                f.write(xml_text_output)
+            xml_download = gr.DownloadButton(
+                label="Download XML Text",
+                value=xml_filename,
+                visible=True
+            )
     else:
         xml_text_output = "No XML file uploaded."
         img_to_display = None  # No image to display
         hf_ocr_text_output = "Upload an image to perform OCR."
+    return img_to_display, xml_text_output, hf_ocr_text_output, ocr_download, xml_download
 # --- Create Gradio App ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# OCR Comparison Tool: Traditional vs VLM-based")
     gr.Markdown(
+        "Compare traditional OCR outputs (ALTO/PAGE XML) with modern Vision-Language Model OCR that produces clean Markdown. "
+        "Upload an image and its XML file to see how VLMs simplify document text extraction."
     )
     with gr.Row():
                 choices=AVAILABLE_MODELS,
                 value="RolmOCR",
                 label="Select OCR Model",
+                info="RolmOCR: Fast extraction, clean readable output | Nanonets-OCR-s: Detailed extraction with tables/math support, outputs structured Markdown"
             )
             image_input = gr.File(
                 label="Upload Image (PNG, JPG, etc.)", type="filepath"
             xml_input = gr.File(
                 label="Upload XML File (Optional, ALTO or PAGE format)", type="filepath"
             )
+            submit_button = gr.Button("Compare OCR Methods", variant="primary")
     with gr.Row():
         with gr.Column(scale=1):
             )
         with gr.Column(scale=1):
             hf_ocr_output_textbox = gr.Markdown(
+                label="VLM OCR Output (Markdown)",
                 show_copy_button=True,
             )
+            ocr_download_btn = gr.DownloadButton(
+                label="Download VLM OCR",
+                visible=False
+            )
             xml_output_textbox = gr.Textbox(
+                label="Traditional OCR (XML Reading Order)",
                 lines=15,
                 interactive=False,
                 show_copy_button=True,
             )
+            xml_download_btn = gr.DownloadButton(
+                label="Download XML Text",
+                visible=False
+            )
     submit_button.click(
         fn=process_files,
         inputs=[image_input, xml_input, model_selector],
+        outputs=[output_image_display, xml_output_textbox, hf_ocr_output_textbox, ocr_download_btn, xml_download_btn],
     )
     gr.Markdown("---")