Spaces:

davanstrien
/

ocr-time-machine

Running on Zero

App Files Files Community

davanstrien HF Staff committed on Jun 24

Commit

003891a

1 Parent(s): 5639776

add second model

Browse files

Files changed (1) hide show

app.py +84 -42

app.py CHANGED Viewed

@@ -7,16 +7,35 @@ from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
 import spaces
 # --- Global Model and Processor ---
-HF_PROCESSOR = None
-HF_MODEL = None
-HF_PIPE = None
-MODEL_LOAD_ERROR_MSG = None
-HF_PROCESSOR = AutoProcessor.from_pretrained("reducto/RolmOCR")
-HF_MODEL = AutoModelForImageTextToText.from_pretrained(
-    "reducto/RolmOCR", torch_dtype=torch.bfloat16, device_map="auto"
-)
-HF_PIPE = pipeline("image-text-to-text", model=HF_MODEL, processor=HF_PROCESSOR)
 # --- Helper Functions ---
@@ -139,46 +158,63 @@ def parse_xml_for_text(xml_file_path):
 @spaces.GPU
-def predict(pil_image):
-    """Performs OCR prediction using the Hugging Face model."""
-    global HF_PIPE, MODEL_LOAD_ERROR_MSG
-    if HF_PIPE is None:
-        error_to_report = (
-            MODEL_LOAD_ERROR_MSG
-            if MODEL_LOAD_ERROR_MSG
-            else "OCR model could not be initialized."
         )
         raise RuntimeError(error_to_report)
-    # Format the message in the expected structure
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": pil_image},
-                {
-                    "type": "text",
-                    "text": "Return the plain text representation of this document as if you were reading it naturally.\n",
-                },
-            ],
-        }
-    ]
     # Use the pipeline with the properly formatted messages
-    return HF_PIPE(messages, max_new_tokens=8096)
-def run_hf_ocr(image_path):
     """
-    Runs OCR on the provided image using the Hugging Face model (via predict function).
     """
     if image_path is None:
         return "No image provided for OCR."
     try:
         pil_image = Image.open(image_path).convert("RGB")
-        ocr_results = predict(pil_image)  # predict handles model loading and inference
         # Parse the output based on the user's example structure
         if (
@@ -237,10 +273,10 @@ def run_hf_ocr(image_path):
 # --- Gradio Interface Function ---
-def process_files(image_path, xml_path):
     """
     Main function for the Gradio interface.
-    Processes the image for display, runs OCR (Hugging Face model),
     and parses XML if provided.
     """
     img_to_display = None
@@ -250,10 +286,10 @@ def process_files(image_path, xml_path):
     if image_path:
         try:
             img_to_display = Image.open(image_path).convert("RGB")
-            hf_ocr_text_output = run_hf_ocr(image_path)
         except Exception as e:
             img_to_display = None  # Clear image if it failed to load
-            hf_ocr_text_output = f"Error loading image or running HF OCR: {e}"
     else:
         hf_ocr_text_output = "Please upload an image to perform OCR."
@@ -281,6 +317,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         with gr.Column(scale=1):
             image_input = gr.File(
                 label="Upload Image (PNG, JPG, etc.)", type="filepath"
             )
@@ -296,7 +338,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             )
         with gr.Column(scale=1):
             hf_ocr_output_textbox = gr.Markdown(
-                label="OCR Output (Hugging Face Model)",
                 show_copy_button=True,
             )
             xml_output_textbox = gr.Textbox(
@@ -308,7 +350,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     submit_button.click(
         fn=process_files,
-        inputs=[image_input, xml_input],
         outputs=[output_image_display, xml_output_textbox, hf_ocr_output_textbox],
     )

 import spaces
 # --- Global Model and Processor ---
+# Registries keyed by the display names in AVAILABLE_MODELS. A model appears in
+# MODELS / PROCESSORS / PIPELINES only if it loaded completely; otherwise its
+# failure text is stored in MODEL_LOAD_ERROR_MSG under the same key.
+MODELS = {}
+PROCESSORS = {}
+PIPELINES = {}
+MODEL_LOAD_ERROR_MSG = {}
+# Available models
+AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s"]
+# Hub repository id for each entry in AVAILABLE_MODELS.
+_MODEL_REPOS = {
+    "RolmOCR": "reducto/RolmOCR",
+    "Nanonets-OCR-s": "nanonets/Nanonets-OCR-s",
+}
+# Load every model once at import time. Failures are recorded per model rather
+# than raised, so one broken model does not prevent the other from being used.
+for _name in AVAILABLE_MODELS:
+    try:
+        _processor = AutoProcessor.from_pretrained(_MODEL_REPOS[_name])
+        _model = AutoModelForImageTextToText.from_pretrained(
+            _MODEL_REPOS[_name], torch_dtype=torch.bfloat16, device_map="auto"
+        )
+        _pipe = pipeline("image-text-to-text", model=_model, processor=_processor)
+    except Exception as e:
+        # Broad catch is deliberate: any load failure is reported, not fatal.
+        MODEL_LOAD_ERROR_MSG[_name] = f"Failed to load {_name}: {e}"
+        print(f"Error loading {_name}: {e}")
+    else:
+        # Register only after all three steps succeeded, so the registries
+        # never hold a processor or model whose pipeline is missing.
+        PROCESSORS[_name] = _processor
+        MODELS[_name] = _model
+        PIPELINES[_name] = _pipe
 # --- Helper Functions ---
 @spaces.GPU
+def predict(pil_image, model_name="RolmOCR"):
+    """Performs OCR prediction using the selected Hugging Face model.
+
+    Args:
+        pil_image: Image placed in the ``image`` entry of the chat message
+            (run_hf_ocr in this file passes an RGB PIL image).
+        model_name: Key into ``PIPELINES`` selecting which model to run.
+
+    Returns:
+        The value returned by the selected pipeline for the chat-style message.
+
+    Raises:
+        RuntimeError: If ``model_name`` has no entry in ``PIPELINES``. The
+            message is the stored load error for that model when one exists,
+            otherwise a generic "not available" message.
+    """
+    # No ``global`` statement needed: both registries are only read here.
+    if model_name not in PIPELINES:
+        error_to_report = MODEL_LOAD_ERROR_MSG.get(
+            model_name,
+            f"Model {model_name} could not be initialized or is not available."
         )
         raise RuntimeError(error_to_report)
+    selected_pipe = PIPELINES[model_name]
+    # Only the prompt text differs between models; the message layout is shared.
+    if model_name == "RolmOCR":
+        prompt = "Return the plain text representation of this document as if you were reading it naturally.\n"
+    else:  # Nanonets-OCR-s (also used for any other registered model)
+        prompt = "Extract and return all the text from this image. Include all text elements and maintain the reading order. If there are tables, convert them to markdown format. If there are mathematical equations, convert them to LaTeX format."
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": pil_image},
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
+    # NOTE(review): the same limit applies to both models; 8096 may have been
+    # intended as 8192 — confirm before changing.
+    max_tokens = 8096
     # Use the pipeline with the properly formatted messages
+    return selected_pipe(messages, max_new_tokens=max_tokens)
+def run_hf_ocr(image_path, model_name="RolmOCR"):
     """
+    Runs OCR on the provided image using the selected Hugging Face model (via predict function).
     """
     if image_path is None:
         return "No image provided for OCR."
     try:
         pil_image = Image.open(image_path).convert("RGB")
+        ocr_results = predict(pil_image, model_name)  # predict handles model loading and inference
         # Parse the output based on the user's example structure
         if (
 # --- Gradio Interface Function ---
+def process_files(image_path, xml_path, model_name):
     """
     Main function for the Gradio interface.
+    Processes the image for display, runs OCR with selected model,
     and parses XML if provided.
     """
     img_to_display = None
     if image_path:
         try:
             img_to_display = Image.open(image_path).convert("RGB")
+            hf_ocr_text_output = run_hf_ocr(image_path, model_name)
         except Exception as e:
             img_to_display = None  # Clear image if it failed to load
+            hf_ocr_text_output = f"Error loading image or running {model_name} OCR: {e}"
     else:
         hf_ocr_text_output = "Please upload an image to perform OCR."
     with gr.Row():
         with gr.Column(scale=1):
+            model_selector = gr.Radio(
+                choices=AVAILABLE_MODELS,
+                value="RolmOCR",
+                label="Select OCR Model",
+                info="Choose between RolmOCR (fast, general purpose) or Nanonets-OCR-s (detailed extraction)"
+            )
             image_input = gr.File(
                 label="Upload Image (PNG, JPG, etc.)", type="filepath"
             )
             )
         with gr.Column(scale=1):
             hf_ocr_output_textbox = gr.Markdown(
+                label="OCR Output",
                 show_copy_button=True,
             )
             xml_output_textbox = gr.Textbox(
     submit_button.click(
         fn=process_files,
+        inputs=[image_input, xml_input, model_selector],
         outputs=[output_image_display, xml_output_textbox, hf_ocr_output_textbox],
     )