Spaces:

snehasanjana
/

image_to_text

Running

snehasanjana committed on Jun 17

Commit

b5d2f15

verified ·

1 Parent(s): cef9e35

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,22 +2,29 @@ import gradio as gr
 from transformers import AutoProcessor, AutoModelForVision2Seq
 from PIL import Image
 import torch
 # Load model and processor
 processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
 model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
 model.eval()
 def grounding(image, prompt):
     inputs = processor(text=prompt, images=image, return_tensors="pt")
     with torch.no_grad():
         generated_ids = model.generate(**inputs, max_new_tokens=256)
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    return generated_text
 gr.Interface(
     fn=grounding,
     inputs=[gr.Image(type="pil"), gr.Textbox(label="Text Prompt")],
     outputs="text",
-    title="Image to text Generation"
 ).launch()

 from transformers import AutoProcessor, AutoModelForVision2Seq
 from PIL import Image
 import torch
+import re
 # Load model and processor
 processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
 model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
 model.eval()
+def clean_caption(caption):
+    # Drop every character that is neither a word character (\w: letters, digits, underscore) nor whitespace, trim leading/trailing whitespace, then str.capitalize() the result (first character upper-cased, all remaining characters lower-cased); internal whitespace runs are left as-is.
+    return re.sub(r'[^\w\s]', '', caption).strip().capitalize()
 # Gradio callback: run the loaded model on one image plus a text prompt and return the cleaned decoded text.
 def grounding(image, prompt):
     # return_tensors="pt" asks the processor for PyTorch tensors, which are unpacked into generate() below.
     inputs = processor(text=prompt, images=image, return_tensors="pt")
     # Inference only, so gradient tracking is disabled; generation is capped at 256 new tokens.
     with torch.no_grad():
         generated_ids = model.generate(**inputs, max_new_tokens=256)
     # Decode the generated ids to text with special tokens skipped; [0] picks the first (presumably only) sequence.
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    return clean_caption(generated_text)
+# Gradio Interface
 gr.Interface(
     fn=grounding,
     inputs=[gr.Image(type="pil"), gr.Textbox(label="Text Prompt")],
     outputs="text",
+    title="Image to Text Generation",
+    description="Kosmos-2: Upload an image and provide a text prompt for grounded captioning."
 ).launch()