import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import torch
import re

# Load the Kosmos-2 model and its processor once at startup
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
model.eval()


def clean_caption(caption):
    # Remove non-alphanumeric characters and extra whitespace, capitalize the result
    return re.sub(r'[^\w\s]', '', caption).strip().capitalize()


def grounding(image, prompt):
    # Kosmos-2 prompts for grounded captioning typically start with "<grounding>",
    # e.g. "<grounding> An image of"
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=256)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # The decoded text still contains <phrase>/<object>/<patch_index_*> grounding tags;
    # post_process_generation strips them and returns (caption, entities)
    caption, _entities = processor.post_process_generation(generated_text)
    return clean_caption(caption)


# Gradio interface
gr.Interface(
    fn=grounding,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Text Prompt", placeholder="<grounding> An image of"),
    ],
    outputs="text",
    title="Image to Text Generation",
    description="Kosmos-2: Upload an image and provide a text prompt for grounded captioning.",
).launch()
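

# --- Optional sketch (not part of the app above): surfacing the grounded boxes ---
# The app discards the entities that post_process_generation extracts alongside the
# caption. As a rough sketch, a variant of grounding() could return them too; the
# name grounding_with_boxes is illustrative, and to use it you would pass
# fn=grounding_with_boxes to gr.Interface instead of fn=grounding.
def grounding_with_boxes(image, prompt):
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=256)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    caption, entities = processor.post_process_generation(generated_text)
    # entities is a list of (phrase, (start, end), [(x1, y1, x2, y2), ...]) tuples,
    # with box coordinates normalized to [0, 1]
    boxes = "\n".join(f"{phrase}: {bboxes}" for phrase, _span, bboxes in entities)
    return f"{clean_caption(caption)}\n\n{boxes}"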