"""Gradio demo that captions an uploaded image with a ViT+GPT2 pipeline."""

import torch
import gradio as gr
from transformers import pipeline

# Load the ViT+GPT2 image-to-text pipeline with bfloat16 precision.
# The pipeline is created once at module load and reused for every request.
captioner = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    torch_dtype=torch.bfloat16,
)


def generate_caption(image) -> str:
    """Return a generated caption for ``image``.

    Args:
        image: The value of the ``gr.Image(type="pil")`` input component,
            i.e. a PIL image, or ``None`` when nothing has been uploaded.

    Returns:
        The ``"generated_text"`` field of the first pipeline result.

    Raises:
        gr.Error: If no image was provided, so the UI shows a readable
            message instead of an internal pipeline failure.
    """
    # Gradio passes None when the button is clicked with an empty image
    # component; fail early with a user-facing message.
    if image is None:
        raise gr.Error("Please upload an image before generating a caption.")

    outputs = captioner(image)
    return outputs[0]["generated_text"]


# Build the Gradio interface.
with gr.Blocks(theme=gr.themes.Default()) as demo:
    gr.Markdown(
        """
        # 🖼️ Image Caption Generator
        Upload an image to generate a descriptive caption using ViT+GPT2.
        """
    )

    with gr.Row():
        input_image = gr.Image(type="pil", label="Upload Image")
        caption_output = gr.Textbox(label="Generated Caption", lines=2)

    generate_btn = gr.Button("Generate Caption")
    generate_btn.click(
        fn=generate_caption,
        inputs=input_image,
        outputs=caption_output,
    )

    gr.Markdown(
        """
        ---
        Built with 🤗 Transformers (`nlpconnect/vit-gpt2-image-captioning`) and 🚀 Gradio
        """
    )


# Only start the web server when executed as a script, so importing this
# module (e.g. to reuse `generate_caption` or `demo`) has no launch side effect.
if __name__ == "__main__":
    demo.launch()