# task2/app.py
# Author: Abdul Mutaal
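"""Ask-the-Image demo: upload an image, ask a spoken question, and hear the answer.

Pipeline: Whisper transcribes the recorded question, BLIP-2 answers it from the
uploaded image, and gTTS converts the answer back to speech.
"""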
import torch
from PIL import Image
import gradio as gr
from transformers import Blip2Processor, Blip2ForConditionalGeneration, pipeline
from gtts import gTTS
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load BLIP-2
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)
# Load Whisper pipeline for speech-to-text
whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")
# Store image globally
current_image = {"image": None}
def load_image(image):
    # Cache the uploaded image so later voice questions can refer to it.
    current_image["image"] = image
    if image is None:
        return "Image cleared. Please upload an image to continue."
    return "Image uploaded. Now ask a question via voice."
def ask_question(audio):
    if current_image["image"] is None:
        return "Please upload an image first.", None
    if audio is None:
        return "No audio received. Please record a question.", None
    # Transcribe the spoken question with Whisper
    question = whisper_pipe(audio)["text"]
    # Ask BLIP-2 about the current image. (The BLIP-2 model card formats VQA
    # prompts as "Question: ... Answer:", which may improve answers.)
    inputs = processor(images=current_image["image"], text=question, return_tensors="pt").to(
        device, torch.float16 if device == "cuda" else torch.float32
    )
    output = model.generate(**inputs, max_new_tokens=100)
    answer = processor.decode(output[0], skip_special_tokens=True).strip()
    # Convert the answer to speech with gTTS
    tts = gTTS(answer)
    tts.save("answer.mp3")
    return f"Q: {question}\nA: {answer}", "answer.mp3"
# Gradio UI
with gr.Blocks() as app:
    gr.Markdown("# 🧠🖼️ Ask-the-Image with BLIP-2 + Whisper + gTTS")
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload Image")
        image_status = gr.Textbox(label="Status", interactive=False)
    # Note: source="microphone" is the Gradio 3.x argument; Gradio 4.x renamed it to sources=["microphone"].
    audio_input = gr.Audio(source="microphone", type="filepath", label="Ask a Question (voice)")
    output_text = gr.Textbox(label="Q&A", lines=4)
    output_audio = gr.Audio(label="Answer (speech)")

    image_input.change(fn=load_image, inputs=image_input, outputs=image_status)
    audio_input.change(fn=ask_question, inputs=audio_input, outputs=[output_text, output_audio])

app.launch()
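# When running locally, app.launch(share=True) can be used to expose a temporary
# public URL; on Hugging Face Spaces the plain launch() above is sufficient.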