task2 / app.py
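# Dependencies implied by the imports below (inferred from this file, not from a
# pinned requirements list): gradio, transformers, torch, gtts, pillow, openai-whisper.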
import gradio as gr
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from gtts import gTTS
from tempfile import NamedTemporaryFile
from PIL import Image
import torch
import whisper
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP-2 (smaller model)
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

# Load Whisper
whisper_model = whisper.load_model("base")

def transcribe(audio_path):
    # Speech-to-text: transcribe the recorded question with Whisper
    result = whisper_model.transcribe(audio_path)
    return result["text"]

def ask_image(image, audio):
    # 1) Transcribe the spoken question
    question = transcribe(audio)
    # 2) Answer it with BLIP-2 visual question answering on the uploaded image
    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
    generated_ids = model.generate(**inputs)
    answer = processor.decode(generated_ids[0], skip_special_tokens=True)
    # 3) Speak the answer with gTTS and return the temporary mp3 path
    tts = gTTS(answer)
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts.save(f.name)
        audio_out = f.name
    return answer, audio_out

with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
    image_input = gr.Image(type="pil", label="Upload an Image")
    # Note: `source="microphone"` is the Gradio 3.x argument; Gradio 4+ renames it to `sources=["microphone"]`.
    audio_input = gr.Audio(source="microphone", type="filepath", label="Ask a Question (voice)")
    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Answer in Speech")
    btn = gr.Button("Ask")
    btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])

demo.launch()