# task2 / app.py
# Author: thorfine — "Update app.py" (commit 48c77e7, verified)
import os
import subprocess
from tempfile import NamedTemporaryFile

import gradio as gr
import torch
import whisper
from gtts import gTTS
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
# Ensure ffmpeg is installed — Whisper shells out to it for audio decoding.
# Use an argument list (no shell string) and check=False: the original
# os.system call also discarded the exit status, so installation remains
# best-effort (e.g. already installed, or no root in the container).
subprocess.run(["apt-get", "install", "-y", "ffmpeg"], check=False)

# Prefer GPU when available. NOTE(review): the model below is loaded in
# float16 unconditionally; on a CPU-only host half-precision generate may be
# slow or unsupported — confirm the Space always has CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP-2 (FLAN-T5-XL variant) for visual question answering.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl", torch_dtype=torch.float16
).to(device)

# Load Whisper (base) for speech-to-text transcription of the user's question.
whisper_model = whisper.load_model("base")
def transcribe(audio_path):
    """Transcribe the audio file at *audio_path* with Whisper and return its text."""
    return whisper_model.transcribe(audio_path)["text"]
def ask_image(image, audio):
    """Answer a spoken question about an uploaded image.

    Parameters
    ----------
    image : PIL.Image.Image or None
        The uploaded image (Gradio passes None if the user gave none).
    audio : str or None
        Filesystem path to the recorded question audio.

    Returns
    -------
    tuple
        (answer text, path to an mp3 of the spoken answer). The audio path
        is None when the inputs were incomplete.
    """
    # Guard: Gradio invokes the handler with None for missing inputs; show a
    # friendly message instead of raising a traceback in the UI.
    if image is None or audio is None:
        return "Please provide both an image and a voice question.", None

    # Transcribe the spoken question to text.
    question = transcribe(audio)
    print(f"Question: {question}")

    # Prepare multimodal inputs; cast to float16 to match the model weights.
    inputs = processor(image, question, return_tensors="pt").to(device, torch.float16)

    # Generate and decode the answer.
    generated_ids = model.generate(**inputs, max_new_tokens=100)
    answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(f"Answer: {answer}")

    # Convert the answer to speech. Reserve a temp file name first and close
    # the handle, then let gTTS write the path — the original saved through a
    # second open on a path whose empty handle was still open. delete=False is
    # deliberate: Gradio streams the file after this function returns.
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        speech_path = f.name
    gTTS(answer).save(speech_path)

    return answer, speech_path
# Build the Gradio interface: image + voice question in, text + speech out.
with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")

    # The two inputs sit side by side in one row.
    with gr.Row():
        img_widget = gr.Image(type="pil", label="Upload an Image")
        mic_widget = gr.Audio(type="filepath", label="Ask a Question (voice)")

    ask_btn = gr.Button("Ask the Image")

    # Outputs: the textual answer plus an auto-playing spoken version.
    answer_box = gr.Textbox(label="Answer")
    answer_audio = gr.Audio(label="Answer in Speech", autoplay=True)

    ask_btn.click(
        fn=ask_image,
        inputs=[img_widget, mic_widget],
        outputs=[answer_box, answer_audio],
    )

demo.launch()