# task2 / app.py
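# Voice-driven visual question answering demo: record a spoken question about an
# uploaded image, transcribe it with Whisper, answer it with BLIP-2, and read the
# answer back with gTTS.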
import gradio as gr
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from gtts import gTTS
from tempfile import NamedTemporaryFile
from PIL import Image
import torch
import whisper
import os
# Ensure ffmpeg is available (Whisper needs it to decode audio files)
os.system("apt-get install -y ffmpeg")
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load BLIP-2 model (smaller variant)
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)
# Load Whisper model
whisper_model = whisper.load_model("base")
def transcribe(audio_path):
    # Transcribe the recorded question to text with Whisper
    result = whisper_model.transcribe(audio_path)
    return result["text"]
def ask_image(image, audio):
    question = transcribe(audio)  # Extract the question from the recorded audio
    # BLIP-2 answers visual questions more reliably with the "Question: ... Answer:" prompt format
    prompt = f"Question: {question} Answer:"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
    # Generate the answer to the question about the image
    generated_ids = model.generate(**inputs)
    answer = processor.decode(generated_ids[0], skip_special_tokens=True).strip()
    print(f"Answer: {answer}")
    # Convert the answer to speech
    tts = gTTS(answer)
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts.save(f.name)
        audio_out = f.name
    return answer, audio_out
with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
    image_input = gr.Image(type="pil", label="Upload an Image")
    audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")
    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Answer in Speech")
    btn = gr.Button("Ask")
    btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])
demo.launch()