import gradio as gr
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from gtts import gTTS
from tempfile import NamedTemporaryFile
from PIL import Image
import torch
import whisper
import os
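
# Whisper shells out to ffmpeg to decode audio. This apt-get call assumes a
# root-capable environment such as a Colab notebook or a Hugging Face Space;
# install ffmpeg manually elsewhere.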
os.system("apt-get install -y ffmpeg")

device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 halves GPU memory, but half-precision generation is unreliable on
# CPU, so fall back to float32 there.
dtype = torch.float16 if device == "cuda" else torch.float32
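
# Load the BLIP-2 processor and model (Flan-T5-XL variant) for visual question answering.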
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl", torch_dtype=dtype
).to(device)
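
# Whisper's "base" checkpoint is a reasonable speed/accuracy trade-off for short voice questions.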
whisper_model = whisper.load_model("base")


def transcribe(audio_path):
    result = whisper_model.transcribe(audio_path)
    return result["text"]


def ask_image(image, audio):
    # Guard against missing inputs so the app fails gracefully instead of crashing.
    if image is None or audio is None:
        return "Please upload an image and record a question.", None

    question = transcribe(audio)
    print(f"Question: {question}")
    prompt = f"Question: {question} Answer:"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, dtype)
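
    # Generate up to 100 new tokens and decode them into a plain string.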
    generated_ids = model.generate(**inputs, max_new_tokens=100)
    answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(f"Answer: {answer}")
    tts = gTTS(answer)
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts.save(f.name)
        audio_out = f.name

    return answer, audio_out
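

# Wire everything into a Gradio Blocks UI: image + voice in, text + speech out.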
with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")

    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload an Image")
        audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")

    btn = gr.Button("Ask the Image")
    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Answer in Speech", autoplay=True)

    btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])

demo.launch()