from tempfile import NamedTemporaryFile

import gradio as gr
import torch
import whisper
from gtts import gTTS
from PIL import Image
from transformers import BitsAndBytesConfig, Blip2ForConditionalGeneration, Blip2Processor
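
# Run on the GPU when one is available; otherwise everything falls back to the CPU.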
device = "cuda" if torch.cuda.is_available() else "cpu"
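
# Load BLIP-2 (Flan-T5-XL variant). device_map="auto" lets accelerate place the weights,
# so no separate .to(device) call is needed.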
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")

# 8-bit loading is optional (it needs the bitsandbytes package) but keeps the
# ~4B-parameter model within a single consumer GPU's memory; skip it on CPU.
quant_config = BitsAndBytesConfig(load_in_8bit=True) if device == "cuda" else None

model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl",
    device_map="auto",
    quantization_config=quant_config,
)
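
# Whisper "small" is a reasonable speed/accuracy trade-off for short spoken questions.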
whisper_model = whisper.load_model("small")


def transcribe(audio_path):
    """Turn the recorded question into text with Whisper."""
    result = whisper_model.transcribe(audio_path)
    return result["text"]


def ask_image(image, audio):
    """Answer a spoken question about an image; return the answer as text and speech."""
    # Speech -> text
    question = transcribe(audio)

    # Image + question -> answer with BLIP-2
    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
    generated_ids = model.generate(**inputs)  # pass max_new_tokens=... for longer answers
    answer = processor.decode(generated_ids[0], skip_special_tokens=True).strip()

    # Text -> speech with gTTS, written to a temporary MP3 that Gradio can play back
    tts = gTTS(answer)
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts.save(f.name)
        audio_out = f.name

    return answer, audio_out
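

# Gradio UI: an image upload and a microphone question in, the answer as text and speech out.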
with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")

    image_input = gr.Image(type="pil", label="Upload an Image")
    # Note: Gradio 4.x renamed this argument to sources=["microphone"].
    audio_input = gr.Audio(source="microphone", type="filepath", label="Ask a Question (voice)")

    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Answer in Speech")

    btn = gr.Button("Ask")
    btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])
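
# Tip: demo.launch(share=True) also serves the demo on a temporary public URL.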
demo.launch()