import gradio as gr
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from gtts import gTTS
from tempfile import NamedTemporaryFile
from PIL import Image
import torch
import whisper
import os
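# Pip packages assumed by these imports: gradio, transformers, torch, pillow,
# gTTS, and openai-whisper (which provides the `whisper` module).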
# Ensure ffmpeg is installed
os.system("apt-get install -y ffmpeg")
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load BLIP-2 model (half precision on GPU; full precision on CPU, where fp16 ops are unsupported or very slow)
dtype = torch.float16 if device == "cuda" else torch.float32
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl", torch_dtype=dtype
).to(device)
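# blip2-flan-t5-xl is a multi-gigabyte download; on constrained hardware a smaller
# checkpoint such as "Salesforce/blip2-opt-2.7b" could be substituted.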
# Load Whisper model
whisper_model = whisper.load_model("base")
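# The "base" checkpoint favors speed; "small" or "medium" transcribe more accurately at higher latency.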
def transcribe(audio_path):
    result = whisper_model.transcribe(audio_path)
    return result["text"]
def ask_image(image, audio):
    # Guard against the button being clicked with missing inputs
    if image is None or audio is None:
        return "Please provide both an image and a voice question.", None
    # Transcribe the audio question
    question = transcribe(audio)
    print(f"Question: {question}")
    # Prepare inputs with both image and question (cast to the same dtype as the model)
    inputs = processor(image, question, return_tensors="pt").to(device, dtype)
    # Generate response
    generated_ids = model.generate(**inputs, max_new_tokens=100)
    answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(f"Answer: {answer}")
    # Convert answer to speech
    tts = gTTS(answer)
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts.save(f.name)
        audio_out = f.name
    return answer, audio_out
with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload an Image")
        audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")
    btn = gr.Button("Ask the Image")
    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Answer in Speech", autoplay=True)
    btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])

demo.launch()
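# When running locally, demo.launch(share=True) would additionally expose a temporary public URL.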