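"""Ask-the-Image Gradio demo.

Pipeline: Whisper transcribes a spoken question, BLIP-2 answers it from the
uploaded image, and gTTS converts the answer back to speech.
"""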
import gradio as gr
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from gtts import gTTS
from tempfile import NamedTemporaryFile
from PIL import Image
import torch
import whisper
import os

# Ensure ffmpeg is installed (Whisper uses it to decode audio files)
os.system("apt-get install -y ffmpeg")

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP-2 model (smaller variant)
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

# Load Whisper model for speech-to-text
whisper_model = whisper.load_model("base")

def transcribe(audio_path):
    # Convert the recorded question to text with Whisper
    result = whisper_model.transcribe(audio_path)
    return result["text"]

def ask_image(image, audio):
    question = transcribe(audio)  # Extract the question from the recorded audio
    # Pass the question as text input along with the image
    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
    # Generate the answer to the question about the image
    generated_ids = model.generate(**inputs)
    answer = processor.decode(generated_ids[0], skip_special_tokens=True)
    print(f"Answer: {answer}")
    # Convert the answer to speech
    tts = gTTS(answer)
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts.save(f.name)
        audio_out = f.name
    return answer, audio_out

with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
    image_input = gr.Image(type="pil", label="Upload an Image")
    audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")
    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Answer in Speech")
    btn = gr.Button("Ask")
    btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])

demo.launch()