import gradio as gr
from transformers import Blip2Processor, Blip2ForConditionalGeneration, BitsAndBytesConfig
from gtts import gTTS
from tempfile import NamedTemporaryFile
from PIL import Image
import torch
import whisper
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load BLIP-2 model
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl",
    device_map="auto",  # accelerate places the weights; a manual .to(device) would conflict here
)
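# Optional memory saver (an assumption, not exercised by this demo): the BitsAndBytesConfig
# imported above can load the model in 8-bit, roughly halving GPU memory (needs bitsandbytes):
# quant_config = BitsAndBytesConfig(load_in_8bit=True)
# model = Blip2ForConditionalGeneration.from_pretrained(
#     "Salesforce/blip2-flan-t5-xl", device_map="auto", quantization_config=quant_config
# )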
# Load Whisper model
whisper_model = whisper.load_model("small")
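# "small" balances speed and accuracy; "tiny", "base", "medium", and "large" are drop-in alternatives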
# Transcribe the recorded question with Whisper
def transcribe(audio_path):
    result = whisper_model.transcribe(audio_path)
    return result["text"]
# Main pipeline: voice question -> transcription -> BLIP-2 answer -> spoken reply
def ask_image(image, audio):
    question = transcribe(audio)
    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
    generated_ids = model.generate(**inputs)
    answer = processor.decode(generated_ids[0], skip_special_tokens=True)
    # gTTS calls Google's TTS endpoint, so this step needs internet access
    tts = gTTS(answer)
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts.save(f.name)
        audio_out = f.name
    return answer, audio_out
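# Quick sanity check without the UI (hypothetical local files, adjust paths as needed):
# print(ask_image(Image.open("sample.jpg"), "question.wav"))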
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
    image_input = gr.Image(type="pil", label="Upload an Image")
    # Gradio 4.x spells this `sources=[...]`; on Gradio 3.x use source="microphone" instead
    audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Ask a Question (voice)")
    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Answer in Speech")
    btn = gr.Button("Ask")
    btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])
demo.launch()
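# launch() serves on localhost by default; demo.launch(share=True) would add a temporary public URL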