import gradio as gr
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from gtts import gTTS
from tempfile import NamedTemporaryFile
from PIL import Image
import torch
import whisper
import os

# Ensure ffmpeg is available (Whisper needs it to decode audio files)
os.system("apt-get install -y ffmpeg")

device = "cuda" if torch.cuda.is_available() else "cpu"
# Use half precision on GPU; float16 is poorly supported on CPU, so fall back to float32 there
dtype = torch.float16 if device == "cuda" else torch.float32

# Load the BLIP-2 model and its processor
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl", torch_dtype=dtype).to(device)

# Load Whisper model
whisper_model = whisper.load_model("base")

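# Transcribe spoken audio to text with Whisper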
def transcribe(audio_path):
    result = whisper_model.transcribe(audio_path)
    return result["text"]

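# Full pipeline: transcribe the spoken question, ask BLIP-2 about the image, and speak the answer back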
def ask_image(image, audio):
    # Transcribe the audio question
    question = transcribe(audio)
    print(f"Question: {question}")
    
    # Prepare inputs combining the image and the transcribed question
    inputs = processor(images=image, text=question, return_tensors="pt").to(device, dtype)
    
    # Generate response
    generated_ids = model.generate(**inputs, max_new_tokens=100)
    answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    
    print(f"Answer: {answer}")
    
    # Convert the answer to speech; keep the temporary MP3 (delete=False) so Gradio can serve it
    tts = gTTS(answer)
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts.save(f.name)
        audio_out = f.name

    return answer, audio_out

with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
    
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload an Image")
        audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")
    
    btn = gr.Button("Ask the Image")
    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Answer in Speech", autoplay=True)
    
    btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])

demo.launch()