import gradio as gr
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from gtts import gTTS
from tempfile import NamedTemporaryFile
import torch
import whisper
import os

# Whisper needs ffmpeg to decode audio; this install step works in environments
# where apt-get is available (e.g. Colab or a Hugging Face Space)
os.system("apt-get install -y ffmpeg")

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP-2 (the smaller OPT-2.7B variant); fp16 on GPU roughly halves memory use
dtype = torch.float16 if device == "cuda" else torch.float32
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=dtype
).to(device)

# Load Whisper for speech-to-text ("base" balances speed and accuracy;
# "small" or "medium" transcribe better but run slower)
whisper_model = whisper.load_model("base")

def transcribe(audio_path):
    result = whisper_model.transcribe(audio_path)
    return result["text"]

def ask_image(image, audio):
    question = transcribe(audio)  # extract the spoken question from the recording

    # BLIP-2 expects visual-question prompts in the form "Question: ... Answer:"
    prompt = f"Question: {question} Answer:"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, dtype)

    # Generate an answer about the image, capped so responses stay short
    generated_ids = model.generate(**inputs, max_new_tokens=50)
    answer = processor.decode(generated_ids[0], skip_special_tokens=True).strip()

    print(f"Answer: {answer}")
    
    # Convert the answer to speech (gTTS calls Google's TTS service, so it needs internet access)
    tts = gTTS(answer)
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts.save(f.name)
        audio_out = f.name

    return answer, audio_out

with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")

    image_input = gr.Image(type="pil", label="Upload an Image")
    audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")

    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Answer in Speech")

    btn = gr.Button("Ask")
    btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])

if __name__ == "__main__":
    demo.launch()
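
# To run locally (assuming this file is saved as app.py):
#   python app.py
# Gradio serves the UI at http://127.0.0.1:7860 by default; pass
# share=True to launch() for a temporary public link.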