# task2/app.py
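# Assumed dependencies (pip package names inferred from the imports below):
#   gradio, transformers, accelerate, torch, openai-whisper, gTTS, Pillow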
import gradio as gr
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from gtts import gTTS
from tempfile import NamedTemporaryFile
from PIL import Image
import torch
import whisper
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load BLIP-2 model
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
# device_map="auto" already dispatches the weights via accelerate; calling
# .to(device) on a dispatched model raises an error, so it is omitted here.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl", device_map="auto"
)
# Load Whisper model
whisper_model = whisper.load_model("small")
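# "small" (~244M parameters) balances speed and accuracy; "base" or "medium"
# are drop-in alternatives depending on the Space's hardware.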
# Transcribe function
def transcribe(audio_path):
    result = whisper_model.transcribe(audio_path)
    return result["text"]
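# e.g. transcribe("question.wav") returns the recognized speech as a string
# (the file path here is hypothetical).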
# Main function
def ask_image(image, audio):
    question = transcribe(audio)
    # The BLIP-2 FLAN-T5 model card suggests the VQA prompt format
    # "Question: ... Answer:" for question answering.
    prompt = f"Question: {question} Answer:"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
    # Without max_new_tokens, generate() stops after a very short default length.
    generated_ids = model.generate(**inputs, max_new_tokens=50)
    answer = processor.decode(generated_ids[0], skip_special_tokens=True).strip()
    # Synthesize the answer as speech and return the temp-file path
    tts = gTTS(answer)
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts.save(f.name)
        audio_out = f.name
    return answer, audio_out
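# Quick sanity check without the UI (file paths are hypothetical):
#   answer, mp3_path = ask_image(Image.open("photo.jpg"), "question.wav")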
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
    image_input = gr.Image(type="pil", label="Upload an Image")
    # Gradio 4.x renamed `source` to `sources` (a list); Gradio 3.x used source="microphone".
    audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Ask a Question (voice)")
    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Answer in Speech")
    btn = gr.Button("Ask")
    btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])

demo.launch()
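# If model calls are slow on shared hardware, Gradio's request queue can help:
# demo.queue().launch()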