import gradio as gr
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from gtts import gTTS
from tempfile import NamedTemporaryFile
from PIL import Image
import torch
import whisper
import os

# Ensure ffmpeg is available for Whisper's audio decoding (Colab/Spaces-style environment)
os.system("apt-get install -y ffmpeg")

device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 is only reliable on GPU; fall back to float32 on CPU
dtype = torch.float16 if device == "cuda" else torch.float32

# Load the BLIP-2 vision-language model and its processor
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl", torch_dtype=dtype
).to(device)

# Load the Whisper speech-to-text model
whisper_model = whisper.load_model("base")


def transcribe(audio_path):
    result = whisper_model.transcribe(audio_path)
    return result["text"]


def ask_image(image, audio):
    # Transcribe the spoken question
    question = transcribe(audio)
    print(f"Question: {question}")

    # Prepare inputs containing both the image and the question
    inputs = processor(images=image, text=question, return_tensors="pt").to(device, dtype)

    # Generate an answer
    generated_ids = model.generate(**inputs, max_new_tokens=100)
    answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(f"Answer: {answer}")

    # Convert the answer to speech
    tts = gTTS(answer)
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts.save(f.name)
        audio_out = f.name

    return answer, audio_out


with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload an Image")
        audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")
    btn = gr.Button("Ask the Image")
    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Answer in Speech", autoplay=True)
    btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])

demo.launch()