thorfine committed
Commit 24c917e · verified · 1 Parent(s): 440e3a3

Update app.py

Files changed (1)
  1. app.py +8 -3
app.py CHANGED

@@ -16,7 +16,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
 model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)
 
-# Load Whisper
+# Load Whisper model
 whisper_model = whisper.load_model("base")
 
 def transcribe(audio_path):
@@ -24,11 +24,16 @@ def transcribe(audio_path):
     return result["text"]
 
 def ask_image(image, audio):
-    question = transcribe(audio)
+    question = transcribe(audio)  # Extract question from audio
+
+    # Ensure the question is correctly passed as text input along with the image
     inputs = processor(images=image, text=question, return_tensors="pt").to(device)
+
+    # Generate the answer to the question about the image
     generated_ids = model.generate(**inputs)
     answer = processor.decode(generated_ids[0], skip_special_tokens=True)
 
+    # Convert the answer to speech
     tts = gTTS(answer)
     with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
         tts.save(f.name)
@@ -40,7 +45,7 @@ with gr.Blocks() as demo:
     gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
 
     image_input = gr.Image(type="pil", label="Upload an Image")
-    audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")  # Fixed here
+    audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")
 
     text_output = gr.Textbox(label="Answer")
     audio_output = gr.Audio(label="Answer in Speech")
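
For context, a minimal sketch of app.py as it stands after this commit. The lines the hunks never show are filled in with assumptions: the imports, the whisper_model.transcribe(...) call inside transcribe(), the return value of ask_image(), and the Ask button wiring are plausible guesses, not part of this commit.

import torch
import whisper
import gradio as gr
from gtts import gTTS
from tempfile import NamedTemporaryFile
from transformers import Blip2Processor, Blip2ForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

# BLIP-2 handles the visual question answering
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

# Load Whisper model
whisper_model = whisper.load_model("base")

def transcribe(audio_path):
    # Assumed body: the hunk only shows the return line
    result = whisper_model.transcribe(audio_path)
    return result["text"]

def ask_image(image, audio):
    question = transcribe(audio)  # Extract question from audio

    # Ensure the question is correctly passed as text input along with the image
    inputs = processor(images=image, text=question, return_tensors="pt").to(device)

    # Generate the answer to the question about the image
    generated_ids = model.generate(**inputs)
    answer = processor.decode(generated_ids[0], skip_special_tokens=True)

    # Convert the answer to speech
    tts = gTTS(answer)
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts.save(f.name)
        speech_path = f.name

    return answer, speech_path  # assumed return shape: answer text plus speech file path

with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")

    image_input = gr.Image(type="pil", label="Upload an Image")
    audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")

    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Answer in Speech")

    # Assumed wiring: the diff never shows how ask_image is connected
    ask_button = gr.Button("Ask")
    ask_button.click(ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])

demo.launch()

One design note the diff does confirm: gr.Audio(type="filepath") hands ask_image a path on disk rather than raw samples, which is the form whisper_model.transcribe() accepts directly.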