thorfine committed (verified)
Commit a77e40f · 1 Parent(s): a4fdd86

Update app.py

Files changed (1)
  1. app.py +23 -20
app.py CHANGED
@@ -12,9 +12,9 @@ os.system("apt-get install -y ffmpeg")
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load BLIP-2 model (smaller variant)
+# Load BLIP-2 model
 processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
-model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)
+model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16).to(device)
 
 # Load Whisper model
 whisper_model = whisper.load_model("base")
@@ -24,18 +24,20 @@ def transcribe(audio_path):
     return result["text"]
 
 def ask_image(image, audio):
-    question = transcribe(audio)  # Extract question from audio
-
-    # Ensure the question is correctly passed as text input along with the image
-    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
-
-    # Generate the answer to the question about the image
-    generated_ids = model.generate(**inputs)
-    answer = processor.decode(generated_ids[0], skip_special_tokens=True)
-
+    # Transcribe the audio question
+    question = transcribe(audio)
+    print(f"Question: {question}")
+
+    # Prepare inputs with both image and question
+    inputs = processor(image, question, return_tensors="pt").to(device, torch.float16)
+
+    # Generate response
+    generated_ids = model.generate(**inputs, max_new_tokens=100)
+    answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
     print(f"Answer: {answer}")
 
-    # Convert the answer to speech
+    # Convert answer to speech
     tts = gTTS(answer)
     with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
         tts.save(f.name)
@@ -45,14 +47,15 @@ def ask_image(image, audio):
 
 with gr.Blocks() as demo:
     gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
-
-    image_input = gr.Image(type="pil", label="Upload an Image")
-    audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")
-
+
+    with gr.Row():
+        image_input = gr.Image(type="pil", label="Upload an Image")
+        audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")
+
+    btn = gr.Button("Ask the Image")
     text_output = gr.Textbox(label="Answer")
-    audio_output = gr.Audio(label="Answer in Speech")
-
-    btn = gr.Button("Ask")
+    audio_output = gr.Audio(label="Answer in Speech", autoplay=True)
+
     btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])
 
-demo.launch()
+demo.launch()
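
For context, the first hunk starts at line 12 and the hunk headers only show os.system("apt-get install -y ffmpeg") and def transcribe(audio_path): as surrounding context, so the import block and the body of transcribe are not part of this commit. Below is a minimal sketch of what those unshown parts presumably look like, inferred from the names the hunks use; the module paths and the transcribe body are assumptions, not taken from the diff.

# Sketch of the parts of app.py not shown in the diff -- inferred, not from the commit.

# Lines above the first hunk: imports plus the ffmpeg install the hunk header references.
import os
import torch
import whisper
import gradio as gr
from gtts import gTTS
from tempfile import NamedTemporaryFile
from transformers import Blip2Processor, Blip2ForConditionalGeneration

os.system("apt-get install -y ffmpeg")  # Whisper shells out to ffmpeg to decode audio files

# Assumed body of transcribe(); the diff only shows its signature (hunk header) and return line.
def transcribe(audio_path):
    result = whisper_model.transcribe(audio_path)
    return result["text"]

One usage note on the new half-precision lines: loading with torch_dtype=torch.float16 and casting inputs via .to(device, torch.float16) assumes a CUDA device is available; on CPU-only hardware, half-precision generation generally fails or is very slow, so the dtype arguments would typically be dropped there.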