thorfine committed
Commit 24c917e · verified · 1 Parent(s): 440e3a3

Update app.py

Files changed (1)
  1. app.py +8 -3
app.py CHANGED

@@ -16,7 +16,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
 model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)
 
-# Load Whisper
+# Load Whisper model
 whisper_model = whisper.load_model("base")
 
 def transcribe(audio_path):
@@ -24,11 +24,16 @@ def transcribe(audio_path):
     return result["text"]
 
 def ask_image(image, audio):
-    question = transcribe(audio)
+    question = transcribe(audio)  # Extract question from audio
+
+    # Ensure the question is correctly passed as text input along with the image
     inputs = processor(images=image, text=question, return_tensors="pt").to(device)
+
+    # Generate the answer to the question about the image
     generated_ids = model.generate(**inputs)
     answer = processor.decode(generated_ids[0], skip_special_tokens=True)
 
+    # Convert the answer to speech
     tts = gTTS(answer)
     with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
         tts.save(f.name)
@@ -40,7 +45,7 @@ with gr.Blocks() as demo:
     gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
 
     image_input = gr.Image(type="pil", label="Upload an Image")
-    audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")  # Fixed here
+    audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")
 
     text_output = gr.Textbox(label="Answer")
     audio_output = gr.Audio(label="Answer in Speech")
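
For context, a minimal sketch of app.py as it stands after this commit. The lines the hunks never show are filled in with assumptions: the imports, the whisper_model.transcribe(...) call inside transcribe(), the return value of ask_image(), and the Ask button wiring are plausible guesses, not part of this commit.

import torch
import whisper
import gradio as gr
from gtts import gTTS
from tempfile import NamedTemporaryFile
from transformers import Blip2Processor, Blip2ForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

# BLIP-2 handles the visual question answering
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

# Load Whisper model
whisper_model = whisper.load_model("base")

def transcribe(audio_path):
    # Assumed body: the hunk only shows the return line
    result = whisper_model.transcribe(audio_path)
    return result["text"]

def ask_image(image, audio):
    question = transcribe(audio)  # Extract question from audio

    # Ensure the question is correctly passed as text input along with the image
    inputs = processor(images=image, text=question, return_tensors="pt").to(device)

    # Generate the answer to the question about the image
    generated_ids = model.generate(**inputs)
    answer = processor.decode(generated_ids[0], skip_special_tokens=True)

    # Convert the answer to speech
    tts = gTTS(answer)
    with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts.save(f.name)
        speech_path = f.name

    return answer, speech_path  # assumed return shape: answer text plus speech file path

with gr.Blocks() as demo:
    gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")

    image_input = gr.Image(type="pil", label="Upload an Image")
    audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")

    text_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(label="Answer in Speech")

    # Assumed wiring: the diff never shows how ask_image is connected
    ask_button = gr.Button("Ask")
    ask_button.click(ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])

demo.launch()

One design note the diff does confirm: gr.Audio(type="filepath") hands ask_image a path on disk rather than raw samples, which is the form whisper_model.transcribe() accepts directly.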