thorfine committed (verified)
Commit a77e40f · 1 Parent(s): a4fdd86

Update app.py

Files changed (1)
  1. app.py +23 -20
app.py CHANGED
@@ -12,9 +12,9 @@ os.system("apt-get install -y ffmpeg")
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load BLIP-2 model (smaller variant)
+# Load BLIP-2 model
 processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
-model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)
+model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16).to(device)
 
 # Load Whisper model
 whisper_model = whisper.load_model("base")
@@ -24,18 +24,20 @@ def transcribe(audio_path):
     return result["text"]
 
 def ask_image(image, audio):
-    question = transcribe(audio)  # Extract question from audio
-
-    # Ensure the question is correctly passed as text input along with the image
-    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
-
-    # Generate the answer to the question about the image
-    generated_ids = model.generate(**inputs)
-    answer = processor.decode(generated_ids[0], skip_special_tokens=True)
-
+    # Transcribe the audio question
+    question = transcribe(audio)
+    print(f"Question: {question}")
+
+    # Prepare inputs with both image and question
+    inputs = processor(image, question, return_tensors="pt").to(device, torch.float16)
+
+    # Generate response
+    generated_ids = model.generate(**inputs, max_new_tokens=100)
+    answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
     print(f"Answer: {answer}")
 
-    # Convert the answer to speech
+    # Convert answer to speech
     tts = gTTS(answer)
     with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
         tts.save(f.name)
@@ -45,14 +47,15 @@ def ask_image(image, audio):
 
 with gr.Blocks() as demo:
     gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
-
-    image_input = gr.Image(type="pil", label="Upload an Image")
-    audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")
-
+
+    with gr.Row():
+        image_input = gr.Image(type="pil", label="Upload an Image")
+        audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")
+
+    btn = gr.Button("Ask the Image")
     text_output = gr.Textbox(label="Answer")
-    audio_output = gr.Audio(label="Answer in Speech")
-
-    btn = gr.Button("Ask")
+    audio_output = gr.Audio(label="Answer in Speech", autoplay=True)
+
     btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])
 
-demo.launch()
+demo.launch()
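
For context, the first hunk starts at line 12 and the hunk headers only show os.system("apt-get install -y ffmpeg") and def transcribe(audio_path): as surrounding context, so the import block and the body of transcribe are not part of this commit. Below is a minimal sketch of what those unshown parts presumably look like, inferred from the names the hunks use; the module paths and the transcribe body are assumptions, not taken from the diff.

# Sketch of the parts of app.py not shown in the diff -- inferred, not from the commit.

# Lines above the first hunk: imports plus the ffmpeg install the hunk header references.
import os
import torch
import whisper
import gradio as gr
from gtts import gTTS
from tempfile import NamedTemporaryFile
from transformers import Blip2Processor, Blip2ForConditionalGeneration

os.system("apt-get install -y ffmpeg")  # Whisper shells out to ffmpeg to decode audio files

# Assumed body of transcribe(); the diff only shows its signature (hunk header) and return line.
def transcribe(audio_path):
    result = whisper_model.transcribe(audio_path)
    return result["text"]

One usage note on the new half-precision lines: loading with torch_dtype=torch.float16 and casting inputs via .to(device, torch.float16) assumes a CUDA device is available; on CPU-only hardware, half-precision generation generally fails or is very slow, so the dtype arguments would typically be dropped there.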