thorfine committed
Commit f41df8b · verified · 1 Parent(s): 0707634

Update app.py

Files changed (1): app.py (+6 -13)
app.py CHANGED
@@ -1,35 +1,30 @@
 import gradio as gr
-from transformers import Blip2Processor, Blip2ForConditionalGeneration, BitsAndBytesConfig
+from transformers import Blip2Processor, Blip2ForConditionalGeneration
 from gtts import gTTS
 from tempfile import NamedTemporaryFile
 from PIL import Image
 import torch
 import whisper
 
-# Set device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load BLIP-2 model
+# Load BLIP-2 (smaller model)
 processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
-model = Blip2ForConditionalGeneration.from_pretrained(
-    "Salesforce/blip2-opt-2.7b", device_map="auto"
-).to(device)
+model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)
 
-# Load Whisper model
-whisper_model = whisper.load_model("small")
+# Load Whisper
+whisper_model = whisper.load_model("base")
 
-# Transcribe function
 def transcribe(audio_path):
     result = whisper_model.transcribe(audio_path)
     return result["text"]
 
-# Main function
 def ask_image(image, audio):
     question = transcribe(audio)
     inputs = processor(images=image, text=question, return_tensors="pt").to(device)
     generated_ids = model.generate(**inputs)
     answer = processor.decode(generated_ids[0], skip_special_tokens=True)
-
+
     tts = gTTS(answer)
     with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
         tts.save(f.name)
@@ -37,7 +32,6 @@ def ask_image(image, audio):
 
     return answer, audio_out
 
-# Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
 
@@ -50,5 +44,4 @@ with gr.Blocks() as demo:
     btn = gr.Button("Ask")
    btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])
 
-
 demo.launch()
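Note: the previous version imported BitsAndBytesConfig without using it, and combined device_map="auto" with an explicit .to(device), two placement mechanisms that can conflict once accelerate dispatches the weights; the new version keeps a single explicit .to(device). For reference, a minimal sketch (not part of this commit, assuming a CUDA GPU and the bitsandbytes package are available) of what the dropped import is typically used for, namely 8-bit loading to cut GPU memory:

    from transformers import Blip2ForConditionalGeneration, BitsAndBytesConfig

    # Load the 2.7B model with 8-bit weights to reduce GPU memory use.
    quant_config = BitsAndBytesConfig(load_in_8bit=True)
    model_8bit = Blip2ForConditionalGeneration.from_pretrained(
        "Salesforce/blip2-opt-2.7b",
        quantization_config=quant_config,
        device_map="auto",  # accelerate places the weights; no .to(device) afterwards
    )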
 
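For a quick smoke test of the resulting pipeline without the Gradio UI, something along these lines should work with app.py's functions in scope (both file names below are hypothetical placeholders, not files from this repo):

    from PIL import Image

    # Ask a spoken question about a local image; paths are placeholders.
    img = Image.open("example.jpg")
    answer, audio_out = ask_image(img, "question.wav")
    print(answer)     # BLIP-2's text answer
    print(audio_out)  # path to the gTTS mp3 of the spoken answer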