thorfine committed
Commit 10458eb · verified · 1 Parent(s): 3e8b9a3

Update app.py

Files changed (1)
  1. app.py +59 -59
app.py CHANGED
@@ -1,59 +1,59 @@
- import torch
- from PIL import Image
- import gradio as gr
- from transformers import Blip2Processor, Blip2ForConditionalGeneration, pipeline
- from gtts import gTTS
- import os
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # Load BLIP-2
- processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
- model = Blip2ForConditionalGeneration.from_pretrained(
-     "Salesforce/blip2-opt-2.7b",
-     torch_dtype=torch.float16 if device == "cuda" else torch.float32
- ).to(device)
-
- # Load Whisper pipeline for speech-to-text
- whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")
-
- # Store image globally
- current_image = {"image": None}
-
- def load_image(image):
-     current_image["image"] = image
-     return "Image uploaded. Now ask a question via voice."
-
- def ask_question(audio):
-     if current_image["image"] is None:
-         return "Please upload an image first.", None
-
-     # Transcribe speech
-     question = whisper_pipe(audio)["text"]
-
-     # Ask BLIP-2
-     inputs = processor(current_image["image"], question, return_tensors="pt").to(device, torch.float16 if device == "cuda" else torch.float32)
-     output = model.generate(**inputs, max_new_tokens=100)
-     answer = processor.decode(output[0], skip_special_tokens=True)
-
-     # Convert to speech
-     tts = gTTS(answer)
-     tts.save("answer.mp3")
-
-     return f"Q: {question}\nA: {answer}", "answer.mp3"
-
- # Gradio UI
- with gr.Blocks() as app:
-     gr.Markdown("# 🧠🖼️ Ask-the-Image with BLIP-2 + Whisper + gTTS")
-     with gr.Row():
-         image_input = gr.Image(type="pil", label="Upload Image")
-         image_status = gr.Textbox(label="Status", interactive=False)
-
-     audio_input = gr.Audio(source="microphone", type="filepath", label="Ask a Question (voice)")
-     output_text = gr.Textbox(label="Q&A", lines=4)
-     output_audio = gr.Audio(label="Answer (speech)")
-
-     image_input.change(fn=load_image, inputs=image_input, outputs=image_status)
-     audio_input.change(fn=ask_question, inputs=audio_input, outputs=[output_text, output_audio])
-
- app.launch()
 
+ import torch
+ from PIL import Image
+ import gradio as gr
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration, pipeline
+ from gtts import gTTS
+ import os
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Load BLIP-2
+ processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+ model = Blip2ForConditionalGeneration.from_pretrained(
+     "Salesforce/blip2-opt-2.7b",
+     torch_dtype=torch.float16 if device == "cuda" else torch.float32
+ ).to(device)
+
+ # Load Whisper pipeline for speech-to-text
+ whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+
+ # Store image globally
+ current_image = {"image": None}
+
+ def load_image(image):
+     current_image["image"] = image
+     return "Image uploaded. Now ask a question via voice."
+
+ def ask_question(audio):
+     if current_image["image"] is None:
+         return "Please upload an image first.", None
+
+     # Transcribe speech
+     question = whisper_pipe(audio)["text"]
+
+     # Ask BLIP-2
+     inputs = processor(current_image["image"], question, return_tensors="pt").to(device, torch.float16 if device == "cuda" else torch.float32)
+     output = model.generate(**inputs, max_new_tokens=100)
+     answer = processor.decode(output[0], skip_special_tokens=True)
+
+     # Convert to speech
+     tts = gTTS(answer)
+     tts.save("answer.mp3")
+
+     return f"Q: {question}\nA: {answer}", "answer.mp3"
+
+ # Gradio UI
+ with gr.Blocks() as app:
+     gr.Markdown("# 🧠🖼️ Ask-the-Image with BLIP-2 + Whisper + gTTS")
+     with gr.Row():
+         image_input = gr.Image(type="pil", label="Upload Image")
+         image_status = gr.Textbox(label="Status", interactive=False)
+
+     audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Ask a Question (voice)")
+     output_text = gr.Textbox(label="Q&A", lines=4)
+     output_audio = gr.Audio(label="Answer (speech)")
+
+     image_input.change(fn=load_image, inputs=image_input, outputs=image_status)
+     audio_input.change(fn=ask_question, inputs=audio_input, outputs=[output_text, output_audio])
+
+ app.launch()
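
For context on the changed line: Gradio 3.x accepted `gr.Audio(source="microphone")`, while Gradio 4.x replaced the singular `source` parameter with a `sources` list. A minimal sketch of that wiring under Gradio 4.x (the `transcribe` handler, the `demo` name, and the labels are illustrative placeholders, not part of this commit):

```python
import gradio as gr

def transcribe(audio_path):
    # Placeholder handler; the real app runs Whisper on audio_path.
    if audio_path is None:
        return "no recording yet"
    return f"received: {audio_path}"

with gr.Blocks() as demo:
    # Gradio 4.x: sources is a list; a microphone-only input component.
    mic = gr.Audio(sources=["microphone"], type="filepath", label="Ask a Question (voice)")
    out = gr.Textbox(label="Transcript")
    # change fires when the component value updates, e.g. a recording finishes
    mic.change(fn=transcribe, inputs=mic, outputs=out)

demo.launch()
```

With `type="filepath"`, the handler receives the recording as a temporary file path, which is the form the app's Whisper pipeline consumes.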
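
Separately, `ask_question` passes the raw transcript to the processor as the text prompt. The `Salesforce/blip2-opt-2.7b` model card formats visual-question prompts as `Question: ... Answer:`, which typically yields cleaner answers. A hedged sketch of that wrapping, reusing the app's variable names (assumes `processor`, `model`, `device`, `current_image`, and `question` from app.py are in scope):

```python
# Wrap the transcript in the model card's VQA prompt template.
prompt = f"Question: {question} Answer:"
inputs = processor(
    images=current_image["image"],
    text=prompt,
    return_tensors="pt",
).to(device, torch.float16 if device == "cuda" else torch.float32)
output = model.generate(**inputs, max_new_tokens=100)
answer = processor.decode(output[0], skip_special_tokens=True).strip()
```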