thorfine committed
Commit 172761a · verified · 1 Parent(s): 10458eb

Update app.py

Files changed (1):
  1. app.py +20 -49
app.py CHANGED
@@ -1,59 +1,30 @@
- import torch
- from PIL import Image
  import gradio as gr
- from transformers import Blip2Processor, Blip2ForConditionalGeneration, pipeline
  from gtts import gTTS
  import os

  device = "cuda" if torch.cuda.is_available() else "cpu"

- # Load BLIP-2
- processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
- model = Blip2ForConditionalGeneration.from_pretrained(
-     "Salesforce/blip2-opt-2.7b",
-     torch_dtype=torch.float16 if device == "cuda" else torch.float32
- ).to(device)
-
- # Load Whisper pipeline for speech-to-text
- whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")
-
- # Store image globally
- current_image = {"image": None}
-
- def load_image(image):
-     current_image["image"] = image
-     return "Image uploaded. Now ask a question via voice."
-
- def ask_question(audio):
-     if current_image["image"] is None:
-         return "Please upload an image first.", None
-
-     # Transcribe speech
-     question = whisper_pipe(audio)["text"]
-
-     # Ask BLIP-2
-     inputs = processor(current_image["image"], question, return_tensors="pt").to(device, torch.float16 if device == "cuda" else torch.float32)
-     output = model.generate(**inputs, max_new_tokens=100)
-     answer = processor.decode(output[0], skip_special_tokens=True)
-
-     # Convert to speech
-     tts = gTTS(answer)
-     tts.save("answer.mp3")
-
-     return f"Q: {question}\nA: {answer}", "answer.mp3"
-
- # Gradio UI
- with gr.Blocks() as app:
-     gr.Markdown("# 🧠🖼️ Ask-the-Image with BLIP-2 + Whisper + gTTS")
-     with gr.Row():
-         image_input = gr.Image(type="pil", label="Upload Image")
-         image_status = gr.Textbox(label="Status", interactive=False)
-
-     audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)", microphone=True)
-     output_text = gr.Textbox(label="Q&A", lines=4)
-     output_audio = gr.Audio(label="Answer (speech)")
-
-     image_input.change(fn=load_image, inputs=image_input, outputs=image_status)
-     audio_input.change(fn=ask_question, inputs=audio_input, outputs=[output_text, output_audio])
-
- app.launch()

  import gradio as gr
+ from transformers import AutoProcessor, Blip2ForConditionalGeneration, BitsAndBytesConfig, Blip2Processor
  from gtts import gTTS
+ from tempfile import NamedTemporaryFile
+ from PIL import Image
+ import torch
  import os
+ import torchaudio
+ import whisper

+ # Load BLIP-2 model
  device = "cuda" if torch.cuda.is_available() else "cpu"

+ quant_config = BitsAndBytesConfig(load_in_8bit=True)
+ processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
+ model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl", device_map="auto")

+ # Load Whisper model (small)
+ whisper_model = whisper.load_model("small")

+ def transcribe(audio):
+     # Use Whisper for transcription
+     result = whisper_model.transcribe(audio)
+     return result["text"]

+ from PIL import Image
+ import torch
+ from gtts import gTTS
+ from tempfile import NamedTemporaryFile
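
Note on the new loading code: quant_config = BitsAndBytesConfig(load_in_8bit=True) is defined but never passed to from_pretrained, so the flan-t5-xl weights still load at full precision. A minimal sketch of how the 8-bit config would typically be wired in with transformers (assumes the bitsandbytes package is installed and a CUDA GPU is available; this is not part of the commit):

# Sketch only (not committed code): pass the 8-bit config when loading BLIP-2.
# Assumes bitsandbytes is installed and a CUDA device is present.
from transformers import BitsAndBytesConfig, Blip2ForConditionalGeneration, Blip2Processor

quant_config = BitsAndBytesConfig(load_in_8bit=True)
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl",
    quantization_config=quant_config,  # load weights in 8-bit via bitsandbytes
    device_map="auto",
)

With quantization_config supplied, device_map="auto" places the quantized weights for you, and the explicit .to(device) call used in the removed version is not needed (and is not allowed on 8-bit models).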
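
As the hunk shows, the new app.py stops after transcribe() and a repeated block of imports; the question-answering, text-to-speech, and Gradio wiring from the removed version do not appear here. Below is a hedged sketch of how those pieces could be reconnected to the new checkpoint and Whisper model, modeled on the removed code. The function names, the NamedTemporaryFile usage, and the UI layout are illustrative assumptions, not the committed implementation, and the sketch assumes the imports and the processor, model, and whisper_model objects defined earlier in the new file.

# Illustrative continuation of app.py (not committed code): speech -> VQA -> speech.
current_image = {"image": None}  # same global-image pattern as the removed version

def load_image(image):
    current_image["image"] = image
    return "Image uploaded. Now ask a question via voice."

def ask_question(audio):
    if current_image["image"] is None:
        return "Please upload an image first.", None

    # Speech -> text with the openai-whisper model loaded above
    question = whisper_model.transcribe(audio)["text"]

    # Image + question -> answer with BLIP-2 (flan-t5-xl)
    inputs = processor(images=current_image["image"], text=question, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=100)
    answer = processor.decode(output[0], skip_special_tokens=True)

    # Text -> speech with gTTS, written to a temporary file instead of a fixed path
    tts = gTTS(answer)
    tmp = NamedTemporaryFile(suffix=".mp3", delete=False)
    tmp.close()
    tts.save(tmp.name)

    return f"Q: {question}\nA: {answer}", tmp.name

# Gradio UI, kept close to the removed version
with gr.Blocks() as app:
    gr.Markdown("# 🧠🖼️ Ask-the-Image with BLIP-2 + Whisper + gTTS")
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload Image")
        image_status = gr.Textbox(label="Status", interactive=False)

    audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)")
    output_text = gr.Textbox(label="Q&A", lines=4)
    output_audio = gr.Audio(label="Answer (speech)")

    image_input.change(fn=load_image, inputs=image_input, outputs=image_status)
    audio_input.change(fn=ask_question, inputs=audio_input, outputs=[output_text, output_audio])

app.launch()

The sketch keeps the removed version's global-image pattern and event wiring; only the model calls are adapted to the objects the new file defines.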