Abdul Mutaal committed on
Commit 09baca2 · 2 parents: 04a239e 6282d73

Merge branch 'main' of https://huggingface.co/spaces/thorfine/task2

Files changed (2)
  1. app.py +51 -59
  2. requirements.txt +14 -6
app.py CHANGED
@@ -1,59 +1,51 @@
- import torch
- from PIL import Image
- import gradio as gr
- from transformers import Blip2Processor, Blip2ForConditionalGeneration, pipeline
- from gtts import gTTS
- import os
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # Load BLIP-2
- processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
- model = Blip2ForConditionalGeneration.from_pretrained(
-     "Salesforce/blip2-opt-2.7b",
-     torch_dtype=torch.float16 if device == "cuda" else torch.float32
- ).to(device)
-
- # Load Whisper pipeline for speech-to-text
- whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")
-
- # Store image globally
- current_image = {"image": None}
-
- def load_image(image):
-     current_image["image"] = image
-     return "Image uploaded. Now ask a question via voice."
-
- def ask_question(audio):
-     if current_image["image"] is None:
-         return "Please upload an image first.", None
-
-     # Transcribe speech
-     question = whisper_pipe(audio)["text"]
-
-     # Ask BLIP-2
-     inputs = processor(current_image["image"], question, return_tensors="pt").to(device, torch.float16 if device == "cuda" else torch.float32)
-     output = model.generate(**inputs, max_new_tokens=100)
-     answer = processor.decode(output[0], skip_special_tokens=True)
-
-     # Convert to speech
-     tts = gTTS(answer)
-     tts.save("answer.mp3")
-
-     return f"Q: {question}\nA: {answer}", "answer.mp3"
-
- # Gradio UI
- with gr.Blocks() as app:
-     gr.Markdown("# 🧠🖼️ Ask-the-Image with BLIP-2 + Whisper + gTTS")
-     with gr.Row():
-         image_input = gr.Image(type="pil", label="Upload Image")
-         image_status = gr.Textbox(label="Status", interactive=False)
-
-     audio_input = gr.Audio(source="microphone", type="filepath", label="Ask a Question (voice)")
-     output_text = gr.Textbox(label="Q&A", lines=4)
-     output_audio = gr.Audio(label="Answer (speech)")
-
-     image_input.change(fn=load_image, inputs=image_input, outputs=image_status)
-     audio_input.change(fn=ask_question, inputs=audio_input, outputs=[output_text, output_audio])
-
- app.launch()
 
+ import gradio as gr
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration, BitsAndBytesConfig
+ from gtts import gTTS
+ from tempfile import NamedTemporaryFile
+ from PIL import Image
+ import torch
+ import whisper
+
+ # Set device
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Load BLIP-2 model
+ processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
+ model = Blip2ForConditionalGeneration.from_pretrained(
+     "Salesforce/blip2-flan-t5-xl", device_map="auto"
+ ).to(device)
+
+ # Load Whisper model
+ whisper_model = whisper.load_model("small")
+
+ # Transcribe function
+ def transcribe(audio_path):
+     result = whisper_model.transcribe(audio_path)
+     return result["text"]
+
+ # Main function
+ def ask_image(image, audio):
+     question = transcribe(audio)
+     inputs = processor(images=image, text=question, return_tensors="pt").to(device)
+     generated_ids = model.generate(**inputs)
+     answer = processor.decode(generated_ids[0], skip_special_tokens=True)
+
+     tts = gTTS(answer)
+     with NamedTemporaryFile(delete=False, suffix=".mp3") as f:
+         tts.save(f.name)
+         audio_out = f.name
+
+     return answer, audio_out
+
+ # Gradio UI
+ with gr.Blocks() as demo:
+     gr.Markdown("## 🎤🖼️ Ask-the-Image: Ask questions about an image using your voice")
+     image_input = gr.Image(type="pil", label="Upload an Image")
+     audio_input = gr.Audio(type="filepath", label="Ask a Question (voice)", microphone=True)
+     text_output = gr.Textbox(label="Answer")
+     audio_output = gr.Audio(label="Answer in Speech")
+
+     btn = gr.Button("Ask")
+     btn.click(fn=ask_image, inputs=[image_input, audio_input], outputs=[text_output, audio_output])
+
+ demo.launch()
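For a quick local sanity check of the new pipeline outside the Gradio UI, the sketch below calls transcribe and ask_image directly. It is not part of the commit: it assumes the objects defined in the new app.py (processor, model, whisper_model) are already loaded in the same session, and the file names sample.jpg and question.wav are hypothetical placeholders for a test image and a recorded question.

# Minimal test sketch (assumption: run after executing app.py in the same session;
# sample.jpg and question.wav are placeholder files you supply yourself).
from PIL import Image

image = Image.open("sample.jpg")                        # hypothetical test image
answer, audio_path = ask_image(image, "question.wav")   # hypothetical recorded question
print("Answer:", answer)                                # text answer from BLIP-2
print("Spoken answer saved to:", audio_path)            # path to the gTTS mp3 file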
requirements.txt CHANGED
@@ -1,6 +1,14 @@
- torch
- torchvision
- transformers
- gradio
- gtts
- Pillow
+ transformers
+ git+https://github.com/openai/whisper.git
+ torch
+ accelerate
+ gradio
+ Pillow
+ matplotlib
+ bitsandbytes
+ gtts
+ torchvision
+ torchaudio
+ gradio
+ openai-whisper
+ ffmpeg-python