File size: 886 Bytes
10458eb
172761a
10458eb
172761a
 
 
10458eb
172761a
 
10458eb
172761a
10458eb
 
172761a
 
 
10458eb
172761a
 
10458eb
172761a
 
 
 
10458eb
172761a
 
 
 
10458eb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import gradio as gr
from transformers import AutoProcessor, Blip2ForConditionalGeneration, BitsAndBytesConfig,Blip2Processor
from gtts import gTTS
from tempfile import NamedTemporaryFile
from PIL import Image
import torch
import os
import torchaudio
import whisper

# Load BLIP-2 model and processor.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 8-bit quantization config. BUG FIX: this was previously created but never
# passed to from_pretrained, so the model silently loaded in full precision;
# it is now applied via quantization_config (requires bitsandbytes at runtime).
quant_config = BitsAndBytesConfig(load_in_8bit=True)
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl",
    device_map="auto",
    quantization_config=quant_config,
)

# Load Whisper ASR model ("small" checkpoint).
whisper_model = whisper.load_model("small")

def transcribe(audio):
    """Transcribe an audio file to text with the module-level Whisper model.

    Args:
        audio: Path to an audio file in a format Whisper accepts.

    Returns:
        The transcribed text as a string.
    """
    transcription = whisper_model.transcribe(audio)
    return transcription["text"]

from PIL import Image
import torch
from gtts import gTTS
from tempfile import NamedTemporaryFile