|
import gradio as gr |
|
from transformers import AutoProcessor, Blip2ForConditionalGeneration, BitsAndBytesConfig,Blip2Processor |
|
from gtts import gTTS |
|
from tempfile import NamedTemporaryFile |
|
from PIL import Image |
|
import torch |
|
import os |
|
import torchaudio |
|
import whisper |
|
|
|
|
|
# Prefer GPU when available; CPU fallback keeps the app runnable anywhere.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 8-bit quantization config — shrinks the BLIP-2 memory footprint roughly 4x.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")

# FIX: quant_config was previously created but never passed to from_pretrained,
# so the model silently loaded in full precision. Passing quantization_config
# makes the 8-bit load actually happen (requires the bitsandbytes package,
# which the original BitsAndBytesConfig usage already implies).
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl",
    device_map="auto",
    quantization_config=quant_config,
)

# Whisper "small" checkpoint for speech-to-text transcription.
whisper_model = whisper.load_model("small")
|
|
|
def transcribe(audio):
    """Transcribe speech audio to text with the module-level Whisper model.

    Parameters
    ----------
    audio : str or array-like
        Audio input in any form accepted by ``whisper.Whisper.transcribe``
        (typically a file path, as supplied by a Gradio audio component).

    Returns
    -------
    str
        The transcribed text.
    """
    transcription = whisper_model.transcribe(audio)
    return transcription["text"]
|
|
|
from PIL import Image |
|
import torch |
|
from gtts import gTTS |
|
from tempfile import NamedTemporaryFile |
|
|
|
|