Spaces:
Runtime error
Runtime error
File size: 1,669 Bytes
c5ef34e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
from fastapi import FastAPI, UploadFile
import gradio as gr
from nemo.collections.asr import EncDecRNNTBPEModel
from speechbrain.pretrained import EncoderClassifier
from transformers import DiffusionPipeline, AutoModelForCausalLM, AutoTokenizer
from dia.model import Dia
import soundfile as sf
# ---------------------------------------------------------------------------
# Model setup: every model is loaded once at import time and shared by all
# requests handled by this process.
# ---------------------------------------------------------------------------
_DEVICE = "cuda"
_ASR_MODEL_ID = "nvidia/parakeet-tdt-0.6b-v2"
_EMOTION_MODEL_ID = "speechbrain/emotion-recognition-wav2vec2-IEMOCAP"
_DIFFUSION_MODEL_ID = "teticio/audio-diffusion-256"
# NOTE(review): no organisation prefix -- presumably a local directory or an
# alias resolvable in the deployment environment; confirm.
_LLM_MODEL_ID = "Vicuna-7B"
_TTS_MODEL_ID = "nari-labs/Dia-1.6B"

# Speech-to-text model.
asr = EncDecRNNTBPEModel.from_pretrained(_ASR_MODEL_ID)

# Speech emotion classifier.
emotion = EncoderClassifier.from_hparams(source=_EMOTION_MODEL_ID)

# Audio diffusion pipeline, moved to the GPU.
# NOTE(review): DiffusionPipeline is normally provided by the `diffusers`
# package; confirm that the import from `transformers` resolves. The pipeline
# is also not used by process() in the visible code.
diffuser = DiffusionPipeline.from_pretrained(_DIFFUSION_MODEL_ID)
diffuser = diffuser.to(_DEVICE)

# Chat LLM and its tokenizer; the weights are cast to half precision before
# being moved to the GPU.
llm_tokenizer = AutoTokenizer.from_pretrained(_LLM_MODEL_ID)
llm = AutoModelForCausalLM.from_pretrained(_LLM_MODEL_ID)
llm = llm.half().to(_DEVICE)

# Text-to-speech model.
tts = Dia.from_pretrained(_TTS_MODEL_ID)

# ASGI application that the Gradio UI is mounted on.
app = FastAPI()
def process(audio_file):
    """Run one voice-chat turn on a recorded audio clip.

    The clip is transcribed, its emotion is classified, the transcript is fed
    to the LLM, and the LLM's reply is synthesised to a WAV file.

    Args:
        audio_file: Path to the recorded audio file, or ``None`` when nothing
            was recorded.

    Returns:
        A 4-tuple ``(transcript, emotion_label, reply_text, reply_wav_path)``.
        When ``audio_file`` is ``None`` the result is ``("", "", "", None)``
        and no model is invoked.
    """
    import tempfile

    # Nothing recorded: return empty outputs instead of failing inside a model.
    if audio_file is None:
        return "", "", "", None

    # ASR. Depending on the NeMo version, the first element of the result may
    # be a plain string, a Hypothesis-like object exposing ``.text``, or a
    # list of per-file best hypotheses; normalise all of them to a string.
    first = asr.transcribe([audio_file])[0]
    if isinstance(first, (list, tuple)):
        first = first[0]
    text = getattr(first, "text", first)

    # Emotion. SpeechBrain's classify_file is documented to return the tuple
    # (out_prob, score, index, text_lab); the label is the first entry of the
    # last element. A dict result is still honoured for custom interfaces.
    prediction = emotion.classify_file(audio_file)
    if isinstance(prediction, dict):
        emo = prediction["label"]
    else:
        emo = prediction[-1][0]

    # LLM response. Inputs follow the model's device rather than assuming
    # "cuda". For a causal LM, generate() returns the prompt followed by the
    # continuation, so the prompt tokens are sliced off before decoding and
    # special tokens are skipped to keep markers out of the spoken reply.
    inputs = llm_tokenizer(text, return_tensors="pt").to(llm.device)
    resp = llm.generate(**inputs, max_new_tokens=128)
    prompt_len = inputs["input_ids"].shape[1]
    reply = llm_tokenizer.decode(
        resp[0][prompt_len:], skip_special_tokens=True
    ).strip()

    # TTS. Each request writes to its own temporary file so concurrent
    # requests cannot overwrite each other's audio.
    # NOTE: the file is not deleted here; cleanup is left to the host.
    wav = tts.generate(f"[S1] {reply} [S2]")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        reply_path = tmp.name
    sf.write(reply_path, wav, 44100)

    return text, emo, reply, reply_path
# Gradio UI: one microphone input, three text outputs and the spoken reply.
iface = gr.Interface(
    fn=process,
    # type="filepath" makes Gradio hand process() a path string; the default
    # ("numpy") would pass a (sample_rate, ndarray) tuple, which process()
    # cannot give to the file-based model APIs.
    # NOTE(review): `source=` is the Gradio 3.x spelling (4.x uses
    # `sources=[...]`); confirm against the pinned Gradio version.
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcript"),
        gr.Textbox(label="Emotion"),
        gr.Textbox(label="Reply"),
        gr.Audio(label="Audio Reply"),
    ],
    live=False,
)

# Queueing is enabled through .queue(); the `enable_queue` constructor
# argument is deprecated.
iface.queue()

# Serve the UI at "/" on the FastAPI app through Gradio's public helper
# instead of the internal gr.routes.App.create_app.
app = gr.mount_gradio_app(app, iface, path="/")

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
|