# Hugging Face Space (status shown on the page at capture time: Runtime error)
from fastapi import FastAPI, UploadFile | |
import gradio as gr | |
from nemo.collections.asr import EncDecRNNTBPEModel | |
from speechbrain.pretrained import EncoderClassifier | |
from transformers import DiffusionPipeline, AutoModelForCausalLM, AutoTokenizer | |
from dia.model import Dia | |
import soundfile as sf | |
# Load models
# Everything below runs once at import time, so importing this module
# downloads/loads every model before the web app is created.
_DEVICE = "cuda"
# NOTE(review): "Vicuna-7B" has no namespace prefix; presumably a local
# directory or a placeholder for a hub repo id -- confirm.
_LLM_ID = "Vicuna-7B"

# Speech-to-text model.
asr = EncDecRNNTBPEModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")

# Speech emotion classifier.
emotion = EncoderClassifier.from_hparams(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP"
)

# NOTE(review): DiffusionPipeline is normally provided by the `diffusers`
# package rather than `transformers` -- confirm the import at the top of the
# file. `diffuser` is also not referenced anywhere in the visible code.
diffuser = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256")
diffuser = diffuser.to(_DEVICE)

# Causal LLM used to produce the text reply (half precision on the GPU).
llm_tokenizer = AutoTokenizer.from_pretrained(_LLM_ID)
llm = AutoModelForCausalLM.from_pretrained(_LLM_ID).half().to(_DEVICE)

# Text-to-speech model.
tts = Dia.from_pretrained("nari-labs/Dia-1.6B")

app = FastAPI()
def process(audio_file):
    """Run the full voice pipeline on one recorded clip.

    Steps: transcribe the audio, classify its emotion, generate a text reply
    with the LLM, synthesize that reply to speech and write it to
    ``reply.wav``.

    Args:
        audio_file: Path to the recorded audio file, or ``None`` when the
            user submitted without recording anything.

    Returns:
        A 4-tuple ``(transcript, emotion_label, reply_text, reply_wav_path)``.
        For ``None`` input the tuple is ``("", "", "", None)``.
    """
    if audio_file is None:
        # Nothing was recorded; return empty outputs instead of crashing
        # inside the models.
        return "", "", "", None

    # ASR
    hypotheses = asr.transcribe([audio_file])
    # NOTE(review): depending on the NeMo release, RNNT models return either a
    # list of results or a (best_hypotheses, all_hypotheses) tuple -- confirm.
    if isinstance(hypotheses, tuple):
        hypotheses = hypotheses[0]
    first = hypotheses[0]
    # Results may be plain strings or Hypothesis objects exposing `.text`.
    text = getattr(first, "text", first)

    # Emotion
    # classify_file returns (out_prob, score, index, text_lab); text_lab holds
    # one label string per input file.
    _, _, _, labels = emotion.classify_file(audio_file)
    emo = labels[0]

    # LLM response
    inputs = llm_tokenizer(text, return_tensors="pt").to("cuda")
    resp = llm.generate(**inputs, max_new_tokens=128)
    # generate() on a causal LM returns prompt + continuation; drop the prompt
    # tokens and special tokens so the reply does not echo the transcript.
    prompt_len = inputs["input_ids"].shape[1]
    reply = llm_tokenizer.decode(
        resp[0][prompt_len:], skip_special_tokens=True
    ).strip()

    # TTS
    wav = tts.generate(f"[S1] {reply} [S2]")
    sf.write("reply.wav", wav, 44100)

    return text, emo, reply, "reply.wav"
# Gradio UI
iface = gr.Interface(
    fn=process,
    # type="filepath" makes Gradio hand process() a path on disk; the default
    # ("numpy") would pass a (sample_rate, array) tuple, which process() cannot
    # feed to the file-based model APIs it calls.
    # NOTE(review): `source=` is the Gradio 3.x spelling; newer releases use
    # `sources=["microphone"]` -- confirm the pinned Gradio version.
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcript"),
        gr.Textbox(label="Emotion"),
        gr.Textbox(label="Reply"),
        gr.Audio(label="Audio Reply"),
    ],
    live=False,
)
# Enable request queuing via the method rather than the deprecated
# `enable_queue=` constructor argument.
iface.queue()

# Mount the UI on the FastAPI app through the supported helper rather than
# building the Gradio sub-app by hand, so Gradio performs its own setup for
# the mounted interface.
app = gr.mount_gradio_app(app, iface, path="/")

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)