File size: 1,669 Bytes
c5ef34e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from fastapi import FastAPI, UploadFile  # NOTE(review): UploadFile is unused in this file — confirm nothing else needs it before removing
import gradio as gr
from nemo.collections.asr import EncDecRNNTBPEModel
from speechbrain.pretrained import EncoderClassifier
from transformers import AutoModelForCausalLM, AutoTokenizer
# BUG FIX: DiffusionPipeline lives in the `diffusers` package, not `transformers`;
# the original `from transformers import DiffusionPipeline` raises ImportError.
from diffusers import DiffusionPipeline
from dia.model import Dia
import soundfile as sf

# Load all models once at import time (slow: downloads/loads weights, needs CUDA).
asr = EncDecRNNTBPEModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")  # speech-to-text
emotion = EncoderClassifier.from_hparams(source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP")  # emotion classifier
diffuser = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to("cuda")  # NOTE(review): loaded but never used below
llm_tokenizer = AutoTokenizer.from_pretrained("Vicuna-7B")
llm = AutoModelForCausalLM.from_pretrained("Vicuna-7B").half().to("cuda")  # fp16 LLM on GPU
tts = Dia.from_pretrained("nari-labs/Dia-1.6B")  # text-to-speech

app = FastAPI()
def process(audio_file):
    """Run the full voice-assistant pipeline on one recorded audio clip.

    Args:
        audio_file: path to the input audio file (as supplied by Gradio).

    Returns:
        Tuple of (transcript, emotion label, LLM reply text, path to reply WAV).
    """
    # (removed dead `data, sr = sf.read(audio_file)` — its result was never used)
    # ASR: transcribe() takes a list of paths and returns a list of transcripts.
    text = asr.transcribe([audio_file])[0]
    # Emotion classification on the same file.
    emo = emotion.classify_file(audio_file)["label"]
    # LLM response: generate a reply conditioned on the transcript.
    inputs = llm_tokenizer(text, return_tensors="pt").to("cuda")
    resp = llm.generate(**inputs, max_new_tokens=128)
    # BUG FIX: generate() returns prompt + continuation. Decoding resp[0] whole
    # echoed the user's transcript back and included special tokens (e.g. </s>)
    # that were then spoken by the TTS. Slice off the prompt and skip specials.
    new_tokens = resp[0][inputs["input_ids"].shape[1]:]
    reply = llm_tokenizer.decode(new_tokens, skip_special_tokens=True)
    # TTS: [S1]/[S2] are Dia speaker-turn tags.
    wav = tts.generate(f"[S1] {reply} [S2]")
    # NOTE(review): 44100 is assumed to be Dia's output sample rate — confirm.
    sf.write("reply.wav", wav, 44100)
    return text, emo, reply, "reply.wav"

# Gradio UI wired to the FastAPI app.
iface = gr.Interface(
    fn=process,
    inputs=gr.Audio(source="microphone"),  # NOTE(review): Gradio 4+ renamed this kwarg to sources=["microphone"] — confirm installed version
    outputs=[
        gr.Textbox(label="Transcript"),
        gr.Textbox(label="Emotion"),
        gr.Textbox(label="Reply"),
        gr.Audio(label="Audio Reply"),
    ],
    live=False,
)
# FIX: `enable_queue=` is no longer an Interface kwarg; queueing is enabled via .queue().
iface.queue()
# FIX: gr.routes.App.create_app is an internal API; gr.mount_gradio_app is the
# supported way to mount a Gradio app onto an existing FastAPI app (it returns
# the FastAPI app with the Gradio routes attached).
app = gr.mount_gradio_app(app, iface, path="/")

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)