# Maya-AI — app.py (Hugging Face Space by Devakumar868)
from fastapi import FastAPI, UploadFile
import gradio as gr
import torch
from nemo.collections.asr import EncDecRNNTBPEModel
from speechbrain.pretrained import EncoderClassifier
# BUG FIX: DiffusionPipeline lives in the `diffusers` package, not `transformers`;
# the original import raised ImportError at startup.
from diffusers import DiffusionPipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
from dia.model import Dia
import soundfile as sf

# Pick a device instead of assuming CUDA, so the app also starts on CPU hosts.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load models (all downloaded once at startup).
asr = EncDecRNNTBPEModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
emotion = EncoderClassifier.from_hparams(source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP")
diffuser = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to(device)
# BUG FIX: "Vicuna-7B" is not a valid Hub repo id (no org prefix); the
# canonical weights are published under lmsys/. TODO confirm the intended
# checkpoint revision against the Space's requirements.
llm_tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
llm = AutoModelForCausalLM.from_pretrained("lmsys/vicuna-7b-v1.5").half().to(device)
tts = Dia.from_pretrained("nari-labs/Dia-1.6B")

app = FastAPI()
def process(audio_file):
    """Run the full voice pipeline on one recording.

    Parameters
    ----------
    audio_file : str
        Path to the recorded input audio (as delivered by ``gr.Audio``
        with ``type="filepath"``).

    Returns
    -------
    tuple[str, str, str, str]
        (transcript, emotion label, LLM reply text, path to reply wav).
    """
    # ASR — Parakeet transcribes directly from the file path.
    # (The original also did an unused sf.read() of the file; removed.)
    text = asr.transcribe([audio_file])[0]
    # Emotion classification on the same recording.
    emo = emotion.classify_file(audio_file)["label"]
    # LLM reply. HF generate() returns prompt + continuation, so slice off
    # the prompt tokens before decoding — the original decoded resp[0]
    # whole, echoing the user's words and special tokens back in the reply.
    inputs = llm_tokenizer(text, return_tensors="pt").to(llm.device)
    resp = llm.generate(**inputs, max_new_tokens=128)
    new_tokens = resp[0][inputs["input_ids"].shape[-1]:]
    reply = llm_tokenizer.decode(new_tokens, skip_special_tokens=True)
    # TTS — Dia expects speaker tags around the text.
    wav = tts.generate(f"[S1] {reply} [S2]")
    # NOTE(review): 44100 assumes Dia emits 44.1 kHz audio — confirm
    # against the nari-labs/Dia-1.6B model card.
    sf.write("reply.wav", wav, 44100)
    return text, emo, reply, "reply.wav"
# Gradio UI.
iface = gr.Interface(
    fn=process,
    # BUG FIX: without type="filepath" the Audio input delivers an
    # (sample_rate, ndarray) tuple, but process() expects a path on disk
    # for asr.transcribe / emotion.classify_file.
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcript"),
        gr.Textbox(label="Emotion"),
        gr.Textbox(label="Reply"),
        gr.Audio(label="Audio Reply"),
    ],
    live=False,
    enable_queue=True,
)
# mount_gradio_app is the supported public API for serving a Gradio app
# under FastAPI; gr.routes.App.create_app is internal and unstable.
app = gr.mount_gradio_app(app, iface, path="/")

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)