# Maya-AI / app.py
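"""Maya-AI voice assistant: a single-file FastAPI + Gradio app.

Pipeline, as wired below: microphone audio -> NVIDIA NeMo Parakeet ASR ->
SpeechBrain IEMOCAP emotion classifier -> DialoGPT reply conditioned on the
detected emotion and the last few conversation turns.
"""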
import os, tempfile, uuid
from fastapi import FastAPI
import gradio as gr
import soundfile as sf
import torch
import nemo.collections.asr as nemo_asr
# SpeechBrain 1.x import path; this emotion model ships a custom interface class
from speechbrain.inference.interfaces import foreign_class
from transformers import AutoTokenizer, AutoModelForCausalLM
# Initialize FastAPI and models
app = FastAPI()
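# Per-session history: maps a session UUID to a list of recent turns
# (emotion-tagged user utterances plus model replies); trimmed in generate_reply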
conversation_history = {}
# Model loading
# ASR: NVIDIA Parakeet TDT 0.6B v2 (transcribe() returns Hypothesis objects)
asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
# Emotion: loaded via foreign_class because the IEMOCAP model defines its own
# interface class rather than a stock EncoderClassifier
emotion_model = foreign_class(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    savedir="emotion_cache",
)
# LLM: DialoGPT-medium for reply generation
llm_name = "microsoft/DialoGPT-medium"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_name)
llm_model = AutoModelForCausalLM.from_pretrained(llm_name).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
def transcribe_and_emote(audio_path):
    # Parakeet returns Hypothesis objects; .text holds the transcript
    text = asr_model.transcribe([audio_path])[0].text
    # classify_file returns (out_prob, score, index, text_lab); text_lab is a
    # one-element list with the predicted label, e.g. ["neu"]
    _, _, _, text_lab = emotion_model.classify_file(audio_path)
    return text, text_lab[0]
def generate_reply(user_text, emotion, uid):
    # Track and trim history (keep the last 6 turns)
    hist = conversation_history.setdefault(uid, [])
    ctx = f"[Feeling:{emotion}] {user_text}"
    hist.append(ctx)
    hist = hist[-6:]
    conversation_history[uid] = hist
    prompt = " ".join(hist)
    # Calling the tokenizer (rather than encode) also yields an attention mask
    inputs = llm_tokenizer(prompt, return_tensors="pt").to(llm_model.device)
    out = llm_model.generate(
        **inputs,
        max_new_tokens=100,
        pad_token_id=llm_tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens; slicing the decoded string by
    # len(prompt) is fragile because detokenization can alter the text
    new_tokens = out[0][inputs["input_ids"].shape[-1]:]
    reply = llm_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    hist.append(reply)
    return reply or "I’m here to help!"
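# Illustrative prompt shape after two turns (emotion labels come from the
# IEMOCAP classes; the utterances here are hypothetical):
#   "[Feeling:neu] hello there Hi, how can I help? [Feeling:hap] tell me a joke"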
def process(audio, uid):
    if not audio:
        return "", "", "", uid
    # Gradio's numpy audio is a (sample_rate, data) tuple
    sr, data = audio
    # Save to a temp WAV file for the file-based model interfaces
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    sf.write(tmp.name, data, sr)
    # ASR + emotion
    text, emo = transcribe_and_emote(tmp.name)
    # LLM response
    reply = generate_reply(text, emo, uid)
    # Clean up
    os.unlink(tmp.name)
    return text, emo, reply, uid
# Gradio interface
with gr.Blocks() as demo:
    uid_state = gr.State(value=str(uuid.uuid4()))
    # Gradio 4.x API; Gradio 3.x used source="microphone" instead
    audio_in = gr.Audio(sources=["microphone"], type="numpy")
    txt_out = gr.Textbox(label="Transcription")
    emo_out = gr.Textbox(label="Emotion")
    rep_out = gr.Textbox(label="AI Reply")
    btn = gr.Button("Process")
    btn.click(
        process,
        inputs=[audio_in, uid_state],
        outputs=[txt_out, emo_out, rep_out, uid_state],
    )
app = gr.mount_gradio_app(app, demo, path="/")
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
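# Example local run (assuming the Space's dependencies are installed):
#   python app.py
# The Gradio UI is then served at http://localhost:7860/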