# Hugging Face Spaces page header captured with this file: status "Runtime error".
import os, tempfile, uuid | |
from fastapi import FastAPI | |
import gradio as gr | |
import soundfile as sf | |
import torch | |
import numpy as np | |
import nemo.collections.asr as nemo_asr | |
from speechbrain.pretrained import EncoderClassifier | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
# --- Web application and per-user conversation store -----------------------
app = FastAPI()
# Maps a session uid to the list of utterance strings exchanged so far.
conversation_history = {}

# --- Model loading (executed once, at import time) --------------------------
# Speech-to-text model.
asr_model = nemo_asr.models.ASRModel.from_pretrained(
    "nvidia/parakeet-tdt-0.6b-v2"
)

# Speech emotion classifier; downloaded files are cached in "emotion_cache".
# NOTE(review): the model card for this checkpoint loads it through a custom
# interface class rather than EncoderClassifier -- confirm this loader works
# with the installed speechbrain version.
emotion_model = EncoderClassifier.from_hparams(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    savedir="emotion_cache",
)

# Conversational language model and its tokenizer; the model is moved to the
# GPU when one is available, otherwise it stays on the CPU.
llm_name = "microsoft/DialoGPT-medium"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_name)
llm_model = AutoModelForCausalLM.from_pretrained(llm_name)
llm_model = llm_model.to("cuda" if torch.cuda.is_available() else "cpu")
def transcribe_and_emote(audio_path):
    """Transcribe an audio file and classify the speaker's emotion.

    Args:
        audio_path: Path to the audio file passed to both models.

    Returns:
        A ``(text, emotion)`` tuple: the transcript produced by ``asr_model``
        and the emotion label (as a string) predicted by ``emotion_model``.
    """
    # transcribe() takes a list of paths; only one file is passed, so the
    # first (and only) result carries the transcript.
    text = asr_model.transcribe([audio_path])[0].text

    # classify_file() returns (out_prob, score, index, text_lab). Element 0 is
    # the probability tensor, not a label, so take the label from text_lab.
    _out_prob, _score, _index, text_lab = emotion_model.classify_file(audio_path)
    # text_lab holds one label per batch item; a single file gives one entry.
    emotion = text_lab[0] if isinstance(text_lab, (list, tuple)) else text_lab
    return text, str(emotion)
def generate_reply(user_text, emotion, uid):
    """Generate the assistant's reply for one user turn.

    The user's utterance, tagged with the detected emotion, is appended to the
    conversation history stored under ``uid``; the most recent entries are
    joined into a prompt for the language model, and the reply is recorded in
    the same history.

    Args:
        user_text: Transcribed user utterance.
        emotion: Emotion label to embed in the prompt.
        uid: Key identifying the conversation in ``conversation_history``.

    Returns:
        The generated reply, or a fixed fallback sentence when the model
        produces no text.
    """
    # Record the new turn, then keep only the six most recent entries.
    hist = conversation_history.setdefault(uid, [])
    hist.append(f"[Feeling:{emotion}] {user_text}")
    hist = hist[-6:]
    conversation_history[uid] = hist

    # NOTE(review): DialoGPT's model card separates turns with the EOS token;
    # turns here are space-joined -- confirm whether that is intentional.
    prompt = " ".join(hist)
    inputs = llm_tokenizer.encode(prompt, return_tensors="pt").to(llm_model.device)
    out = llm_model.generate(
        inputs,
        max_new_tokens=100,
        pad_token_id=llm_tokenizer.eos_token_id,
    )

    # The output sequence starts with the prompt tokens. Slice them off by
    # token count rather than by len(prompt) characters: decoding does not
    # always reproduce the prompt text exactly, which would shift the cut.
    new_tokens = out[0][inputs.shape[-1]:]
    reply = llm_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Store what the user is actually shown, so an empty generation does not
    # leave a blank entry in the history.
    reply = reply or "I’m here to help!"
    hist.append(reply)
    return reply
def process(audio, uid):
    """Run one recorded utterance through ASR, emotion detection and the LLM.

    Args:
        audio: Value delivered by the Gradio audio component configured with
            ``type="numpy"``: a ``(sample_rate, samples)`` tuple, or a falsy
            value when nothing was recorded.
        uid: Conversation identifier, passed through unchanged.

    Returns:
        A ``(transcription, emotion, reply, uid)`` tuple. The three text
        fields are empty strings when no audio was supplied.
    """
    if not audio:
        return "", "", "", uid

    # Gradio hands numpy audio over as (sample_rate, data), in that order.
    sr, data = audio

    # Reserve a temp path and close the handle right away: soundfile reopens
    # the file by name, and an open handle would otherwise be leaked.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    try:
        sf.write(tmp.name, data, sr)
        # ASR + emotion
        text, emo = transcribe_and_emote(tmp.name)
        # LLM response
        reply = generate_reply(text, emo, uid)
    finally:
        # Remove the temp file even when a model call raises.
        os.unlink(tmp.name)
    return text, emo, reply, uid
# Gradio interface
def _new_session_id():
    """Return a fresh random identifier for one browser session."""
    return str(uuid.uuid4())


with gr.Blocks() as demo:
    # The initial value is computed once, when the UI is built, so on its own
    # it would give every visitor the same uid (and therefore one shared
    # conversation history). The load hook below replaces it per session.
    uid_state = gr.State(value=_new_session_id())
    # NOTE(review): `source=` is the Gradio 3.x keyword; Gradio 4 renamed it
    # to `sources=[...]` -- confirm against the installed Gradio version.
    audio_in = gr.Audio(source="microphone", type="numpy")
    txt_out = gr.Textbox(label="Transcription")
    emo_out = gr.Textbox(label="Emotion")
    rep_out = gr.Textbox(label="AI Reply")
    btn = gr.Button("Process")
    btn.click(
        process,
        inputs=[audio_in, uid_state],
        outputs=[txt_out, emo_out, rep_out, uid_state],
    )
    # Runs each time the page loads in a browser: assign that session its own
    # uid so conversations are not mixed between users.
    demo.load(_new_session_id, inputs=None, outputs=uid_state)

# Serve the Gradio UI from the root path of the FastAPI application.
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)