import os, tempfile, uuid
from fastapi import FastAPI
import gradio as gr
import soundfile as sf
import torch
import numpy as np
import nemo.collections.asr as nemo_asr
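# NOTE: SpeechBrain >= 1.0 moved pretrained interfaces to speechbrain.inference;
# the import path below is for older SpeechBrain releases.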
from speechbrain.pretrained import EncoderClassifier
from transformers import AutoTokenizer, AutoModelForCausalLM
# Initialize FastAPI and models
app = FastAPI()
conversation_history = {}
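# Per-session chat history, keyed by the UUID held in Gradio state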
# Model loading
asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2") # ASR [2]
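# Parakeet-TDT 0.6B v2 is an English ASR model trained on 16 kHz speech;
# NeMo resamples file input on load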
emotion_model = EncoderClassifier.from_hparams(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    savedir="emotion_cache",
)  # Emotion [3]
llm_name = "microsoft/DialoGPT-medium"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_name)
llm_model = AutoModelForCausalLM.from_pretrained(llm_name).to("cuda" if torch.cuda.is_available() else "cpu") # LLM [4]
def transcribe_and_emote(audio_path):
    text = asr_model.transcribe([audio_path])[0].text
    # classify_file returns (out_prob, score, index, text_lab); the readable
    # emotion label is in text_lab, not at index 0
    _, _, _, text_lab = emotion_model.classify_file(audio_path)
    return text, text_lab[0]
def generate_reply(user_text, emotion, uid):
    # Track and trim history to the last 6 turns
    hist = conversation_history.setdefault(uid, [])
    ctx = f"[Feeling:{emotion}] {user_text}"
    hist.append(ctx)
    hist = hist[-6:]
    conversation_history[uid] = hist
    prompt = " ".join(hist)
    inputs = llm_tokenizer.encode(prompt, return_tensors="pt").to(llm_model.device)
    out = llm_model.generate(inputs, max_new_tokens=100, pad_token_id=llm_tokenizer.eos_token_id)
    # Strip the prompt by token count: decoding can alter whitespace, so slicing
    # the decoded string by len(prompt) is unreliable
    reply = llm_tokenizer.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
    hist.append(reply)
    return reply or "I'm here to help!"
def process(audio, uid):
    if not audio:
        return "", "", "", uid
    # Gradio's numpy audio format is (sample_rate, data), not (data, sample_rate)
    sr, data = audio
    # Downmix stereo to mono; both audio models expect a single channel
    if data.ndim > 1:
        data = np.mean(data, axis=1)
    # Save to a temp WAV so the file-based model APIs can read it
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    sf.write(tmp.name, data, sr)
    # ASR + Emotion
    text, emo = transcribe_and_emote(tmp.name)
    # LLM response
    reply = generate_reply(text, emo, uid)
    # Clean up
    os.unlink(tmp.name)
    return text, emo, reply, uid
# Gradio interface
with gr.Blocks() as demo:
    uid_state = gr.State(value=str(uuid.uuid4()))
    # Gradio 4.x renamed `source` to `sources` (a list); 3.x used source="microphone"
    audio_in = gr.Audio(sources=["microphone"], type="numpy")
    txt_out = gr.Textbox(label="Transcription")
    emo_out = gr.Textbox(label="Emotion")
    rep_out = gr.Textbox(label="AI Reply")
    btn = gr.Button("Process")
    btn.click(process, inputs=[audio_in, uid_state], outputs=[txt_out, emo_out, rep_out, uid_state])
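# Serve the Gradio UI at the root path of the FastAPI app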
app = gr.mount_gradio_app(app, demo, path="/")
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)