import os, tempfile, uuid
from fastapi import FastAPI
import gradio as gr
import soundfile as sf
import torch
import nemo.collections.asr as nemo_asr
from speechbrain.inference import EncoderClassifier  # lived in speechbrain.pretrained before SpeechBrain 1.0
from transformers import AutoTokenizer, AutoModelForCausalLM

# Initialize FastAPI and models
app = FastAPI()
conversation_history = {}  # maps session uid -> list of recent turns

# Model loading
asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")  # ASR [2]
emotion_model = EncoderClassifier.from_hparams(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    savedir="emotion_cache"
)  # Emotion [3]
llm_name = "microsoft/DialoGPT-medium"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_name)
llm_model = AutoModelForCausalLM.from_pretrained(llm_name).to("cuda" if torch.cuda.is_available() else "cpu")  # LLM [4]
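
# Note: all three checkpoints are fetched from their respective hubs on first
# launch and cached locally, so expect a slow first start and a few GB of disk
# usage; subsequent starts load from the cache.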

def transcribe_and_emote(audio_path):
    # NeMo returns a list of Hypothesis objects; take the text of the only file
    text = asr_model.transcribe([audio_path])[0].text
    # classify_file returns (out_prob, score, index, text_lab); indexing [0]
    # would yield the probability tensor, so unpack and use the label instead
    out_prob, score, index, text_lab = emotion_model.classify_file(audio_path)
    return text, text_lab[0]
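
# Quick sanity check (sketch; "sample.wav" is a hypothetical 16 kHz mono file):
#   text, emotion = transcribe_and_emote("sample.wav")
#   print(text, emotion)  # e.g. ("hello there", "neu")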

def generate_reply(user_text, emotion, uid):
    # Track and trim per-session history, keeping the last 6 turns
    hist = conversation_history.setdefault(uid, [])
    hist.append(f"[Feeling:{emotion}] {user_text}")
    hist = hist[-6:]
    conversation_history[uid] = hist

    prompt = " ".join(hist)
    inputs = llm_tokenizer.encode(prompt, return_tensors="pt").to(llm_model.device)
    out = llm_model.generate(inputs, max_new_tokens=100, pad_token_id=llm_tokenizer.eos_token_id)
    # Slice off the prompt *tokens* before decoding; slicing the decoded string
    # by len(prompt) is unreliable because detokenization may not round-trip
    reply = llm_tokenizer.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
    hist.append(reply)
    return reply or "I'm here to help!"
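
# For reference, a session's history alternates emotion-tagged user turns and
# model replies, e.g. (illustrative values):
#   ["[Feeling:neu] hi there", "Hi! How are you?", "[Feeling:sad] not great", ...]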

def process(audio, uid):
    uid = uid or str(uuid.uuid4())  # assign a fresh id on the session's first call
    if audio is None:
        return "", "", "", uid
    # Gradio's numpy audio is a (sample_rate, data) tuple, in that order
    sr, data = audio
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()  # close the handle so soundfile can reopen the path portably
    sf.write(tmp.name, data, sr)
    # ASR + emotion recognition on the temp file
    text, emo = transcribe_and_emote(tmp.name)
    # LLM response conditioned on the detected emotion
    reply = generate_reply(text, emo, uid)
    # Clean up
    os.unlink(tmp.name)
    return text, emo, reply, uid

# Gradio interface
with gr.Blocks() as demo:
    # Left empty here on purpose: a value computed at build time (e.g. a uuid)
    # would be shared by every session; process() fills it in per session
    uid_state = gr.State()
    audio_in = gr.Audio(sources=["microphone"], type="numpy")  # `source="microphone"` on Gradio 3.x
    txt_out = gr.Textbox(label="Transcription")
    emo_out = gr.Textbox(label="Emotion")
    rep_out = gr.Textbox(label="AI Reply")
    btn = gr.Button("Process")
    btn.click(process, inputs=[audio_in, uid_state], outputs=[txt_out, emo_out, rep_out, uid_state])

app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
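
# Usage sketch: `python app.py` (or `uvicorn app:app --port 7860`, assuming this
# file is named app.py), then open http://localhost:7860, record a clip, and hit
# "Process". The Gradio UI is mounted at "/", so any extra FastAPI routes added
# to `app` are served from the same port.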