import os
import gradio as gr
import torch
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model

# Hugging Face token and device mapping
HF_TOKEN   = os.environ["HF_TOKEN"]
device_map = "auto"  # auto-shard models across the 4×L4 GPUs

print("Loading RVQ Codec...")
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
if torch.cuda.is_available():
    rvq = rvq.to("cuda")

print("Loading VAD pipeline...")
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN
)

print("Loading Ultravox pipeline...")
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16
)

print("Loading Audio Diffusion model...")
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=torch.float16
).to("cuda")

print("Loading Dia TTS (sharded across GPUs)...")
dia = Dia.from_pretrained(
    "nari-labs/Dia-1.6B",
    device_map=device_map,
    torch_dtype=torch.float16,
    trust_remote_code=True
)

print("All models loaded successfully!")

def process_audio(audio):
    sr, array = audio
    array = array.numpy() if torch.is_tensor(array) else array
    # Gradio microphone input is int16 PCM; convert to float32 in [-1, 1]
    if np.issubdtype(array.dtype, np.integer):
        array = array.astype(np.float32) / 32768.0
    if array.ndim > 1:  # downmix stereo to mono
        array = array.mean(axis=1)

    # 1. Voice activity detection (the segmentation result is not used further here)
    vad_pipe({"waveform": torch.tensor(array).unsqueeze(0), "sample_rate": sr})

    # 2. RVQ encode/decode round-trip (DAC expects a (batch, channels, time)
    #    tensor; encode returns the quantized latents first, and decode takes those)
    x = torch.tensor(array).unsqueeze(0).unsqueeze(0).to("cuda")
    with torch.no_grad():
        z, codes, *_ = rvq.encode(x)
        decoded = rvq.decode(z).squeeze().cpu().numpy()

    # 3. Ultravox speech -> text (its pipeline expects 'audio', 'turns' and
    #    'sampling_rate'); the system prompt below is just a placeholder
    turns = [{"role": "system", "content": "You are a helpful voice assistant."}]
    out = ultravox_pipe({"audio": decoded, "turns": turns, "sampling_rate": sr},
                        max_new_tokens=128)
    text = out if isinstance(out, str) else out.get("text", "")

    # 4. Prosody diffusion (the generated audio is currently not mixed into the
    #    TTS output returned below)
    pros = diff_pipe(raw_audio=decoded)["audios"][0]

    # 5. Dia TTS synthesis; generate() may return a numpy array or a torch tensor
    #    depending on the Dia version
    tts = dia.generate(f"[emotion:neutral] {text}")
    tts_np = tts.squeeze().cpu().numpy() if torch.is_tensor(tts) else np.squeeze(np.asarray(tts))
    # Peak-normalize just below full scale, guarding against silence
    if tts_np.size:
        tts_np = tts_np / (np.max(np.abs(tts_np)) + 1e-8) * 0.95

    # Dia synthesizes at 44.1 kHz regardless of the microphone sample rate
    return (44100, tts_np), text

# Gradio UI
with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("## Maya-AI Supernatural Conversational Agent")
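    # Note: on Gradio 4+, gr.Audio takes sources=["microphone"] rather than
    # source="microphone"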
    audio_in  = gr.Audio(source="microphone", type="numpy", label="Your Voice")
    send_btn  = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out  = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])

if __name__ == "__main__":
    demo.launch()