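"""Maya-AI Gradio app.

Microphone audio is screened with pyannote VAD, round-tripped through the
Descript RVQ codec, turned into a text reply by Ultravox, given an
instrumental texture by an audio-diffusion pipeline, and spoken back with
Dia TTS.
"""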
import os
import gradio as gr
import torch
import numpy as np
from transformers import pipeline, AutoTokenizer
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import DiaConfig, DiaModel, Dia
from dac.utils import load_model as load_dac_model
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from huggingface_hub import snapshot_download

HF_TOKEN = os.environ["HF_TOKEN"]
device_map = "auto"
device = "cuda" if torch.cuda.is_available() else "cpu"

# RVQ Codec (Descript Audio Codec, 44.1 kHz)
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
rvq = rvq.to(device)

# VAD Pipeline
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN
)
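# vad_pipe yields speech segments for an utterance; process_audio runs it on
# each request before the codec round-trip.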

# Ultravox Pipeline
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16
)
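# ultravox_pipe maps raw speech to the text reply that is later spoken by Dia.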

# Audio Diffusion
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
).to(device)
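# diff_pipe can regenerate an instrumental texture from raw audio; process_audio
# keeps it in `pros`, which is not yet mixed into the reply.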

# Dia TTS Loading: build the model with empty weights, then stream the
# checkpoint onto devices. load_checkpoint_and_dispatch needs a local path,
# so the repo is snapshot-downloaded first.
config = DiaConfig.from_pretrained("nari-labs/Dia-1.6B")
with init_empty_weights():
    base_model = DiaModel(config)
ckpt_dir = snapshot_download("nari-labs/Dia-1.6B")
base_model = load_checkpoint_and_dispatch(
    base_model,
    ckpt_dir,
    device_map=device_map,
    dtype=torch.float16
)
dia = Dia(base_model, config)

# Tokenizer for Dia text processing
tokenizer = AutoTokenizer.from_pretrained("nari-labs/Dia-1.6B")
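# The tokenizer turns the emotion-tagged prompt into input_ids for dia.generate().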

def process_audio(audio):
    sr, array = audio
    array = array.numpy() if torch.is_tensor(array) else array
    # Gradio microphone input is int16 PCM; convert to mono float32 in [-1, 1]
    if np.issubdtype(array.dtype, np.integer):
        array = array.astype(np.float32) / 32768.0
    if array.ndim > 1:
        array = array.mean(axis=1)
    wave = torch.from_numpy(array).float()

    # Voice-activity detection (segments are not used further yet)
    vad_pipe({"waveform": wave.unsqueeze(0), "sample_rate": sr})

    # Round-trip through the RVQ codec: encode() returns
    # (z, codes, latents, commitment_loss, codebook_loss); decode() takes z
    x = wave.reshape(1, 1, -1).to(device)
    z, codes, *_ = rvq.encode(x)
    decoded = rvq.decode(z).squeeze().cpu().numpy()

    # Ultravox: speech in, text reply out (input passed under the "audio" key)
    ultra_out = ultravox_pipe({"audio": decoded, "sampling_rate": sr})
    text = ultra_out if isinstance(ultra_out, str) else ultra_out.get("text", "")

    # Instrumental texture from the audio-diffusion pipeline (currently unused)
    pros = diff_pipe(raw_audio=decoded)["audios"][0]

    # Dia TTS on the generated text, then peak-normalize to avoid clipping
    inputs = tokenizer(f"[emotion:neutral] {text}", return_tensors="pt").to(device)
    tts_tensors = dia.generate(**inputs)
    tts_np = tts_tensors.squeeze().cpu().numpy()
    if tts_np.size:
        tts_np = tts_np / (np.max(np.abs(tts_np)) + 1e-8) * 0.95

    return (sr, tts_np), text

with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    audio_in = gr.Audio(source="microphone", type="numpy", label="Your Voice")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out  = gr.Textbox(label="Generated Text")
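    # Clicking Send runs process_audio: microphone audio in, (AI speech, reply text) out.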
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])

if __name__ == "__main__":
    demo.launch()