Spaces:
Runtime error
Runtime error
File size: 2,626 Bytes
6e55da8 c5ef34e 653911d 55c39a0 ee439d6 036f56f 1a24747 5adc99b 55c39a0 5adc99b 0e0768b 1a24747 d9c827c 036f56f 55c39a0 036f56f 42e6e01 1a24747 55c39a0 1a24747 55c39a0 1a24747 55c39a0 ee439d6 0e0768b ee439d6 1a24747 55c39a0 d9c827c 0e0768b d9c827c 0e0768b 55c39a0 ee439d6 55c39a0 653911d 1a24747 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import os
import gradio as gr
import torch
import numpy as np
from transformers import pipeline, AutoModel
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
# 1. Retrieve HF token and choose a device strategy.
# NOTE(review): os.environ[...] raises KeyError when HF_TOKEN is unset;
# kept as-is so a missing token fails fast at startup.
HF_TOKEN = os.environ["HF_TOKEN"]
device_map = "auto"  # auto-shard large models across available GPUs (4x L4)
use_cuda = torch.cuda.is_available()

# Residual vector-quantization codec (DAC, 44.1 kHz variant) used to
# round-trip the user's audio in process_audio().
print("Loading RVQ Codec...")
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
if use_cuda:
    rvq = rvq.to("cuda")

# Voice-activity detection pipeline (gated model; needs the HF token).
print("Loading VAD pipeline...")
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN
)

# Ultravox speech-to-text pipeline, sharded via device_map.
print("Loading Ultravox pipeline...")
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16
)

print("Loading Audio Diffusion model...")
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=torch.float16
)
# Fix: only move to CUDA when it is available. The original unconditionally
# called .to("cuda"), which crashes on CPU-only hosts even though rvq above
# is guarded by the same check.
if use_cuda:
    diff_pipe = diff_pipe.to("cuda")

# Dia text-to-speech model, sharded across GPUs via device_map.
print("Loading Dia TTS (sharded across GPUs)...")
dia = Dia.from_pretrained(
    "nari-labs/Dia-1.6B",
    device_map=device_map,
    torch_dtype=torch.float16,
    trust_remote_code=True
)
print("All models loaded successfully!")
def process_audio(audio):
    """Run the full voice-to-voice pipeline: VAD, RVQ codec, ASR, diffusion, TTS.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray | torch.Tensor]
        ``(sample_rate, samples)`` as produced by ``gr.Audio(type="numpy")``.

    Returns
    -------
    tuple[tuple[int, numpy.ndarray], str]
        ``((sample_rate, synthesized_waveform), recognized_text)``.
    """
    sr, array = audio
    if torch.is_tensor(array):
        array = array.numpy()
    # 1. Voice activity detection. NOTE(review): the result is discarded; the
    #    rest of the pipeline is not gated on detected speech regions — confirm
    #    whether this call is intentional or vestigial.
    vad_pipe({"waveform": torch.tensor(array).unsqueeze(0), "sample_rate": sr})
    # 2. RVQ encode/decode round trip on the codec's own device.
    #    Fix: the original hard-coded .to("cuda"), which crashes on CPU-only
    #    hosts even though rvq is only moved to CUDA when it is available.
    device = next(rvq.parameters()).device
    x = torch.tensor(array).unsqueeze(0).to(device)
    codes = rvq.encode(x)
    decoded = rvq.decode(codes).squeeze().cpu().numpy()
    # 3. Ultravox ASR -> text. NOTE(review): input dict schema assumed to match
    #    the Ultravox pipeline's expectations — verify against its model card.
    out = ultravox_pipe({"array": decoded, "sampling_rate": sr})
    text = out.get("text", "")
    # 4. Prosody diffusion conditioned on the decoded audio (result unused
    #    downstream — NOTE(review): confirm whether `pros` should feed the TTS).
    pros = diff_pipe(raw_audio=decoded)["audios"][0]
    # 5. Dia TTS synthesis from the recognized text.
    tts = dia.generate(f"[emotion:neutral] {text}")
    tts_np = tts.squeeze().cpu().numpy()
    # Peak-normalize to 0.95. Fix: guard against an all-zero waveform — the
    # original divided by max(|x|) whenever size > 0, yielding NaN for silence.
    peak = np.max(np.abs(tts_np)) if tts_np.size else 0.0
    if peak > 0:
        tts_np = tts_np / peak * 0.95
    return (sr, tts_np), text
# Gradio UI: microphone in -> (synthesized audio, recognized text) out.
with gr.Blocks(title="Maya AI π") as demo:
    gr.Markdown("## Maya-AI Supernatural Conversational Agent")
    # Fix: Gradio 4.x renamed gr.Audio's `source` parameter to `sources`
    # (now a list); the old keyword raises a TypeError at startup, which
    # matches this Space's "Runtime error" status.
    audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Your Voice")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])

if __name__ == "__main__":
    demo.launch()
|