import os
import numpy as np
import torch
import gradio as gr
from transformers import pipeline
from pyannote.audio import Pipeline as VADPipeline
from diffusers import AudioDiffusionPipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
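# Assumed requirements.txt for this Space (package names are guesses inferred
# from the imports above, not pinned from the original repo):
#   torch, numpy, gradio<4, transformers, pyannote.audio, diffusers,
#   descript-audio-codec, git+https://github.com/nari-labs/dia.git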
# Hugging Face token from the Space secrets (needed for the gated pyannote model)
HF_TOKEN = os.environ["HF_TOKEN"]
# Shard large models across the available GPUs (this Space runs 4x L4)
device_map = "auto"
device = "cuda" if torch.cuda.is_available() else "cpu"
# 1. Load the RVQ codec (Descript Audio Codec, 44.1 kHz weights)
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
rvq = rvq.to(device)
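# Note on the DAC API (as documented in descript-audio-codec; verify against
# your installed version): encode() takes a float tensor shaped
# (batch, 1, samples) at the model's 44.1 kHz rate and returns
# (z, codes, latents, commitment_loss, codebook_loss); decode() takes the
# continuous latent z, not the integer codes.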
# 2. Load VAD with pyannote.audio. "voice-activity-detection" is not a
#    transformers pipeline task (the original transformers call crashed);
#    pyannote ships its own Pipeline API for this checkpoint.
vad_pipe = VADPipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN,
)
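# Minimal helper (not in the original file): pyannote returns an Annotation
# whose timeline is in seconds; this concatenates just the detected speech
# regions so downstream models do not waste compute on silence.
def keep_speech(array, sr, annotation):
    segments = annotation.get_timeline().support()
    if not segments:
        return array
    return np.concatenate(
        [array[int(seg.start * sr):int(seg.end * sr)] for seg in segments]
    )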
# 3. Load Ultravox (speech-in -> text; trust_remote_code registers its custom pipeline)
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16,
)
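# Calling convention (from the Ultravox model card, so treat it as an
# assumption against your installed revision): the custom pipeline takes a
# dict with "audio" (float samples), "sampling_rate", and an optional "turns"
# chat history, plus generation kwargs, and returns the generated text:
#   ultravox_pipe({"audio": samples, "sampling_rate": 16000, "turns": turns},
#                 max_new_tokens=100)
# The model card's example loads audio at 16 kHz.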
# 4. Load the diffusion prosody model. "audio-to-audio" is not a transformers
#    task; teticio's checkpoints are diffusers pipelines, so we load them with
#    AudioDiffusionPipeline instead.
diff_pipe = AudioDiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=torch.float16,
).to(device)
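# Per the diffusers AudioDiffusionPipeline docs (verify against your installed
# version): the call's output object exposes .audios (waveforms) and .images
# (mel spectrograms), and raw_audio can seed style transfer when start_step > 0.
# There is no "array" key, which is why the original indexing failed.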
# 5. Load Dia TTS. Dia.from_pretrained downloads and loads the real weights
#    itself, so accelerate's init_empty_weights/load_checkpoint_and_dispatch
#    is unnecessary here (and load_checkpoint_and_dispatch expects a local
#    checkpoint path, not a Hub repo id).
dia = Dia.from_pretrained("nari-labs/Dia-1.6B")
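# Dia's documented prompt format uses [S1]/[S2] speaker tags (it has no
# [emotion:...] tag) and generate() returns a numpy waveform at 44.1 kHz, e.g.:
#   audio = dia.generate("[S1] Hello there! [S2] Hi, how are you?")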
# 6. Inference function
def process_audio(audio):
    sr, array = audio
    array = np.asarray(array, dtype=np.float32)
    if array.ndim > 1:                       # stereo microphone input -> mono
        array = array.mean(axis=1)
    if np.abs(array).max() > 1.0:            # gradio delivers int16 PCM; scale to [-1, 1]
        array /= 32768.0
    # Voice activity detection: keep only the regions pyannote marks as speech
    waveform = torch.tensor(array).unsqueeze(0)
    array = keep_speech(array, sr, vad_pipe({"waveform": waveform, "sample_rate": sr}))
    # RVQ encode/decode round trip (assumes the mic rate matches the codec's
    # 44.1 kHz; resample first if it does not)
    x = torch.tensor(array).reshape(1, 1, -1).to(device)
    with torch.no_grad():
        x = rvq.preprocess(x, sr)
        z, codes, latents, _, _ = rvq.encode(x)
        decoded = rvq.decode(z).squeeze().cpu().numpy()
    # Ultravox ASR -> LLM
    turns = [{"role": "system", "content": "You are a helpful voice assistant."}]
    text = ultravox_pipe(
        {"audio": decoded, "turns": turns, "sampling_rate": sr}, max_new_tokens=100
    )
    # Diffusion-based prosody (generated but, as in the original, not yet mixed
    # into the final synthesis)
    pros = diff_pipe(raw_audio=decoded).audios[0]
    # Dia TTS synthesis
    tts = np.asarray(dia.generate(f"[S1] {text}")).squeeze()
    peak = np.max(np.abs(tts))
    if peak > 0:
        tts = tts / peak * 0.95              # normalize with a little headroom
    return (44100, tts.astype(np.float32)), text  # Dia renders at 44.1 kHz
# 7. Gradio UI (gradio 3.x signature; on 4.x use sources=["microphone"])
with gr.Blocks() as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    audio_in = gr.Audio(source="microphone", type="numpy", label="Your Voice")
    send = gr.Button("Send")
    audio_out = gr.Audio(label="AI’s Response")
    text_out = gr.Textbox(label="Generated Text")
    send.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])

if __name__ == "__main__":
    demo.launch()