# Maya-AI / app.py
import os
import gradio as gr
import torch
import numpy as np
from transformers import pipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
# Retrieve your HF token from the Space secrets
HF_TOKEN = os.environ["HF_TOKEN"]
# device_map="auto" lets Accelerate shard large checkpoints across all visible GPUs (e.g. 4x L4)
device_map = "auto"
# 1. Load RVQ codec
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
if torch.cuda.is_available():
    rvq = rvq.to("cuda")
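# Helper (sketch): the 44 kHz DAC codec expects 44.1 kHz audio, while browser
# microphones often record at 48 kHz, so input may need resampling before
# encoding. Assumes torchaudio is available in the Space environment.
def resample_to_codec_rate(waveform: torch.Tensor, sr: int, target_sr: int = 44100) -> torch.Tensor:
    """Resample a (channels, time) waveform to the codec's sample rate."""
    import torchaudio  # local import: only needed if resampling is actually used
    if sr == target_sr:
        return waveform
    return torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=target_sr)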
# 2. Load VAD: pyannote/voice-activity-detection is a pyannote.audio pipeline
#    (not a transformers task), so load it with pyannote's own API to avoid a
#    segmentation-model mismatch
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN,
)
if torch.cuda.is_available():
    vad_pipe.to(torch.device("cuda"))
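# Helper (sketch): pyannote returns an Annotation object; this flattens it into
# plain (start_sec, end_sec) tuples, the "chunks"-style structure the rest of
# the app works with.
def annotation_to_chunks(annotation):
    """List of (start, end) speech segments, in seconds, from a pyannote Annotation."""
    return [(segment.start, segment.end) for segment in annotation.get_timeline()]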
# 3. Load Ultravox (speech-in → text+LLM)
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16,
)
# 4. Load diffusion prosody model: teticio's audio-diffusion checkpoints are
#    diffusers pipelines rather than a transformers task, so load via diffusers
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=torch.float16,
)
if torch.cuda.is_available():
    diff_pipe = diff_pipe.to("cuda")
# 5. Load Dia TTS. At 1.6B parameters the model fits on a single GPU in
#    float16, and Dia.from_pretrained handles weight download and device
#    placement itself, so no empty-weights/dispatch step is needed.
dia = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float16")
# 6. Inference function
def process_audio(audio):
    sr, array = audio
    array = array.numpy() if torch.is_tensor(array) else array
    # Gradio microphone input is int16 PCM; convert to mono float32 in [-1, 1]
    array = np.asarray(array, dtype=np.float32)
    if array.ndim > 1:
        array = array.mean(axis=-1)
    if np.max(np.abs(array)) > 1.0:
        array /= 32768.0
    # Voice activity detection (pyannote expects a (channel, time) waveform)
    waveform = torch.from_numpy(array).unsqueeze(0)
    speech_regions = vad_pipe({"waveform": waveform, "sample_rate": sr})  # not yet used downstream
    # RVQ encode/decode round-trip: DAC expects (batch, channel, time) at its
    # native 44.1 kHz rate, and encode() returns a tuple led by the latent z
    x = waveform.unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        z = rvq.encode(x)[0]
        decoded = rvq.decode(z).squeeze().cpu().numpy()
    # Ultravox ASR → LLM (the custom pipeline takes audio plus chat-style turns)
    out = ultravox_pipe({"audio": decoded, "sampling_rate": sr, "turns": []}, max_new_tokens=128)
    text = out if isinstance(out, str) else out.get("text", "")
    # Diffusion-based prosody pass (result not yet fed into the TTS output)
    pros = diff_pipe(raw_audio=decoded)
    # Dia TTS synthesis, peak-normalised to avoid clipping
    tts = dia.generate(f"[emotion:neutral] {text}")
    tts = tts.detach().cpu().numpy() if torch.is_tensor(tts) else np.asarray(tts)
    tts = tts.astype(np.float32).squeeze()
    peak = np.max(np.abs(tts))
    if peak > 0:
        tts = tts / peak * 0.95
    return (sr, tts), text
# 7. Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    # Gradio 4+ renamed the `source` argument to `sources`
    audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Your Voice")
    send = gr.Button("Send")
    audio_out = gr.Audio(label="AI’s Response")
    text_out = gr.Textbox(label="Generated Text")
    send.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])
if __name__ == "__main__":
    demo.launch()