# Maya-AI / app.py
import gradio as gr
import torch
import numpy as np
from transformers import pipeline
from pyannote.audio import Pipeline as VAD
from dia.model import Dia
from dac.utils import load_model as load_dac_model
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

# 2.1: Device map for multi-GPU distribution (4×L4)
device_map = "auto"  # accelerate shards model weights across all visible GPUs
# 2.2: Load Descript Audio Codec (RVQ) at startup
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
if torch.cuda.is_available():
    rvq = rvq.to("cuda")
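
# The 44khz DAC codec and Dia TTS both operate at 44.1 kHz, while browser
# microphones often capture at 48 kHz. Below is a minimal resampling sketch
# (assumption: torchaudio is installed in the Space); it is not wired into
# process_audio, but can be applied to the incoming array before encoding.
import torchaudio.functional as AF

def resample_to(array: np.ndarray, orig_sr: int, target_sr: int = 44100) -> np.ndarray:
    """Resample a mono float waveform to target_sr using torchaudio."""
    if orig_sr == target_sr:
        return array
    wav = torch.tensor(array, dtype=torch.float32).unsqueeze(0)
    return AF.resample(wav, orig_freq=orig_sr, new_freq=target_sr).squeeze(0).numpy()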
# 2.3: Load VAD pipeline
vad = VAD.from_pretrained("pyannote/voice-activity-detection")
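# Note: pyannote pipelines are gated on the Hugging Face Hub; from_pretrained may
# need use_auth_token=<HF_TOKEN> after the model's terms of use have been accepted.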
# 2.4: Load Ultravox via audio-text-to-text pipeline
ultravox_pipe = pipeline(
    "audio-text-to-text",
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16,
)
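# Assumption: Ultravox ships its own pipeline via trust_remote_code; depending on
# the model revision it may expect inputs keyed as "audio"/"turns" rather than
# "array", so the call inside process_audio may need adjusting for newer revisions.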
# 2.5: Load Diffusion model
diff_pipe = pipeline(
    "audio-to-audio",
    model="teticio/audio-diffusion-instrumental-hiphop-256",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16,
)
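# Note: this checkpoint was published for diffusers' audio-diffusion pipelines; if the
# transformers "audio-to-audio" task cannot load it, a fallback is
# diffusers.DiffusionPipeline.from_pretrained("teticio/audio-diffusion-instrumental-hiphop-256").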
# 2.6: Load Dia TTS with multi-GPU dispatch
with init_empty_weights():
    dia = Dia.from_pretrained("nari-labs/Dia-1.6B", torch_dtype=torch.float16, trust_remote_code=True)
dia = load_checkpoint_and_dispatch(
    dia, "nari-labs/Dia-1.6B", device_map=device_map, dtype=torch.float16
)
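# Note: accelerate's load_checkpoint_and_dispatch expects a local checkpoint file or
# folder rather than a Hub repo id; the weights may need to be fetched first, e.g. with
# huggingface_hub.snapshot_download("nari-labs/Dia-1.6B"), and that local path passed above.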
# 2.7: Gradio inference function
def process_audio(audio):
    # Gradio's type="numpy" audio arrives as a (sample_rate, array) tuple; mic
    # capture is usually int16, so convert to float32 before feeding the models.
    sr, array = audio
    if array.dtype == np.int16:
        array = array.astype(np.float32) / 32768.0
    # VAD segmentation (segments are currently unused downstream)
    speech_segments = vad({"waveform": torch.tensor(array).unsqueeze(0), "sample_rate": sr})
    # RVQ encode/decode round-trip through the Descript codec
    # (DAC's encode returns a tuple (z, codes, latents, ...); decode takes the continuous z)
    audio_tensor = torch.tensor(array).unsqueeze(0).unsqueeze(0)  # (batch, channel, time)
    if torch.cuda.is_available():
        audio_tensor = audio_tensor.to("cuda")
    z, codes, *_ = rvq.encode(rvq.preprocess(audio_tensor, sr))
    decoded = rvq.decode(z)
    array = decoded.squeeze().cpu().numpy()
    # Ultravox ASR→LLM: speech in, response text out
    ultra_out = ultravox_pipe({"array": array, "sampling_rate": sr})
    text = ultra_out["text"]
    # Diffusion-based prosody enhancement (output is not used further yet)
    prosody_audio = diff_pipe({"array": array, "sampling_rate": sr})["array"][0]
    # Dia TTS on the generated text
    tts_audio = dia.generate(f"[emotion:neutral] {text}")
    # Dia may return a torch tensor or a numpy array depending on the version
    tts_np = tts_audio.squeeze().cpu().numpy() if torch.is_tensor(tts_audio) else np.asarray(tts_audio).squeeze()
    # Peak-normalize to avoid clipping
    tts_np = tts_np / (np.max(np.abs(tts_np)) + 1e-8) * 0.95
    # The returned rate is the input rate; Dia itself generates 44.1 kHz audio, so
    # resampling (or returning 44100 here) may be needed for correct playback speed.
    return (sr, tts_np), text
# 2.8: Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## Supernatural Speech AI Agent")
    # Gradio 4.x uses sources=[...]; on Gradio 3.x this parameter was source="microphone"
    audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Record Your Voice")
    btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    txt_out = gr.Textbox(label="Transcribed & Generated Text")
    btn.click(fn=process_audio, inputs=audio_in, outputs=[audio_out, txt_out])
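# Note: for GPU-bound inference, calling demo.queue() before demo.launch() serializes
# concurrent requests and helps avoid CUDA out-of-memory from overlapping calls.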
if __name__ == "__main__":
    demo.launch()