import os
import gradio as gr
import torch
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
# Retrieve HF token from Secrets
HF_TOKEN = os.environ["HF_TOKEN"]
# Automatic multi-GPU sharding across 4× L4 GPUs
device_map = "auto"
# 1. Descript Audio Codec (RVQ)
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
if torch.cuda.is_available():
    rvq = rvq.to("cuda")
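# For reference, the descript-audio-codec README documents the round trip roughly as:
#   x = model.preprocess(audio_data, sample_rate)   # pad to a multiple of the hop length
#   z, codes, latents, _, _ = model.encode(x)       # continuous latents + RVQ code indices
#   y = model.decode(z)                             # reconstructed waveform
# process_audio below follows this pattern, with rvq in place of model.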
# 2. Voice Activity Detection via Pyannote
vad_pipe = PyannotePipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN
)
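# pyannote pipelines accept a file path or an in-memory mapping of the form
#   {"waveform": torch.Tensor with shape (channel, time), "sample_rate": int},
# which is how process_audio feeds microphone audio to the VAD below.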
# 3. Ultravox ASR+LLM
ultravox_pipe = pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=torch.float16
)
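# The Ultravox model card documents calls of the form
#   ultravox_pipe({"audio": audio, "turns": turns, "sampling_rate": sr}, max_new_tokens=30)
# where "turns" is a chat-style list of {"role": ..., "content": ...} dicts.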
# 4. Audio Diffusion (direct load via Diffusers)
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=torch.float16
).to("cuda")
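# The audio-diffusion pipeline returns mel-spectrogram images alongside raw audio;
# the generated waveforms are exposed under the "audios" key of its output.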
# 5. Dia TTS (multi-GPU dispatch)
with init_empty_weights():
    dia = Dia.from_pretrained(
        "nari-labs/Dia-1.6B",
        torch_dtype=torch.float16,
        trust_remote_code=True
    )
dia = load_checkpoint_and_dispatch(
    dia,
    "nari-labs/Dia-1.6B",
    device_map=device_map,
    dtype=torch.float16
)
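# Note: Dia.generate produces a 44.1 kHz waveform from tagged text. load_checkpoint_and_dispatch
# expects a local checkpoint path, so "nari-labs/Dia-1.6B" is assumed to resolve to an
# already-downloaded copy of the weights.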
# 6. Inference Function
def process_audio(audio):
    sr, array = audio
    array = array.numpy() if torch.is_tensor(array) else array
    # Gradio microphone audio is typically int16 PCM; convert to mono float32 in [-1, 1]
    if array.ndim > 1:
        array = array.mean(axis=1)
    if np.issubdtype(array.dtype, np.integer):
        array = array / 32768.0
    array = array.astype(np.float32)
    # VAD: pyannote accepts an in-memory dict of waveform (channel, time) + sample rate
    waveform = torch.from_numpy(array).unsqueeze(0)
    _ = vad_pipe({"waveform": waveform, "sample_rate": sr})
    # RVQ encode/decode round trip (assumes the input is already at the codec's 44.1 kHz rate)
    tensor = waveform.unsqueeze(0).to("cuda")  # (batch, channel, time)
    tensor = rvq.preprocess(tensor, sr)
    with torch.no_grad():
        z, codes, *_ = rvq.encode(tensor)
        decoded = rvq.decode(z).squeeze().cpu().numpy()
    # Ultravox inference (the custom pipeline expects "audio" and "turns" keys)
    turns = [{"role": "system", "content": "You are a helpful voice assistant."}]  # assumed system prompt
    ultra_out = ultravox_pipe({"audio": decoded, "turns": turns, "sampling_rate": sr})
    text = ultra_out if isinstance(ultra_out, str) else ultra_out.get("text", "")
    # Diffusion enhancement (the enhanced audio is not used in the response yet)
    _ = diff_pipe(raw_audio=decoded)["audios"][0]
    # Dia TTS
    tts = dia.generate(f"[emotion:neutral] {text}")
    if torch.is_tensor(tts):
        tts = tts.squeeze().cpu().numpy()
    tts = tts / (np.max(np.abs(tts)) + 1e-8) * 0.95
    return (sr, tts), text
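# Hypothetical offline sanity check, bypassing the Gradio UI:
#   sr = 44100
#   silence = np.zeros(sr, dtype=np.float32)
#   (out_sr, out_audio), reply = process_audio((sr, silence))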
# 7. Gradio UI
with gr.Blocks(title="Maya AI 📈") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    audio_input = gr.Audio(source="microphone", type="numpy", label="Your Voice")
    send_button = gr.Button("Send")
    audio_output = gr.Audio(label="AI's Response")
    text_output = gr.Textbox(label="Generated Text")
    send_button.click(process_audio, inputs=audio_input, outputs=[audio_output, text_output])
if __name__ == "__main__":
    demo.launch()