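"""Maya-AI Gradio Space: microphone input -> pyannote VAD -> DAC RVQ codec
round-trip -> Ultravox speech LLM -> Dia text-to-speech response."""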
import os
import gradio as gr
import torch
import torchaudio  # used below to resample mic audio to the codec's rate
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
from pyannote.audio import Pipeline as PyannotePipeline
from dia.model import Dia
from dac.utils import load_model as load_dac_model
HF_TOKEN = os.environ.get("HF_TOKEN")  # required: the pyannote VAD model is gated
device = "cuda" if torch.cuda.is_available() else "cpu"
device_map = "auto"
# RVQ codec (Descript Audio Codec, 44.1 kHz variant) for an encode/decode round-trip
rvq = load_dac_model(tag="latest", model_type="44khz")
rvq.eval()
rvq = rvq.to(device)
# Voice-activity detection (pyannote; HF_TOKEN must have access to the gated checkpoint)
vad_pipe = PyannotePipeline.from_pretrained(
"pyannote/voice-activity-detection",
use_auth_token=HF_TOKEN
)
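# vad_pipe returns a pyannote Annotation of speech regions; process_audio below
# invokes it but does not yet act on the result.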
# Ultravox speech LLM (custom pipeline registered via trust_remote_code)
ultravox_pipe = pipeline(
model="fixie-ai/ultravox-v0_4",
trust_remote_code=True,
device_map=device_map,
torch_dtype=torch.float16
)
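# Per the model card's example, the Ultravox pipeline takes a dict with the keys
# "audio", "sampling_rate", and "turns", and returns the generated reply text.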
# Audio diffusion for an instrumental texture
diff_pipe = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-instrumental-hiphop-256",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
).to(device)
# Dia TTS. load_checkpoint_and_dispatch() expects a local checkpoint path, not a
# Hub repo id, so the empty-weights/dispatch route above could never run; use the
# loader the dia package documents instead. Dia.generate() consumes raw tagged
# text, so no separate tokenizer is needed.
dia = Dia.from_pretrained("nari-labs/Dia-1.6B")
def process_audio(audio):
    sr, array = audio
    array = array.numpy() if torch.is_tensor(array) else array
    # Gradio microphone audio arrives as int16 PCM; convert to mono float32 in [-1, 1]
    if np.issubdtype(array.dtype, np.integer):
        array = array.astype(np.float32) / 32768.0
    if array.ndim == 2:
        array = array.mean(axis=1)
    waveform = torch.from_numpy(array).float().unsqueeze(0)  # (channel, time)
    if sr != rvq.sample_rate:  # DAC asserts on its native 44.1 kHz rate
        waveform = torchaudio.functional.resample(waveform, sr, rvq.sample_rate)
        sr = rvq.sample_rate
    vad_pipe({"waveform": waveform, "sample_rate": sr})  # speech map, currently unused
    with torch.no_grad():
        # DAC round-trip: encode() returns (z, codes, latents, ...); decode() takes z
        x = rvq.preprocess(waveform.unsqueeze(0).to(device), sr)
        z, codes, latents, _, _ = rvq.encode(x)
        decoded = rvq.decode(z).squeeze().cpu().numpy()
    ultra_out = ultravox_pipe({"audio": decoded, "sampling_rate": sr, "turns": []})
    text = ultra_out if isinstance(ultra_out, str) else ultra_out.get("text", "")
    pros = diff_pipe(raw_audio=decoded)["audios"][0]  # instrumental bed, not yet mixed in
    tts_np = dia.generate(f"[S1] {text}")  # Dia uses [S1]/[S2] speaker tags, not "[emotion:...]"
    tts_np = np.asarray(tts_np).squeeze()
    peak = np.abs(tts_np).max() if tts_np.size else 0.0
    if peak > 0:  # peak-normalize, guarding against silent output
        tts_np = tts_np / peak * 0.95
    return (44100, tts_np), text  # Dia renders audio at 44.1 kHz
with gr.Blocks(title="Maya AI π") as demo:
    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
    # Gradio 4.x replaced `source="microphone"` with `sources=["microphone"]`
    audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Your Voice")
    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
    text_out = gr.Textbox(label="Generated Text")
    send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])
if __name__ == "__main__":
    demo.launch()
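# Deployment note (assumption): the Space needs an HF_TOKEN secret with access
# to pyannote/voice-activity-detection, and a GPU runtime for the fp16 models.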