Devakumar868 committed
Commit 036f56f · verified · 1 Parent(s): 6e55da8

Update app.py

Files changed (1): app.py +30 -30
app.py CHANGED
@@ -3,31 +3,29 @@ import gradio as gr
  import torch
  import numpy as np
  from transformers import pipeline
+ from pyannote.audio import Pipeline as PyannotePipeline
  from dia.model import Dia
  from dac.utils import load_model as load_dac_model
  from accelerate import init_empty_weights, load_checkpoint_and_dispatch

- # Retrieve your HF token from the Space secrets
+ # Environment token
  HF_TOKEN = os.environ["HF_TOKEN"]

- # Automatically shard across 4× L4 GPUs
+ # Shard large models across 4× L4 GPUs
  device_map = "auto"

- # 1. Load RVQ codec
+ # 1. RVQ codec (Descript Audio Codec)
  rvq = load_dac_model(tag="latest", model_type="44khz")
  rvq.eval()
- if torch.cuda.is_available():
-     rvq = rvq.to("cuda")
-
- # 2. Load VAD via Hugging Face pipeline (no segmentation mismatch)
- vad_pipe = pipeline(
-     "voice-activity-detection",
-     model="pyannote/voice-activity-detection",
-     use_auth_token=HF_TOKEN,
-     device=0 if torch.cuda.is_available() else -1
+ if torch.cuda.is_available(): rvq = rvq.to("cuda")
+
+ # 2. Voice Activity Detection via Pyannote
+ vad_pipe = PyannotePipeline.from_pretrained(
+     "pyannote/voice-activity-detection",
+     use_auth_token=HF_TOKEN
  )

- # 3. Load Ultravox (speech-in → text+LLM)
+ # 3. Ultravox pipeline (speech → text + LLM)
  ultravox_pipe = pipeline(
      model="fixie-ai/ultravox-v0_4",
      trust_remote_code=True,
@@ -35,7 +33,7 @@ ultravox_pipe = pipeline(
      torch_dtype=torch.float16
  )

- # 4. Load diffusion prosody model
+ # 4. Diffusion-based prosody model
  diff_pipe = pipeline(
      "audio-to-audio",
      model="teticio/audio-diffusion-instrumental-hiphop-256",
@@ -44,7 +42,7 @@ diff_pipe = pipeline(
      torch_dtype=torch.float16
  )

- # 5. Load Dia TTS with multi-GPU dispatch
+ # 5. Dia TTS loaded with multi-GPU dispatch
  with init_empty_weights():
      dia = Dia.from_pretrained(
          "nari-labs/Dia-1.6B",
@@ -58,40 +56,42 @@ dia = load_checkpoint_and_dispatch(
      dtype=torch.float16
  )

- # 6. Inference function
+ # Inference function
  def process_audio(audio):
      sr, array = audio
-     array = array.numpy() if torch.is_tensor(array) else array
+     # Ensure numpy
+     if torch.is_tensor(array): array = array.numpy()

-     # Voice activity detection
-     speech = vad_pipe(array, sampling_rate=sr)[0]["chunks"]
+     # VAD: extract speech regions
+     chunks = vad_pipe(array, sampling_rate=sr)

      # RVQ encode/decode
      x = torch.tensor(array).unsqueeze(0).to("cuda")
      codes = rvq.encode(x)
      decoded = rvq.decode(codes).squeeze().cpu().numpy()

-     # Ultravox ASR LLM
+     # Ultravox ASR + LLM
      out = ultravox_pipe({"array": decoded, "sampling_rate": sr})
      text = out.get("text", "")

-     # Diffusion-based prosody
-     pros = diff_pipe({"array": decoded, "sampling_rate": sr})["array"][0]
+     # Diffusion prosody enhancement
+     pros_audio = diff_pipe({"array": decoded, "sampling_rate": sr})["array"][0]

-     # Dia TTS synth
-     tts = dia.generate(f"[emotion:neutral] {text}").squeeze().cpu().numpy()
-     tts = tts / np.max(np.abs(tts)) * 0.95
+     # Dia TTS synthesis
+     tts = dia.generate(f"[emotion:neutral] {text}")
+     tts_np = tts.squeeze().cpu().numpy()
+     tts_np = tts_np / np.max(np.abs(tts_np)) * 0.95

-     return (sr, tts), text
+     return (sr, tts_np), text

- # 7. Gradio UI
- with gr.Blocks() as demo:
+ # Gradio UI
+ with gr.Blocks(title="Maya AI 📈", theme=None) as demo:
      gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
      audio_in = gr.Audio(source="microphone", type="numpy", label="Your Voice")
-     send = gr.Button("Send")
+     send_btn = gr.Button("Send")
      audio_out = gr.Audio(label="AI’s Response")
      text_out = gr.Textbox(label="Generated Text")
-     send.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])
+     send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])

  if __name__ == "__main__":
      demo.launch()
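
Note on the pyannote call in the new code: a pyannote.audio Pipeline is normally invoked with a file path or an in-memory {"waveform", "sample_rate"} mapping and returns an Annotation, rather than being called as vad_pipe(array, sampling_rate=sr). A minimal sketch of that invocation, assuming pyannote.audio ≥ 2.x, a mono float32 signal, and an illustrative helper name:

import numpy as np
import torch

def run_vad(vad_pipe, array: np.ndarray, sr: int):
    # pyannote expects a (channel, time) float tensor plus its sample rate
    waveform = torch.from_numpy(array.astype(np.float32)).unsqueeze(0)
    annotation = vad_pipe({"waveform": waveform, "sample_rate": sr})
    # Collapse the annotation into (start, end) speech regions in seconds
    return [(seg.start, seg.end) for seg in annotation.get_timeline().support()]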
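
The RVQ round trip is also condensed: in recent descript-audio-codec releases, encode returns several tensors and decode consumes the continuous latents rather than the integer codes. A sketch under that assumption (the preprocessing call, device handling, and 44.1 kHz expectation are illustrative):

import torch

@torch.inference_mode()
def rvq_roundtrip(rvq, array, sr: int):
    device = next(rvq.parameters()).device
    # DAC expects (batch, channels, samples); preprocess pads to the model's hop length
    x = torch.tensor(array, dtype=torch.float32).view(1, 1, -1).to(device)
    x = rvq.preprocess(x, sr)
    # encode returns (z, codes, latents, commitment_loss, codebook_loss)
    z, codes, *_ = rvq.encode(x)
    # decode reconstructs audio from the continuous latents z
    return rvq.decode(z).squeeze().cpu().numpy()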
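
Two smaller robustness points: gr.Audio(type="numpy") often hands back int16 samples, and the peak normalization in process_audio divides by zero on a silent clip. A small guard covering both (helper name is illustrative):

import numpy as np

def to_float_and_normalize(array: np.ndarray, target_peak: float = 0.95) -> np.ndarray:
    # Gradio microphone input is often int16; scale it into [-1.0, 1.0]
    if np.issubdtype(array.dtype, np.integer):
        scale = float(np.iinfo(array.dtype).max)
        array = array.astype(np.float32) / scale
    peak = float(np.max(np.abs(array))) if array.size else 0.0
    # Skip normalization for silent or empty clips to avoid dividing by zero
    return array if peak == 0.0 else (array / peak) * target_peak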