Devakumar868 committed on
Commit 6e55da8 · verified · 1 Parent(s): 42e6e01

Update app.py

Files changed (1)
  1. app.py +48 -84
app.py CHANGED
@@ -1,41 +1,33 @@
+import os
 import gradio as gr
 import torch
 import numpy as np
-import os
-from transformers import pipeline, AutoProcessor, CsmForConditionalGeneration
-from pyannote.audio import Model, Inference
+from transformers import pipeline
 from dia.model import Dia
 from dac.utils import load_model as load_dac_model
 from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 
-# Access HF_TOKEN from environment variables (Secrets)
-HF_TOKEN = os.environ.get("HF_TOKEN")
+# Retrieve your HF token from the Space secrets
+HF_TOKEN = os.environ["HF_TOKEN"]
 
-# Device mapping for 4× L4 GPU distribution
+# Automatically shard across 4× L4 GPUs
 device_map = "auto"
 
-print("Loading models...")
-
-# Load Descript Audio Codec (RVQ) at startup
-print("Loading RVQ Codec...")
+# 1. Load RVQ codec
 rvq = load_dac_model(tag="latest", model_type="44khz")
 rvq.eval()
 if torch.cuda.is_available():
     rvq = rvq.to("cuda")
 
-# Load segmentation model with authentication
-print("Loading Segmentation Model...")
-seg_model = Model.from_pretrained(
-    "pyannote/segmentation",
-    use_auth_token=HF_TOKEN
+# 2. Load VAD via Hugging Face pipeline (no segmentation mismatch)
+vad_pipe = pipeline(
+    "voice-activity-detection",
+    model="pyannote/voice-activity-detection",
+    use_auth_token=HF_TOKEN,
+    device=0 if torch.cuda.is_available() else -1
 )
-seg_inference = Inference(seg_model, device=0 if torch.cuda.is_available() else -1)
 
-# Use segmentation model for VAD
-vad = seg_inference
-
-# Load Ultravox via generic pipeline (without specifying task)
-print("Loading Ultravox...")
+# 3. Load Ultravox (speech-in → text+LLM)
 ultravox_pipe = pipeline(
     model="fixie-ai/ultravox-v0_4",
     trust_remote_code=True,
@@ -43,8 +35,7 @@ ultravox_pipe = pipeline(
     torch_dtype=torch.float16
 )
 
-# Load Diffusion model
-print("Loading Diffusion Model...")
+# 4. Load diffusion prosody model
 diff_pipe = pipeline(
     "audio-to-audio",
     model="teticio/audio-diffusion-instrumental-hiphop-256",
@@ -53,8 +44,7 @@ diff_pipe = pipeline(
     torch_dtype=torch.float16
 )
 
-# Load Dia TTS with multi-GPU dispatch
-print("Loading Dia TTS...")
+# 5. Load Dia TTS with multi-GPU dispatch
 with init_empty_weights():
     dia = Dia.from_pretrained(
         "nari-labs/Dia-1.6B",
@@ -68,66 +58,40 @@ dia = load_checkpoint_and_dispatch(
     dtype=torch.float16
 )
 
-print("All models loaded successfully!")
-
-# Gradio inference function
+# 6. Inference function
 def process_audio(audio):
-    try:
-        if audio is None:
-            return None, "No audio input provided"
-
-        sr, array = audio
-
-        # Ensure audio is numpy array
-        if torch.is_tensor(array):
-            array = array.numpy()
-
-        # VAD segmentation
-        segments = vad({"waveform": torch.tensor(array).unsqueeze(0), "sample_rate": sr})
-
-        # RVQ encode/decode
-        audio_tensor = torch.tensor(array).unsqueeze(0)
-        if torch.cuda.is_available():
-            audio_tensor = audio_tensor.to("cuda")
-        codes = rvq.encode(audio_tensor)
-        decoded = rvq.decode(codes)
-        array = decoded.squeeze().cpu().numpy()
-
-        # Ultravox ASR→LLM
-        ultra_out = ultravox_pipe({"array": array, "sampling_rate": sr})
-        text = ultra_out.get("text", "I understand your audio input.")
-
-        # Diffusion-based prosody enhancement
-        prosody_audio = diff_pipe({"array": decoded.cpu().numpy(), "sampling_rate": sr})["array"][0]
-
-        # Dia TTS
-        tts_audio = dia.generate(f"[emotion:neutral] {text}")
-        tts_np = tts_audio.squeeze().cpu().numpy()
-
-        # Normalize
-        tts_np = tts_np / np.max(np.abs(tts_np)) * 0.95
-
-        return (sr, tts_np), text
-
-    except Exception as e:
-        print(f"Error in process_audio: {e}")
-        return None, f"Processing error: {str(e)}"
-
-# Gradio UI
-with gr.Blocks(title="Maya-AI: Supernatural Speech Agent") as demo:
-    gr.Markdown("# Maya-AI: Supernatural Speech Agent")
-    gr.Markdown("Record audio to interact with the AI agent that understands emotions and responds naturally.")
-
-    with gr.Row():
-        with gr.Column():
-            audio_in = gr.Audio(source="microphone", type="numpy", label="Record Your Voice")
-            btn = gr.Button("Send", variant="primary")
-
-        with gr.Column():
-            audio_out = gr.Audio(label="AI Response")
-            txt_out = gr.Textbox(label="Transcribed & Generated Text", lines=3)
-
-    btn.click(fn=process_audio, inputs=audio_in, outputs=[audio_out, txt_out])
+    sr, array = audio
+    array = array.numpy() if torch.is_tensor(array) else array
+
+    # Voice activity detection
+    speech = vad_pipe(array, sampling_rate=sr)[0]["chunks"]
+
+    # RVQ encode/decode
+    x = torch.tensor(array).unsqueeze(0).to("cuda")
+    codes = rvq.encode(x)
+    decoded = rvq.decode(codes).squeeze().cpu().numpy()
+
+    # Ultravox ASR → LLM
+    out = ultravox_pipe({"array": decoded, "sampling_rate": sr})
+    text = out.get("text", "")
+
+    # Diffusion-based prosody
+    pros = diff_pipe({"array": decoded, "sampling_rate": sr})["array"][0]
+
+    # Dia TTS synth
+    tts = dia.generate(f"[emotion:neutral] {text}").squeeze().cpu().numpy()
+    tts = tts / np.max(np.abs(tts)) * 0.95
+
+    return (sr, tts), text
+
+# 7. Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
+    audio_in = gr.Audio(source="microphone", type="numpy", label="Your Voice")
+    send = gr.Button("Send")
+    audio_out = gr.Audio(label="AI’s Response")
+    text_out = gr.Textbox(label="Generated Text")
+    send.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])
 
 if __name__ == "__main__":
     demo.launch()
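
A minimal local sanity check for the rewritten process_audio path could look like the sketch below. It is not part of the committed app.py: the synthetic test tone and the in-process call are illustrative assumptions, and it presumes all models loaded without error in the same Python session.

# Hypothetical smoke test, run after app.py has finished loading its models.
import numpy as np

sr = 44100                                    # matches the 44 kHz DAC codec
t = np.linspace(0, 1.0, sr, dtype=np.float32)
tone = 0.1 * np.sin(2 * np.pi * 220.0 * t)    # one second of a 220 Hz test tone

# process_audio expects Gradio's numpy audio format: (sample_rate, array)
(out_sr, out_audio), reply_text = process_audio((sr, tone))
print(out_sr, out_audio.shape, reply_text)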