Devakumar868 committed
Commit 3972023 · verified · 1 Parent(s): 0fbde27

Update app.py

Files changed (1)
  1. app.py +28 -27
app.py CHANGED
@@ -2,28 +2,28 @@ import os
 import gradio as gr
 import torch
 import numpy as np
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer
 from diffusers import DiffusionPipeline
 from pyannote.audio import Pipeline as PyannotePipeline
-from dia.model import Dia
+from dia.model import DiaConfig, DiaModel, Dia
 from dac.utils import load_model as load_dac_model
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 
-# Load HF token and configure multi-GPU sharding
 HF_TOKEN = os.environ["HF_TOKEN"]
 device_map = "auto"
 
-# 1. Descript Audio Codec (RVQ)
+# RVQ Codec
 rvq = load_dac_model(tag="latest", model_type="44khz")
 rvq.eval()
 if torch.cuda.is_available(): rvq = rvq.to("cuda")
 
-# 2. Voice Activity Detection via Pyannote
+# VAD Pipeline
 vad_pipe = PyannotePipeline.from_pretrained(
     "pyannote/voice-activity-detection",
     use_auth_token=HF_TOKEN
 )
 
-# 3. Ultravox ASR+LLM (generic pipeline)
+# Ultravox Pipeline
 ultravox_pipe = pipeline(
     model="fixie-ai/ultravox-v0_4",
     trust_remote_code=True,
@@ -31,50 +31,51 @@ ultravox_pipe = pipeline(
     torch_dtype=torch.float16
 )
 
-# 4. Audio Diffusion (Diffusers loader)
+# Audio Diffusion
 diff_pipe = DiffusionPipeline.from_pretrained(
     "teticio/audio-diffusion-instrumental-hiphop-256",
     torch_dtype=torch.float16
 ).to("cuda")
 
-# 5. Dia TTS with device sharding
-dia = Dia.from_pretrained(
+# Dia TTS Loading
+config = DiaConfig.from_pretrained("nari-labs/Dia-1.6B")
+with init_empty_weights():
+    base_model = DiaModel(config)
+base_model = load_checkpoint_and_dispatch(
+    base_model,
     "nari-labs/Dia-1.6B",
     device_map=device_map,
-    torch_dtype=torch.float16,
-    trust_remote_code=True
+    dtype=torch.float16
 )
+dia = Dia(base_model, config)
+
+# Save tokenizer for Dia text processing
+tokenizer = AutoTokenizer.from_pretrained("nari-labs/Dia-1.6B")
 
 def process_audio(audio):
     sr, array = audio
     array = array.numpy() if torch.is_tensor(array) else array
 
-    # VAD segmentation
-    _ = vad_pipe({"waveform": torch.tensor(array).unsqueeze(0), "sample_rate": sr})
-
-    # RVQ encode/decode
-    x = torch.tensor(array).unsqueeze(0).to("cuda")
-    codes = rvq.encode(x)
-    decoded = rvq.decode(codes).squeeze().cpu().numpy()
+    vad_pipe({"waveform": torch.tensor(array).unsqueeze(0), "sample_rate": sr})
+    x = torch.tensor(array).unsqueeze(0).to("cuda")
+    codes = rvq.encode(x); decoded = rvq.decode(codes).squeeze().cpu().numpy()
 
-    # Ultravox: speech → text
-    out = ultravox_pipe({"array": decoded, "sampling_rate": sr})
-    text = out.get("text", "")
+    ultra_out = ultravox_pipe({"array": decoded, "sampling_rate": sr})
+    text = ultra_out.get("text", "")
 
-    # Diffusion-based prosody
     pros = diff_pipe(raw_audio=decoded)["audios"][0]
 
-    # Dia TTS synthesis
-    tts = dia.generate(f"[emotion:neutral] {text}")
-    tts_np = tts.squeeze().cpu().numpy()
+    inputs = tokenizer(f"[emotion:neutral] {text}", return_tensors="pt").to("cuda")
+    tts_tensors = dia.generate(**inputs)
+    tts_np = tts_tensors.squeeze().cpu().numpy()
     tts_np = tts_np / np.max(np.abs(tts_np)) * 0.95 if tts_np.size else tts_np
 
     return (sr, tts_np), text
 
 with gr.Blocks(title="Maya AI 📈") as demo:
     gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
-    audio_in = gr.Audio(source="microphone", type="numpy", label="Your Voice")
-    send_btn = gr.Button("Send")
+    audio_in = gr.Audio(source="microphone", type="numpy", label="Your Voice")
+    send_btn = gr.Button("Send")
    audio_out = gr.Audio(label="AI Response")
     text_out = gr.Textbox(label="Generated Text")
     send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])
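The substantive change in this commit is how Dia is loaded: the earlier revision called Dia.from_pretrained(...) with a device_map, while this one instantiates the model on the meta device and shards its weights with accelerate (init_empty_weights plus load_checkpoint_and_dispatch). A minimal sketch of that general pattern follows; it is not part of the commit, and the "gpt2" config and local checkpoint path are illustrative assumptions standing in for the Dia classes used in app.py.

import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoConfig, AutoModelForCausalLM

# Hypothetical stand-ins: any config/model pair and a local weights folder work here.
config = AutoConfig.from_pretrained("gpt2")

# Build the module graph without allocating real weights (meta device).
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

# Load the checkpoint and place each submodule per device_map="auto",
# casting weights to fp16 as they are loaded.
model = load_checkpoint_and_dispatch(
    model,
    checkpoint="path/to/local/checkpoint",  # hypothetical local weights folder
    device_map="auto",
    dtype=torch.float16,
)

Compared with loading the full model on one device and moving it afterwards, this pattern only materializes each weight tensor once, on the device it is dispatched to, which is why it is commonly used to fit large checkpoints across several GPUs.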