Devakumar868 committed
Commit c11bb04 · verified · 1 Parent(s): 38980d2

Update app.py

Files changed (1)
  1. app.py +36 -34
app.py CHANGED
@@ -2,80 +2,82 @@ import os
  import gradio as gr
  import torch
  import numpy as np
- from transformers import pipeline, AutoTokenizer
+ from transformers import pipeline
  from diffusers import DiffusionPipeline
  from pyannote.audio import Pipeline as PyannotePipeline
- from dia.model import DiaConfig, DiaModel, Dia
+ from dia.model import Dia
  from dac.utils import load_model as load_dac_model
  from accelerate import init_empty_weights, load_checkpoint_and_dispatch

- HF_TOKEN = os.environ["HF_TOKEN"]
- device_map = "auto"
+ #-- Configuration
+ HF_TOKEN = os.environ["HF_TOKEN"]  # Gated model access[2]
+ device_map = "auto"  # Distribute models on 4×L4 GPUs[3]

- # RVQ Codec
- rvq = load_dac_model(tag="latest", model_type="44khz")
+ #-- 1. Descript Audio Codec (RVQ)
+ rvq = load_dac_model(tag="latest", model_type="44khz")  # RVQ encoder/decoder[4]
  rvq.eval()
  if torch.cuda.is_available(): rvq = rvq.to("cuda")

- # VAD Pipeline
+ #-- 2. Voice Activity Detection via Pyannote
  vad_pipe = PyannotePipeline.from_pretrained(
      "pyannote/voice-activity-detection",
      use_auth_token=HF_TOKEN
- )
+ )  # Proper gated VAD load[2]

- # Ultravox Pipeline
+ #-- 3. Ultravox ASR+LLM Pipeline
  ultravox_pipe = pipeline(
      model="fixie-ai/ultravox-v0_4",
      trust_remote_code=True,
      device_map=device_map,
      torch_dtype=torch.float16
- )
+ )  # Custom speech pipeline[2]

- # Audio Diffusion
+ #-- 4. Audio Diffusion Model (Prosody)
  diff_pipe = DiffusionPipeline.from_pretrained(
      "teticio/audio-diffusion-instrumental-hiphop-256",
      torch_dtype=torch.float16
- ).to("cuda")
+ ).to("cuda")  # Diffusers-based load[2]

- # Dia TTS Loading
- config = DiaConfig.from_pretrained("nari-labs/Dia-1.6B")
- with init_empty_weights():
-     base_model = DiaModel(config)
- base_model = load_checkpoint_and_dispatch(
-     base_model,
+ #-- 5. Dia TTS Model Sharded Across GPUs
+ dia = Dia.from_pretrained(
      "nari-labs/Dia-1.6B",
      device_map=device_map,
-     dtype=torch.float16
- )
- dia = Dia(base_model, config)
-
- # Save tokenizer for Dia text processing
- tokenizer = AutoTokenizer.from_pretrained("nari-labs/Dia-1.6B")
+     torch_dtype=torch.float16,
+     trust_remote_code=True
+ )  # Auto-sharding in Transformers[2]

+ #-- Inference Function
  def process_audio(audio):
-     sr, array = audio
-     array = array.numpy() if torch.is_tensor(array) else array
+     sr, arr = audio
+     arr = arr.numpy() if torch.is_tensor(arr) else arr

-     vad_pipe({"waveform": torch.tensor(array).unsqueeze(0), "sample_rate": sr})
-     x = torch.tensor(array).unsqueeze(0).to("cuda")
-     codes = rvq.encode(x); decoded = rvq.decode(codes).squeeze().cpu().numpy()
+     # VAD segmentation
+     _ = vad_pipe({"waveform": torch.tensor(arr).unsqueeze(0), "sample_rate": sr})
+
+     # RVQ encode/decode
+     x = torch.tensor(arr).unsqueeze(0).to("cuda")
+     codes = rvq.encode(x)
+     decoded = rvq.decode(codes).squeeze().cpu().numpy()

+     # Ultravox ASR → text
      ultra_out = ultravox_pipe({"array": decoded, "sampling_rate": sr})
      text = ultra_out.get("text", "")

+     # Diffusion-based prosody enhancement
      pros = diff_pipe(raw_audio=decoded)["audios"][0]

-     inputs = tokenizer(f"[emotion:neutral] {text}", return_tensors="pt").to("cuda")
-     tts_tensors = dia.generate(**inputs)
-     tts_np = tts_tensors.squeeze().cpu().numpy()
+     # Dia TTS synthesis
+     tts = dia.generate(f"[emotion:neutral] {text}")
+     tts_np = tts.squeeze().cpu().numpy()
      tts_np = tts_np / np.max(np.abs(tts_np)) * 0.95 if tts_np.size else tts_np

      return (sr, tts_np), text

+ #-- Gradio UI
  with gr.Blocks(title="Maya AI 📈") as demo:
      gr.Markdown("## Maya-AI: Supernatural Conversational Agent")
-     audio_in = gr.Audio(source="microphone", type="numpy", label="Your Voice")
-     send_btn = gr.Button("Send")
+     audio_in = gr.Audio(source="microphone", type="numpy", label="Your Voice")
+     send_btn = gr.Button("Send")
      audio_out = gr.Audio(label="AI Response")
      text_out = gr.Textbox(label="Generated Text")
      send_btn.click(process_audio, inputs=audio_in, outputs=[audio_out, text_out])
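
The hunk ends at the send_btn.click(...) wiring, so whatever starts the app lies outside this diff. A minimal closing sketch, assuming the untouched tail of app.py simply queues and launches the Blocks demo (these two calls are illustrative, not taken from the commit):

    # Hypothetical tail of app.py, not shown in this hunk
    demo.queue()     # serialize requests so the GPU-bound pipeline handles one at a time
    demo.launch()    # start the Gradio server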