mrfakename committed
Commit 9315afa · 1 Parent(s): 0e28cb3

update audio

Files changed (1):
  app.py +38 -37
app.py CHANGED
@@ -44,6 +44,32 @@ def transcribe(ref_audio, language=None):
         return_timestamps=False,
     )["text"].strip()
 
+MODES = {
+    "Student Only (4 steps)": {
+        "teacher_steps": 0,
+        "teacher_stopping_time": 1.0,
+        "student_start_step": 0,
+        "description": "Fastest (4 steps), good quality"
+    },
+    "Teacher-Guided (8 steps)": {
+        "teacher_steps": 16,
+        "teacher_stopping_time": 0.07,
+        "student_start_step": 1,
+        "description": "Best balance (8 steps), recommended"
+    },
+    "High Diversity (16 steps)": {
+        "teacher_steps": 24,
+        "teacher_stopping_time": 0.3,
+        "student_start_step": 2,
+        "description": "More natural prosody (16 steps)"
+    },
+    "Custom": {
+        "teacher_steps": None,
+        "teacher_stopping_time": None,
+        "student_start_step": None,
+        "description": "Fine-tune all parameters"
+    }
+}
 
 @spaces.GPU(duration=120)
 def generate_speech(
@@ -67,24 +93,11 @@ def generate_speech(
         prompt_text = transcribe(prompt_audio)
 
 
-    if mode == "Student Only (4 steps)":
-        teacher_steps = 0
-        student_start_step = 0
-        teacher_stopping_time = 1.0
-    elif mode == "Teacher-Guided (8 steps)":
-        teacher_steps = 16
-        teacher_stopping_time = 0.07
-        student_start_step = 1
-    elif mode == "High Diversity (16 steps)":
-        teacher_steps = 24
-        teacher_stopping_time = 0.3
-        student_start_step = 2
-    else:  # Custom
-        teacher_steps = custom_teacher_steps
-        teacher_stopping_time = custom_teacher_stopping_time
-        student_start_step = custom_student_start_step
-
-    # Generate speech
+    if mode == "Custom":
+        teacher_steps, teacher_stopping_time, student_start_step = custom_teacher_steps, custom_teacher_stopping_time, custom_student_start_step
+    else:
+        teacher_steps, teacher_stopping_time, student_start_step = (MODES[mode][k] for k in ("teacher_steps", "teacher_stopping_time", "student_start_step"))  # explicit keys: MODES entries also carry "description"
+
     generated_audio = model.generate(
         gen_text=target_text,
         audio_path=prompt_audio,
@@ -97,27 +110,15 @@ def generate_speech(
     )
 
 
-    # Save audio
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-        output_path = tmp_file.name
-
-    if isinstance(generated_audio, np.ndarray):
-        generated_audio = torch.from_numpy(generated_audio)
-
-    if generated_audio.dim() == 1:
-        generated_audio = generated_audio.unsqueeze(0)
+    if isinstance(generated_audio, torch.Tensor):
+        audio_np = generated_audio.cpu().numpy()
+    else:
+        audio_np = generated_audio
 
-    torchaudio.save(output_path, generated_audio, 24000)
+    if audio_np.ndim == 1:
+        audio_np = np.expand_dims(audio_np, axis=0)
 
-
-    return (
-        output_path,
-        "Success!",
-        (
-            f"Mode: {mode} | Transcribed: {prompt_text[:50]}..."
-            if not prompt_text
-            else f"Mode: {mode}"
-        ),
-    )
+    return (24000, audio_np)
+
 
 # Create Gradio interface
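
For context, a minimal, self-contained sketch of the two changes this commit makes: table-driven mode lookup via MODES, and handing audio to Gradio as an in-memory (sample_rate, numpy_array) tuple instead of writing a temp .wav file. The resolve_mode helper name and the default slider values below are illustrative assumptions, not part of the commit:

import numpy as np

# Mirror of the MODES table from the commit ("description" keys omitted).
MODES = {
    "Student Only (4 steps)": {"teacher_steps": 0, "teacher_stopping_time": 1.0, "student_start_step": 0},
    "Teacher-Guided (8 steps)": {"teacher_steps": 16, "teacher_stopping_time": 0.07, "student_start_step": 1},
    "High Diversity (16 steps)": {"teacher_steps": 24, "teacher_stopping_time": 0.3, "student_start_step": 2},
}

def resolve_mode(mode, custom_teacher_steps=16,
                 custom_teacher_stopping_time=0.07, custom_student_start_step=1):
    """Return (teacher_steps, teacher_stopping_time, student_start_step) for a UI mode."""
    if mode == "Custom":
        # "Custom" falls through to the user-supplied slider values.
        return custom_teacher_steps, custom_teacher_stopping_time, custom_student_start_step
    cfg = MODES[mode]  # straight table lookup replaces the old if/elif chain
    return cfg["teacher_steps"], cfg["teacher_stopping_time"], cfg["student_start_step"]

print(resolve_mode("Teacher-Guided (8 steps)"))  # -> (16, 0.07, 1)

# The new return convention: Gradio audio outputs accept a
# (sample_rate, numpy_array) tuple, so no temp file is needed.
silence = np.zeros(24000, dtype=np.float32)  # one second of silence at 24 kHz
gradio_audio_output = (24000, silence)

Keeping the per-mode parameters in one dict means the dropdown choices, their descriptions, and the generation settings stay in sync from a single source of truth.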