ollieollie and victor (HF Staff) committed
Commit af25078 (verified) · Parent(s): dd21b03

- args update (1b278bb5670635d48418931730722a3654a5b86b)


Co-authored-by: Victor Mustar <victor@users.noreply.huggingface.co>

Files changed (1):
  1. app.py +26 -18
app.py CHANGED
@@ -45,26 +45,26 @@ def set_seed(seed: int):
 @spaces.GPU
 def generate_tts_audio(
     text_input: str,
-    audio_prompt_path_input: str,
-    exaggeration_input: float,
-    temperature_input: float,
-    seed_num_input: int,
-    cfgw_input: float
+    audio_prompt_path_input: str = None,
+    exaggeration_input: float = 0.5,
+    temperature_input: float = 0.8,
+    seed_num_input: int = 0,
+    cfgw_input: float = 0.5
 ) -> tuple[int, np.ndarray]:
     """
-    Generate high-quality speech audio from text using ChatterboxTTS model with reference audio styling.
+    Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
 
-    This tool synthesizes natural-sounding speech from input text, using a reference audio file
-    to capture the speaker's voice characteristics and speaking style. The generated audio
-    maintains the prosody, tone, and vocal qualities of the reference speaker.
+    This tool synthesizes natural-sounding speech from input text. When a reference audio file
+    is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
+    maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
 
     Args:
         text_input (str): The text to synthesize into speech (maximum 300 characters)
-        audio_prompt_path_input (str): File path or URL to the reference audio file that defines the target voice style
-        exaggeration_input (float): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable)
-        temperature_input (float): Controls randomness in generation (0.05-5.0, higher=more varied, default=0.8)
-        seed_num_input (int): Random seed for reproducible results (0 for random generation)
-        cfgw_input (float): CFG/Pace weight controlling generation guidance (0.2-1.0, default=0.5)
+        audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
+        exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
+        temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
+        seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
+        cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5.
 
     Returns:
         tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
@@ -78,12 +78,20 @@ def generate_tts_audio(
     set_seed(int(seed_num_input))
 
     print(f"Generating audio for text: '{text_input[:50]}...'")
+
+    # Handle optional audio prompt
+    generate_kwargs = {
+        "exaggeration": exaggeration_input,
+        "temperature": temperature_input,
+        "cfg_weight": cfgw_input,
+    }
+
+    if audio_prompt_path_input:
+        generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
+
     wav = current_model.generate(
         text_input[:300], # Truncate text to max chars
-        audio_prompt_path=audio_prompt_path_input,
-        exaggeration=exaggeration_input,
-        temperature=temperature_input,
-        cfg_weight=cfgw_input,
+        **generate_kwargs
     )
     print("Audio generation complete.")
     return (current_model.sr, wav.squeeze(0).numpy())
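
With every styling argument now carrying a default, the function can be invoked with just the text. A minimal usage sketch, assuming the app.py context above (current_model, set_seed, and the spaces decorator already initialized); the text, "reference.wav", and the parameter values are illustrative only, not part of the commit:

# Default voice: no reference clip needed now that all styling args have defaults
sr, wav = generate_tts_audio("Hello from Chatterbox!")

# Voice styling from a reference clip, with a fixed seed for reproducibility
sr, wav = generate_tts_audio(
    "Hello from Chatterbox!",
    audio_prompt_path_input="reference.wav",  # illustrative path, not from the commit
    exaggeration_input=0.7,
    seed_num_input=42,
)

Because the old keyword arguments were replaced by the generate_kwargs dict, audio_prompt_path is only forwarded to current_model.generate when a reference clip is actually supplied.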