Willing to explore Chatterbox

#10
Files changed (1) hide show
  1. app.py +18 -30
app.py CHANGED
@@ -45,29 +45,25 @@ def set_seed(seed: int):
45
  @spaces.GPU
46
  def generate_tts_audio(
47
  text_input: str,
48
- audio_prompt_path_input: str = None,
49
- exaggeration_input: float = 0.5,
50
- temperature_input: float = 0.8,
51
- seed_num_input: int = 0,
52
- cfgw_input: float = 0.5
53
  ) -> tuple[int, np.ndarray]:
54
  """
55
- Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
56
-
57
- This tool synthesizes natural-sounding speech from input text. When a reference audio file
58
- is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
59
- maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
60
 
61
  Args:
62
- text_input (str): The text to synthesize into speech (maximum 300 characters)
63
- audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
64
- exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
65
- temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
66
- seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
67
- cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5.
68
 
69
  Returns:
70
- tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
71
  """
72
  current_model = get_or_load_model()
73
 
@@ -78,20 +74,12 @@ def generate_tts_audio(
78
  set_seed(int(seed_num_input))
79
 
80
  print(f"Generating audio for text: '{text_input[:50]}...'")
81
-
82
- # Handle optional audio prompt
83
- generate_kwargs = {
84
- "exaggeration": exaggeration_input,
85
- "temperature": temperature_input,
86
- "cfg_weight": cfgw_input,
87
- }
88
-
89
- if audio_prompt_path_input:
90
- generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
91
-
92
  wav = current_model.generate(
93
  text_input[:300], # Truncate text to max chars
94
- **generate_kwargs
 
 
 
95
  )
96
  print("Audio generation complete.")
97
  return (current_model.sr, wav.squeeze(0).numpy())
@@ -145,4 +133,4 @@ with gr.Blocks() as demo:
145
  outputs=[audio_output],
146
  )
147
 
148
- demo.launch(mcp_server=True)
 
45
  @spaces.GPU
46
  def generate_tts_audio(
47
  text_input: str,
48
+ audio_prompt_path_input: str,
49
+ exaggeration_input: float,
50
+ temperature_input: float,
51
+ seed_num_input: int,
52
+ cfgw_input: float
53
  ) -> tuple[int, np.ndarray]:
54
  """
55
+ Generates TTS audio using the ChatterboxTTS model.
 
 
 
 
56
 
57
  Args:
58
+ text_input: The text to synthesize (max 300 characters).
59
+ audio_prompt_path_input: Path to the reference audio file.
60
+ exaggeration_input: Exaggeration parameter for the model.
61
+ temperature_input: Temperature parameter for the model.
62
+ seed_num_input: Random seed (0 for random).
63
+ cfgw_input: CFG/Pace weight.
64
 
65
  Returns:
66
+ A tuple containing the sample rate (int) and the audio waveform (numpy.ndarray).
67
  """
68
  current_model = get_or_load_model()
69
 
 
74
  set_seed(int(seed_num_input))
75
 
76
  print(f"Generating audio for text: '{text_input[:50]}...'")
 
 
 
 
 
 
 
 
 
 
 
77
  wav = current_model.generate(
78
  text_input[:300], # Truncate text to max chars
79
+ audio_prompt_path=audio_prompt_path_input,
80
+ exaggeration=exaggeration_input,
81
+ temperature=temperature_input,
82
+ cfg_weight=cfgw_input,
83
  )
84
  print("Audio generation complete.")
85
  return (current_model.sr, wav.squeeze(0).numpy())
 
133
  outputs=[audio_output],
134
  )
135
 
136
+ demo.launch()