ollieollie and victor (HF Staff) committed
Commit af25078 (verified) · Parent(s): dd21b03

- args update (1b278bb5670635d48418931730722a3654a5b86b)


Co-authored-by: Victor Mustar <victor@users.noreply.huggingface.co>

Files changed (1):
  1. app.py +26 -18
app.py CHANGED
@@ -45,26 +45,26 @@ def set_seed(seed: int):
 @spaces.GPU
 def generate_tts_audio(
     text_input: str,
-    audio_prompt_path_input: str,
-    exaggeration_input: float,
-    temperature_input: float,
-    seed_num_input: int,
-    cfgw_input: float
+    audio_prompt_path_input: str = None,
+    exaggeration_input: float = 0.5,
+    temperature_input: float = 0.8,
+    seed_num_input: int = 0,
+    cfgw_input: float = 0.5
 ) -> tuple[int, np.ndarray]:
     """
-    Generate high-quality speech audio from text using ChatterboxTTS model with reference audio styling.
+    Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
 
-    This tool synthesizes natural-sounding speech from input text, using a reference audio file
-    to capture the speaker's voice characteristics and speaking style. The generated audio
-    maintains the prosody, tone, and vocal qualities of the reference speaker.
+    This tool synthesizes natural-sounding speech from input text. When a reference audio file
+    is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
+    maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
 
     Args:
         text_input (str): The text to synthesize into speech (maximum 300 characters)
-        audio_prompt_path_input (str): File path or URL to the reference audio file that defines the target voice style
-        exaggeration_input (float): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable)
-        temperature_input (float): Controls randomness in generation (0.05-5.0, higher=more varied, default=0.8)
-        seed_num_input (int): Random seed for reproducible results (0 for random generation)
-        cfgw_input (float): CFG/Pace weight controlling generation guidance (0.2-1.0, default=0.5)
+        audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
+        exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
+        temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
+        seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
+        cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5.
 
     Returns:
         tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
@@ -78,12 +78,20 @@ def generate_tts_audio(
     set_seed(int(seed_num_input))
 
     print(f"Generating audio for text: '{text_input[:50]}...'")
+
+    # Handle optional audio prompt
+    generate_kwargs = {
+        "exaggeration": exaggeration_input,
+        "temperature": temperature_input,
+        "cfg_weight": cfgw_input,
+    }
+
+    if audio_prompt_path_input:
+        generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
+
     wav = current_model.generate(
         text_input[:300], # Truncate text to max chars
-        audio_prompt_path=audio_prompt_path_input,
-        exaggeration=exaggeration_input,
-        temperature=temperature_input,
-        cfg_weight=cfgw_input,
+        **generate_kwargs
     )
     print("Audio generation complete.")
     return (current_model.sr, wav.squeeze(0).numpy())
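
With every styling argument now carrying a default, the function can be invoked with just the text. A minimal usage sketch, assuming the app.py context above (current_model, set_seed, and the spaces decorator already initialized); the text, "reference.wav", and the parameter values are illustrative only, not part of the commit:

# Default voice: no reference clip needed now that all styling args have defaults
sr, wav = generate_tts_audio("Hello from Chatterbox!")

# Voice styling from a reference clip, with a fixed seed for reproducibility
sr, wav = generate_tts_audio(
    "Hello from Chatterbox!",
    audio_prompt_path_input="reference.wav",  # illustrative path, not from the commit
    exaggeration_input=0.7,
    seed_num_input=42,
)

Because the old keyword arguments were replaced by the generate_kwargs dict, audio_prompt_path is only forwarded to current_model.generate when a reference clip is actually supplied.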