Spaces:

ResembleAI
/

Chatterbox

Running on Zero

App Files Files Community

Willing to explore Chatterbox

#10

by MStrange11 - opened 17 days ago

base: refs/heads/main

←

from: refs/pr/10

Discussion Files changed

+18

-30

This PR is in draft mode

Files changed (1) hide show

app.py +18 -30

app.py CHANGED Viewed

@@ -45,29 +45,25 @@ def set_seed(seed: int):
 @spaces.GPU
 def generate_tts_audio(
     text_input: str,
-    audio_prompt_path_input: str = None,
-    exaggeration_input: float = 0.5,
-    temperature_input: float = 0.8,
-    seed_num_input: int = 0,
-    cfgw_input: float = 0.5
 ) -> tuple[int, np.ndarray]:
     """
-    Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
-    This tool synthesizes natural-sounding speech from input text. When a reference audio file
-    is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
-    maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
     Args:
-        text_input (str): The text to synthesize into speech (maximum 300 characters)
-        audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
-        exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
-        temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
-        seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
-        cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5.
     Returns:
-        tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
     """
     current_model = get_or_load_model()
@@ -78,20 +74,12 @@ def generate_tts_audio(
         set_seed(int(seed_num_input))
     print(f"Generating audio for text: '{text_input[:50]}...'")
-    # Handle optional audio prompt
-    generate_kwargs = {
-        "exaggeration": exaggeration_input,
-        "temperature": temperature_input,
-        "cfg_weight": cfgw_input,
-    }
-    if audio_prompt_path_input:
-        generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
     wav = current_model.generate(
         text_input[:300],  # Truncate text to max chars
-        **generate_kwargs
     )
     print("Audio generation complete.")
     return (current_model.sr, wav.squeeze(0).numpy())
@@ -145,4 +133,4 @@ with gr.Blocks() as demo:
         outputs=[audio_output],
     )
-demo.launch(mcp_server=True)

 @spaces.GPU
 def generate_tts_audio(
     text_input: str,
+    audio_prompt_path_input: str,
+    exaggeration_input: float,
+    temperature_input: float,
+    seed_num_input: int,
+    cfgw_input: float
 ) -> tuple[int, np.ndarray]:
     """
+    Generates TTS audio using the ChatterboxTTS model.
     Args:
+        text_input: The text to synthesize (max 300 characters).
+        audio_prompt_path_input: Path to the reference audio file.
+        exaggeration_input: Exaggeration parameter for the model.
+        temperature_input: Temperature parameter for the model.
+        seed_num_input: Random seed (0 for random).
+        cfgw_input: CFG/Pace weight.
     Returns:
+        A tuple containing the sample rate (int) and the audio waveform (numpy.ndarray).
     """
     current_model = get_or_load_model()
         set_seed(int(seed_num_input))
     print(f"Generating audio for text: '{text_input[:50]}...'")
     wav = current_model.generate(
         text_input[:300],  # Truncate text to max chars
+        audio_prompt_path=audio_prompt_path_input,
+        exaggeration=exaggeration_input,
+        temperature=temperature_input,
+        cfg_weight=cfgw_input,
     )
     print("Audio generation complete.")
     return (current_model.sr, wav.squeeze(0).numpy())
         outputs=[audio_output],
     )
+demo.launch()