Spaces:
Running
on
Zero
Running
on
Zero
mcp-2
#13
by
victor
HF Staff
- opened
app.py
CHANGED
@@ -45,26 +45,26 @@ def set_seed(seed: int):
|
|
45 |
@spaces.GPU
|
46 |
def generate_tts_audio(
|
47 |
text_input: str,
|
48 |
-
audio_prompt_path_input: str,
|
49 |
-
exaggeration_input: float,
|
50 |
-
temperature_input: float,
|
51 |
-
seed_num_input: int,
|
52 |
-
cfgw_input: float
|
53 |
) -> tuple[int, np.ndarray]:
|
54 |
"""
|
55 |
-
Generate high-quality speech audio from text using ChatterboxTTS model with reference audio styling.
|
56 |
|
57 |
-
This tool synthesizes natural-sounding speech from input text
|
58 |
-
|
59 |
-
maintains the prosody, tone, and vocal qualities of the reference speaker.
|
60 |
|
61 |
Args:
|
62 |
text_input (str): The text to synthesize into speech (maximum 300 characters)
|
63 |
-
audio_prompt_path_input (str): File path or URL to the reference audio file that defines the target voice style
|
64 |
-
exaggeration_input (float): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable)
|
65 |
-
temperature_input (float): Controls randomness in generation (0.05-5.0, higher=more varied)
|
66 |
-
seed_num_input (int): Random seed for reproducible results (0 for random generation)
|
67 |
-
cfgw_input (float): CFG/Pace weight controlling generation guidance (0.2-1.0)
|
68 |
|
69 |
Returns:
|
70 |
tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
|
@@ -78,12 +78,20 @@ def generate_tts_audio(
|
|
78 |
set_seed(int(seed_num_input))
|
79 |
|
80 |
print(f"Generating audio for text: '{text_input[:50]}...'")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
wav = current_model.generate(
|
82 |
text_input[:300], # Truncate text to max chars
|
83 |
-
|
84 |
-
exaggeration=exaggeration_input,
|
85 |
-
temperature=temperature_input,
|
86 |
-
cfg_weight=cfgw_input,
|
87 |
)
|
88 |
print("Audio generation complete.")
|
89 |
return (current_model.sr, wav.squeeze(0).numpy())
|
|
|
45 |
@spaces.GPU
|
46 |
def generate_tts_audio(
|
47 |
text_input: str,
|
48 |
+
audio_prompt_path_input: str = None,
|
49 |
+
exaggeration_input: float = 0.5,
|
50 |
+
temperature_input: float = 0.8,
|
51 |
+
seed_num_input: int = 0,
|
52 |
+
cfgw_input: float = 0.5
|
53 |
) -> tuple[int, np.ndarray]:
|
54 |
"""
|
55 |
+
Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
|
56 |
|
57 |
+
This tool synthesizes natural-sounding speech from input text. When a reference audio file
|
58 |
+
is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
|
59 |
+
maintains the prosody, tone, and vocal qualities of the reference speaker, or uses the default voice if no reference is provided.
|
60 |
|
61 |
Args:
|
62 |
text_input (str): The text to synthesize into speech (maximum 300 characters)
|
63 |
+
audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
|
64 |
+
exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
|
65 |
+
temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
|
66 |
+
seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
|
67 |
+
cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5.
|
68 |
|
69 |
Returns:
|
70 |
tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
|
|
|
78 |
set_seed(int(seed_num_input))
|
79 |
|
80 |
print(f"Generating audio for text: '{text_input[:50]}...'")
|
81 |
+
|
82 |
+
# Handle optional audio prompt
|
83 |
+
generate_kwargs = {
|
84 |
+
"exaggeration": exaggeration_input,
|
85 |
+
"temperature": temperature_input,
|
86 |
+
"cfg_weight": cfgw_input,
|
87 |
+
}
|
88 |
+
|
89 |
+
if audio_prompt_path_input:
|
90 |
+
generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
|
91 |
+
|
92 |
wav = current_model.generate(
|
93 |
text_input[:300], # Truncate text to max chars
|
94 |
+
**generate_kwargs
|
|
|
|
|
|
|
95 |
)
|
96 |
print("Audio generation complete.")
|
97 |
return (current_model.sr, wav.squeeze(0).numpy())
|