ramimu committed on
Commit
deb04b6
·
1 Parent(s): 652c965

Changes to be committed:

Browse files
Files changed (1) hide show
  1. app.py +89 -243
app.py CHANGED
@@ -1,53 +1,46 @@
1
  import gradio as gr
2
  import os
3
- import traceback # For detailed error logging
4
  import torch
5
  from huggingface_hub import hf_hub_download
6
  import shutil
7
  import spaces
8
 
9
- # Import configuration
10
  try:
11
  from config import MODEL_REPO_ID, MODEL_FILES, LOCAL_MODEL_PATH
12
  except ImportError:
13
- # Fallback configuration if config.py is not found
14
  MODEL_REPO_ID = "ramimu/chatterbox-voice-cloning-model"
15
  LOCAL_MODEL_PATH = "./chatterbox_model_files"
16
  MODEL_FILES = ["s3gen.pt", "t3_cfg.pt", "ve.pt", "tokenizer.json"]
17
 
18
- # Try importing chatterbox with better error handling
19
  try:
20
  from chatterbox.tts import ChatterboxTTS
21
  chatterbox_available = True
22
  print("Chatterbox TTS imported successfully")
23
-
24
- # Inspect the ChatterboxTTS class to understand its API
25
  import inspect
26
  print(f"ChatterboxTTS methods: {[method for method in dir(ChatterboxTTS) if not method.startswith('_')]}")
27
-
28
- # Check constructor signature
29
  try:
30
  sig = inspect.signature(ChatterboxTTS.__init__)
31
  print(f"ChatterboxTTS.__init__ signature: {sig}")
32
  except:
33
  pass
34
-
35
- # Check from_local signature if it exists
36
  if hasattr(ChatterboxTTS, 'from_local'):
37
  try:
38
  sig = inspect.signature(ChatterboxTTS.from_local)
39
  print(f"ChatterboxTTS.from_local signature: {sig}")
40
  except:
41
  pass
42
-
43
- # Check from_pretrained signature if it exists
44
  if hasattr(ChatterboxTTS, 'from_pretrained'):
45
  try:
46
  sig = inspect.signature(ChatterboxTTS.from_pretrained)
47
  print(f"ChatterboxTTS.from_pretrained signature: {sig}")
48
  except:
49
  pass
50
-
51
  except ImportError as e:
52
  print(f"Failed to import ChatterboxTTS: {e}")
53
  print("Trying alternative import...")
@@ -60,16 +53,11 @@ except ImportError as e:
60
  print(f"Alternative import also failed: {e2}")
61
  chatterbox_available = False
62
 
63
- # --- Global Model Variable ---
64
  model = None
65
 
66
  def download_model_files():
67
- """Download model files from Hugging Face Hub if they don't exist locally"""
68
  print(f"Checking for model files in {LOCAL_MODEL_PATH}...")
69
-
70
- # Create model directory if it doesn't exist
71
  os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)
72
-
73
  for filename in MODEL_FILES:
74
  local_path = os.path.join(LOCAL_MODEL_PATH, filename)
75
  if not os.path.exists(local_path):
@@ -79,9 +67,8 @@ def download_model_files():
79
  repo_id=MODEL_REPO_ID,
80
  filename=filename,
81
  cache_dir="./cache",
82
- force_download=False # Use cache if available
83
  )
84
- # Copy to our local model path
85
  shutil.copy2(downloaded_path, local_path)
86
  print(f"✓ Downloaded and copied {filename}")
87
  except Exception as e:
@@ -89,10 +76,8 @@ def download_model_files():
89
  raise e
90
  else:
91
  print(f"✓ {filename} already exists locally")
92
-
93
  print("All model files are ready!")
94
 
95
- # --- Load the Model ---
96
  if chatterbox_available:
97
  print("Downloading model files from Hugging Face Hub...")
98
  try:
@@ -100,7 +85,7 @@ if chatterbox_available:
100
  except Exception as e:
101
  print(f"ERROR: Failed to download model files: {e}")
102
  print("Model loading will fail without these files.")
103
-
104
  print(f"Attempting to load Chatterbox model from local directory: {LOCAL_MODEL_PATH}")
105
  if not os.path.exists(LOCAL_MODEL_PATH):
106
  print(f"ERROR: Local model directory not found at {LOCAL_MODEL_PATH}")
@@ -108,83 +93,62 @@ if chatterbox_available:
108
  else:
109
  print(f"Contents of {LOCAL_MODEL_PATH}: {os.listdir(LOCAL_MODEL_PATH)}")
110
  try:
111
- # Load the model from the specified local directory
112
- # Set device to CPU or CUDA if available
113
  device = "cuda" if torch.cuda.is_available() else "cpu"
114
  print(f"Using device: {device}")
115
-
116
- # Based on API inspection:
117
- # ChatterboxTTS.from_local signature: (ckpt_dir, device) -> 'ChatterboxTTS'
118
- # ChatterboxTTS.from_pretrained signature: (device) -> 'ChatterboxTTS'
119
-
120
  try:
121
- # Method 1: Use from_local with correct signature (ckpt_dir, device)
122
  model = ChatterboxTTS.from_local(LOCAL_MODEL_PATH, device)
123
  print("Chatterbox model loaded successfully using from_local method.")
124
  except Exception as e1:
125
  print(f"from_local attempt failed: {e1}")
126
  try:
127
- # Method 2: Use from_pretrained with device only
128
  model = ChatterboxTTS.from_pretrained(device)
129
  print("Chatterbox model loaded successfully with from_pretrained.")
130
  except Exception as e2:
131
  print(f"from_pretrained failed: {e2}")
132
  try:
133
- # Method 3: Manual loading with correct constructor signature
134
- # ChatterboxTTS.__init__ signature: (self, t3, s3gen, ve, tokenizer, device, conds=None)
135
  import pathlib
136
  import json
137
-
138
  model_path = pathlib.Path(LOCAL_MODEL_PATH)
139
-
140
  print(f"Manual loading with correct constructor signature...")
141
-
142
- # Load all components
143
  s3gen_path = model_path / "s3gen.pt"
144
  ve_path = model_path / "ve.pt"
145
  tokenizer_path = model_path / "tokenizer.json"
146
  t3_cfg_path = model_path / "t3_cfg.pt"
147
-
148
  print(f" Loading s3gen from: {s3gen_path}")
149
  s3gen = torch.load(s3gen_path, map_location=torch.device('cpu'))
150
-
151
  print(f" Loading ve from: {ve_path}")
152
  ve = torch.load(ve_path, map_location=torch.device('cpu'))
153
-
154
  print(f" Loading t3_cfg from: {t3_cfg_path}")
155
  t3_cfg = torch.load(t3_cfg_path, map_location=torch.device('cpu'))
156
-
157
  print(f" Loading tokenizer from: {tokenizer_path}")
158
  with open(tokenizer_path, 'r') as f:
159
  tokenizer_data = json.load(f)
160
-
161
- # The tokenizer might need to be instantiated as a proper object
162
- # Let's try to use the ChatterboxTTS internal tokenizer class
163
  try:
164
  from chatterbox.models.tokenizers.tokenizer import EnTokenizer
165
  tokenizer = EnTokenizer.from_dict(tokenizer_data)
166
  print(" Created EnTokenizer from JSON data")
167
  except Exception as tok_error:
168
  print(f" Could not create EnTokenizer: {tok_error}")
169
- tokenizer = tokenizer_data # Use raw data as fallback
170
-
171
  print(" Creating ChatterboxTTS instance with correct signature...")
172
-
173
- # Constructor signature: (self, t3, s3gen, ve, tokenizer, device, conds=None)
174
  model = ChatterboxTTS(
175
  t3=t3_cfg,
176
- s3gen=s3gen,
177
  ve=ve,
178
  tokenizer=tokenizer,
179
  device=device
180
  )
181
  print("Chatterbox model loaded successfully with manual constructor.")
182
-
183
  except Exception as e3:
184
  print(f"Manual loading failed: {e3}")
185
  print(f"Detailed error: {str(e3)}")
186
-
187
- # Last resort: try with different parameter orders
188
  try:
189
  print("Trying alternative parameter order...")
190
  model = ChatterboxTTS(
@@ -194,12 +158,12 @@ if chatterbox_available:
194
  except Exception as e4:
195
  print(f"Alternative parameter order failed: {e4}")
196
  raise e3
197
-
198
  except Exception as e:
199
  print(f"ERROR: Failed to load Chatterbox model from local directory: {e}")
200
  print("Detailed error trace:")
201
- traceback.print_exc() # Prints the full traceback to the Hugging Face Space logs
202
- model = None # Ensure model is None if loading fails
203
  else:
204
  print("ERROR: Chatterbox TTS library not available")
205
 
@@ -223,116 +187,86 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
223
  print(f" Random Seed: {random_seed}")
224
  print(f" Temperature: {temperature}")
225
 
226
- # Set random seed if specified
227
  if random_seed > 0:
228
  import torch
229
  torch.manual_seed(random_seed)
230
  if torch.cuda.is_available():
231
  torch.cuda.manual_seed(random_seed)
232
 
233
- # Use the correct ChatterboxTTS generate method signature with advanced parameters
234
  output_wav_data = model.generate(
235
  text=text_to_speak,
236
  audio_prompt_path=reference_audio_path,
237
- exaggeration=exaggeration, # Controls how much the voice characteristics are emphasized
238
- cfg_weight=cfg_pace, # Classifier-free guidance weight (pace)
239
- temperature=temperature # Controls randomness in generation
240
  )
241
 
242
- # Get the sample rate from the model
243
  try:
244
- sample_rate = model.sr # ChatterboxTTS uses 'sr' attribute
245
  except:
246
- sample_rate = 24000 # Default fallback
247
 
248
  print(f"Audio generated successfully. Output data type: {type(output_wav_data)}, Sample rate: {sample_rate}")
249
-
250
- # Handle different output formats
251
  if isinstance(output_wav_data, str):
252
- # If it's a file path, return the path
253
  return output_wav_data, "Success: Audio generated successfully!"
254
  else:
255
- # If it's numpy array or tensor, return with sample rate
256
  import numpy as np
257
  if hasattr(output_wav_data, 'cpu'):
258
- # Convert tensor to numpy if needed
259
  output_wav_data = output_wav_data.cpu().numpy()
260
-
261
- # Ensure it's the right shape for Gradio (1D array)
262
  if output_wav_data.ndim > 1:
263
  output_wav_data = output_wav_data.squeeze()
264
-
265
  return (sample_rate, output_wav_data), "Success: Audio generated successfully!"
266
 
267
  except Exception as e:
268
  print(f"ERROR: Failed during audio generation: {e}")
269
  print("Detailed error trace for audio generation:")
270
- traceback.print_exc() # Prints the full traceback
271
  return None, f"Error during audio generation: {str(e)}. Check logs for more details."
272
 
273
- # --- API Endpoint Function ---
274
  def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
275
- """
276
- API version of clone_voice that accepts URL or base64 audio data
277
- """
278
  import requests
279
  import tempfile
280
  import os
281
  import base64
282
-
283
- # Handle different audio input formats
284
  temp_audio_path = None
285
  try:
286
  if reference_audio_url.startswith('data:audio'):
287
- # Handle base64 encoded audio
288
  header, encoded = reference_audio_url.split(',', 1)
289
  audio_data = base64.b64decode(encoded)
290
-
291
- # Determine file extension from MIME type
292
  if 'mp3' in header:
293
  ext = '.mp3'
294
  elif 'wav' in header:
295
  ext = '.wav'
296
  else:
297
- ext = '.wav' # Default
298
-
299
- # Save to temporary file
300
  with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
301
  temp_file.write(audio_data)
302
  temp_audio_path = temp_file.name
303
-
304
  elif reference_audio_url.startswith('http'):
305
- # Download audio from URL
306
  response = requests.get(reference_audio_url)
307
  response.raise_for_status()
308
-
309
- # Determine extension from URL or content type
310
  if reference_audio_url.endswith('.mp3'):
311
  ext = '.mp3'
312
  elif reference_audio_url.endswith('.wav'):
313
  ext = '.wav'
314
  else:
315
- ext = '.wav' # Default
316
-
317
  with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
318
  temp_file.write(response.content)
319
  temp_audio_path = temp_file.name
320
  else:
321
- # Assume it's a local file path
322
  temp_audio_path = reference_audio_url
323
-
324
- # Call the main clone_voice function
325
  audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
326
-
327
- # Clean up temporary file if we created one
328
  if temp_audio_path and temp_audio_path != reference_audio_url:
329
  try:
330
  os.unlink(temp_audio_path)
331
  except:
332
  pass
333
-
334
  return audio_output, status
335
-
336
  except Exception as e:
337
  if temp_audio_path and temp_audio_path != reference_audio_url:
338
  try:
@@ -341,160 +275,72 @@ def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pa
341
  pass
342
  return None, f"API Error: {str(e)}"
343
 
344
- # --- Define Gradio Interface ---
345
- # --- Define Gradio Interface ---
346
- with gr.Blocks(title="Advanced Chatterbox Voice Cloning", theme=gr.themes.Soft()) as iface:
347
- gr.Markdown("# 🎙️ Advanced Chatterbox Voice Cloning")
348
- gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
349
-
350
- with gr.Row():
351
- with gr.Column(scale=2):
352
- # Main inputs
353
- text_input = gr.Textbox(
354
  label="Text to Speak",
355
  placeholder="Enter the text you want the cloned voice to say...",
356
  lines=3
357
- )
358
- audio_input = gr.Audio(
359
  type="filepath",
360
  label="Reference Audio (Upload a short .wav or .mp3 clip)",
361
  sources=["upload", "microphone"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  )
363
-
364
- # Advanced controls in an accordion
365
- with gr.Accordion("🔧 Advanced Settings", open=False):
366
- with gr.Row():
367
- exaggeration = gr.Slider(
368
- minimum=0.25,
369
- maximum=1.0,
370
- value=0.6,
371
- step=0.05,
372
- label="Exaggeration",
373
- info="Controls voice characteristic emphasis (0.5 = neutral, higher = more exaggerated)"
374
- )
375
- cfg_pace = gr.Slider(
376
- minimum=0.2,
377
- maximum=1.0,
378
- value=0.3,
379
- step=0.05,
380
- label="CFG/Pace",
381
- info="Classifier-free guidance weight (affects generation quality and pace)"
382
- )
383
-
384
- with gr.Row():
385
- random_seed = gr.Number(
386
- value=0,
387
- label="Random Seed",
388
- info="Set to 0 for random results, or use a specific number for reproducible outputs",
389
- precision=0
390
- )
391
- temperature = gr.Slider(
392
- minimum=0.05,
393
- maximum=2.0,
394
- value=0.6,
395
- step=0.05,
396
- label="Temperature",
397
- info="Controls randomness in generation (lower = more consistent, higher = more varied)"
398
- )
399
-
400
- # Generate button
401
- generate_btn = gr.Button("🎵 Generate Voice Clone", variant="primary", size="lg")
402
-
403
- with gr.Column(scale=1):
404
- # Outputs
405
- audio_output = gr.Audio(
406
- label="Generated Audio",
407
- type="numpy",
408
- interactive=False
409
- )
410
- status_output = gr.Textbox(
411
- label="Status",
412
- interactive=False,
413
- lines=2
414
- )
415
-
416
- # API Information
417
- with gr.Accordion("🔌 API Usage", open=False):
418
- gr.Markdown("""
419
- ### Using this as an API endpoint
420
-
421
- You can use this Hugging Face Space as an API endpoint in your applications:
422
-
423
- **Endpoint URL:** `https://your-username-voice-cloning.hf.space/api/predict`
424
-
425
- **Example Python code:**
426
- ```python
427
- import requests
428
- import base64
429
-
430
- # Encode your audio file
431
- with open("reference_audio.wav", "rb") as f:
432
- audio_data = base64.b64encode(f.read()).decode()
433
- audio_url = f"data:audio/wav;base64,{audio_data}"
434
-
435
- # API request
436
- response = requests.post(
437
- "https://your-username-voice-cloning.hf.space/api/predict",
438
- json={
439
- "data": [
440
- "Hello, this is my cloned voice!", # text
441
- audio_url, # reference audio (base64 or URL)
442
- 0.6, # exaggeration
443
- 0.3, # cfg_pace
444
- 0, # random_seed
445
- 0.6 # temperature
446
- ]
447
- }
448
- )
449
- ```
450
-
451
- **Parameters:**
452
- - `text_to_speak`: Text to synthesize
453
- - `reference_audio`: Base64 encoded audio or URL
454
- - `exaggeration`: Voice emphasis (0.25-1.0, default: 0.6)
455
- - `cfg_pace`: Generation guidance (0.2-1.0, default: 0.3)
456
- - `random_seed`: Reproducibility seed (0 for random, default: 0)
457
- - `temperature`: Generation randomness (0.05-2.0, default: 0.6)
458
- """)
459
-
460
- # Examples
461
- with gr.Accordion("📝 Examples", open=False):
462
- gr.Examples(
463
- examples=[
464
- ["Hello, this is a test of the voice cloning system.", None, 0.5, 0.5, 0, 0.8],
465
- ["The quick brown fox jumps over the lazy dog.", None, 0.7, 0.3, 42, 0.6],
466
- ["Welcome to our AI voice cloning service. We hope you enjoy the experience!", None, 0.4, 0.7, 123, 1.0]
467
- ],
468
- inputs=[text_input, audio_input, exaggeration, cfg_pace, random_seed, temperature],
469
- outputs=[audio_output, status_output],
470
- fn=clone_voice,
471
- cache_examples=False
472
- )
473
-
474
- # Connect the generate button
475
- generate_btn.click(
476
- fn=clone_voice,
477
- inputs=[text_input, audio_input, exaggeration, cfg_pace, random_seed, temperature],
478
- outputs=[audio_output, status_output],
479
- api_name="clone_voice" # This enables API access
480
  )
481
-
482
- # --- Launch the Gradio App ---
483
- def main():
484
- print("Starting Advanced Gradio interface...")
485
- # Launch with specific configuration for API access and avoid manifest issues
486
  iface.launch(
487
- server_name="0.0.0.0", # Allow external connections
488
- server_port=7860, # Explicit port
489
- show_error=True, # Show detailed errors
490
- quiet=False, # Show startup logs
491
- favicon_path=None, # Disable favicon to avoid 404
492
- share=False, # Set to True if you want a public link
493
- auth=None, # Add authentication if needed: ("username", "password")
494
- app_kwargs={
495
- "docs_url": "/docs", # Enable API docs at /docs
496
- "redoc_url": "/redoc" # Enable alternative docs at /redoc
497
- }
498
  )
499
 
500
  if __name__ == "__main__":
 
1
  import gradio as gr
2
  import os
3
+ import traceback
4
  import torch
5
  from huggingface_hub import hf_hub_download
6
  import shutil
7
  import spaces
8
 
 
9
  try:
10
  from config import MODEL_REPO_ID, MODEL_FILES, LOCAL_MODEL_PATH
11
  except ImportError:
 
12
  MODEL_REPO_ID = "ramimu/chatterbox-voice-cloning-model"
13
  LOCAL_MODEL_PATH = "./chatterbox_model_files"
14
  MODEL_FILES = ["s3gen.pt", "t3_cfg.pt", "ve.pt", "tokenizer.json"]
15
 
 
16
  try:
17
  from chatterbox.tts import ChatterboxTTS
18
  chatterbox_available = True
19
  print("Chatterbox TTS imported successfully")
20
+
 
21
  import inspect
22
  print(f"ChatterboxTTS methods: {[method for method in dir(ChatterboxTTS) if not method.startswith('_')]}")
23
+
 
24
  try:
25
  sig = inspect.signature(ChatterboxTTS.__init__)
26
  print(f"ChatterboxTTS.__init__ signature: {sig}")
27
  except:
28
  pass
29
+
 
30
  if hasattr(ChatterboxTTS, 'from_local'):
31
  try:
32
  sig = inspect.signature(ChatterboxTTS.from_local)
33
  print(f"ChatterboxTTS.from_local signature: {sig}")
34
  except:
35
  pass
36
+
 
37
  if hasattr(ChatterboxTTS, 'from_pretrained'):
38
  try:
39
  sig = inspect.signature(ChatterboxTTS.from_pretrained)
40
  print(f"ChatterboxTTS.from_pretrained signature: {sig}")
41
  except:
42
  pass
43
+
44
  except ImportError as e:
45
  print(f"Failed to import ChatterboxTTS: {e}")
46
  print("Trying alternative import...")
 
53
  print(f"Alternative import also failed: {e2}")
54
  chatterbox_available = False
55
 
 
56
  model = None
57
 
58
  def download_model_files():
 
59
  print(f"Checking for model files in {LOCAL_MODEL_PATH}...")
 
 
60
  os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)
 
61
  for filename in MODEL_FILES:
62
  local_path = os.path.join(LOCAL_MODEL_PATH, filename)
63
  if not os.path.exists(local_path):
 
67
  repo_id=MODEL_REPO_ID,
68
  filename=filename,
69
  cache_dir="./cache",
70
+ force_download=False
71
  )
 
72
  shutil.copy2(downloaded_path, local_path)
73
  print(f"✓ Downloaded and copied {filename}")
74
  except Exception as e:
 
76
  raise e
77
  else:
78
  print(f"✓ {filename} already exists locally")
 
79
  print("All model files are ready!")
80
 
 
81
  if chatterbox_available:
82
  print("Downloading model files from Hugging Face Hub...")
83
  try:
 
85
  except Exception as e:
86
  print(f"ERROR: Failed to download model files: {e}")
87
  print("Model loading will fail without these files.")
88
+
89
  print(f"Attempting to load Chatterbox model from local directory: {LOCAL_MODEL_PATH}")
90
  if not os.path.exists(LOCAL_MODEL_PATH):
91
  print(f"ERROR: Local model directory not found at {LOCAL_MODEL_PATH}")
 
93
  else:
94
  print(f"Contents of {LOCAL_MODEL_PATH}: {os.listdir(LOCAL_MODEL_PATH)}")
95
  try:
 
 
96
  device = "cuda" if torch.cuda.is_available() else "cpu"
97
  print(f"Using device: {device}")
98
+
 
 
 
 
99
  try:
 
100
  model = ChatterboxTTS.from_local(LOCAL_MODEL_PATH, device)
101
  print("Chatterbox model loaded successfully using from_local method.")
102
  except Exception as e1:
103
  print(f"from_local attempt failed: {e1}")
104
  try:
 
105
  model = ChatterboxTTS.from_pretrained(device)
106
  print("Chatterbox model loaded successfully with from_pretrained.")
107
  except Exception as e2:
108
  print(f"from_pretrained failed: {e2}")
109
  try:
 
 
110
  import pathlib
111
  import json
112
+
113
  model_path = pathlib.Path(LOCAL_MODEL_PATH)
 
114
  print(f"Manual loading with correct constructor signature...")
115
+
 
116
  s3gen_path = model_path / "s3gen.pt"
117
  ve_path = model_path / "ve.pt"
118
  tokenizer_path = model_path / "tokenizer.json"
119
  t3_cfg_path = model_path / "t3_cfg.pt"
120
+
121
  print(f" Loading s3gen from: {s3gen_path}")
122
  s3gen = torch.load(s3gen_path, map_location=torch.device('cpu'))
 
123
  print(f" Loading ve from: {ve_path}")
124
  ve = torch.load(ve_path, map_location=torch.device('cpu'))
 
125
  print(f" Loading t3_cfg from: {t3_cfg_path}")
126
  t3_cfg = torch.load(t3_cfg_path, map_location=torch.device('cpu'))
 
127
  print(f" Loading tokenizer from: {tokenizer_path}")
128
  with open(tokenizer_path, 'r') as f:
129
  tokenizer_data = json.load(f)
130
+
 
 
131
  try:
132
  from chatterbox.models.tokenizers.tokenizer import EnTokenizer
133
  tokenizer = EnTokenizer.from_dict(tokenizer_data)
134
  print(" Created EnTokenizer from JSON data")
135
  except Exception as tok_error:
136
  print(f" Could not create EnTokenizer: {tok_error}")
137
+ tokenizer = tokenizer_data
138
+
139
  print(" Creating ChatterboxTTS instance with correct signature...")
 
 
140
  model = ChatterboxTTS(
141
  t3=t3_cfg,
142
+ s3gen=s3gen,
143
  ve=ve,
144
  tokenizer=tokenizer,
145
  device=device
146
  )
147
  print("Chatterbox model loaded successfully with manual constructor.")
148
+
149
  except Exception as e3:
150
  print(f"Manual loading failed: {e3}")
151
  print(f"Detailed error: {str(e3)}")
 
 
152
  try:
153
  print("Trying alternative parameter order...")
154
  model = ChatterboxTTS(
 
158
  except Exception as e4:
159
  print(f"Alternative parameter order failed: {e4}")
160
  raise e3
161
+
162
  except Exception as e:
163
  print(f"ERROR: Failed to load Chatterbox model from local directory: {e}")
164
  print("Detailed error trace:")
165
+ traceback.print_exc()
166
+ model = None
167
  else:
168
  print("ERROR: Chatterbox TTS library not available")
169
 
 
187
  print(f" Random Seed: {random_seed}")
188
  print(f" Temperature: {temperature}")
189
 
 
190
  if random_seed > 0:
191
  import torch
192
  torch.manual_seed(random_seed)
193
  if torch.cuda.is_available():
194
  torch.cuda.manual_seed(random_seed)
195
 
 
196
  output_wav_data = model.generate(
197
  text=text_to_speak,
198
  audio_prompt_path=reference_audio_path,
199
+ exaggeration=exaggeration,
200
+ cfg_weight=cfg_pace,
201
+ temperature=temperature
202
  )
203
 
 
204
  try:
205
+ sample_rate = model.sr
206
  except:
207
+ sample_rate = 24000
208
 
209
  print(f"Audio generated successfully. Output data type: {type(output_wav_data)}, Sample rate: {sample_rate}")
210
+
 
211
  if isinstance(output_wav_data, str):
 
212
  return output_wav_data, "Success: Audio generated successfully!"
213
  else:
 
214
  import numpy as np
215
  if hasattr(output_wav_data, 'cpu'):
 
216
  output_wav_data = output_wav_data.cpu().numpy()
 
 
217
  if output_wav_data.ndim > 1:
218
  output_wav_data = output_wav_data.squeeze()
 
219
  return (sample_rate, output_wav_data), "Success: Audio generated successfully!"
220
 
221
  except Exception as e:
222
  print(f"ERROR: Failed during audio generation: {e}")
223
  print("Detailed error trace for audio generation:")
224
+ traceback.print_exc()
225
  return None, f"Error during audio generation: {str(e)}. Check logs for more details."
226
 
 
227
  def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
 
 
 
228
  import requests
229
  import tempfile
230
  import os
231
  import base64
232
+
 
233
  temp_audio_path = None
234
  try:
235
  if reference_audio_url.startswith('data:audio'):
 
236
  header, encoded = reference_audio_url.split(',', 1)
237
  audio_data = base64.b64decode(encoded)
 
 
238
  if 'mp3' in header:
239
  ext = '.mp3'
240
  elif 'wav' in header:
241
  ext = '.wav'
242
  else:
243
+ ext = '.wav'
 
 
244
  with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
245
  temp_file.write(audio_data)
246
  temp_audio_path = temp_file.name
 
247
  elif reference_audio_url.startswith('http'):
 
248
  response = requests.get(reference_audio_url)
249
  response.raise_for_status()
 
 
250
  if reference_audio_url.endswith('.mp3'):
251
  ext = '.mp3'
252
  elif reference_audio_url.endswith('.wav'):
253
  ext = '.wav'
254
  else:
255
+ ext = '.wav'
 
256
  with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
257
  temp_file.write(response.content)
258
  temp_audio_path = temp_file.name
259
  else:
 
260
  temp_audio_path = reference_audio_url
261
+
 
262
  audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
263
+
 
264
  if temp_audio_path and temp_audio_path != reference_audio_url:
265
  try:
266
  os.unlink(temp_audio_path)
267
  except:
268
  pass
 
269
  return audio_output, status
 
270
  except Exception as e:
271
  if temp_audio_path and temp_audio_path != reference_audio_url:
272
  try:
 
275
  pass
276
  return None, f"API Error: {str(e)}"
277
 
278
+ def main():
279
+ print("Starting Advanced Gradio interface...")
280
+ iface = gr.Interface(
281
+ fn=clone_voice_api,
282
+ inputs=[
283
+ gr.Textbox(
 
 
 
 
284
  label="Text to Speak",
285
  placeholder="Enter the text you want the cloned voice to say...",
286
  lines=3
287
+ ),
288
+ gr.Audio(
289
  type="filepath",
290
  label="Reference Audio (Upload a short .wav or .mp3 clip)",
291
  sources=["upload", "microphone"]
292
+ ),
293
+ gr.Slider(
294
+ minimum=0.25,
295
+ maximum=1.0,
296
+ value=0.6,
297
+ step=0.05,
298
+ label="Exaggeration",
299
+ info="Controls voice characteristic emphasis (0.5 = neutral, higher = more exaggerated)"
300
+ ),
301
+ gr.Slider(
302
+ minimum=0.2,
303
+ maximum=1.0,
304
+ value=0.3,
305
+ step=0.05,
306
+ label="CFG/Pace",
307
+ info="Classifier-free guidance weight (affects generation quality and pace)"
308
+ ),
309
+ gr.Number(
310
+ value=0,
311
+ label="Random Seed",
312
+ info="Set to 0 for random results, or use a specific number for reproducible outputs",
313
+ precision=0
314
+ ),
315
+ gr.Slider(
316
+ minimum=0.05,
317
+ maximum=2.0,
318
+ value=0.6,
319
+ step=0.05,
320
+ label="Temperature",
321
+ info="Controls randomness in generation (lower = more consistent, higher = more varied)"
322
  )
323
+ ],
324
+ outputs=[
325
+ gr.Audio(label="Generated Audio", type="numpy"),
326
+ gr.Textbox(label="Status", lines=2)
327
+ ],
328
+ title="🎙️ Advanced Chatterbox Voice Cloning",
329
+ description="Clone any voice using advanced AI technology with fine-tuned controls.",
330
+ examples=[
331
+ ["Hello, this is a test of the voice cloning system.", None, 0.5, 0.5, 0, 0.8],
332
+ ["The quick brown fox jumps over the lazy dog.", None, 0.7, 0.3, 42, 0.6],
333
+ ["Welcome to our AI voice cloning service. We hope you enjoy the experience!", None, 0.4, 0.7, 123, 1.0]
334
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  )
 
 
 
 
 
336
  iface.launch(
337
+ server_name="0.0.0.0",
338
+ server_port=7860,
339
+ show_error=True,
340
+ quiet=False,
341
+ favicon_path=None,
342
+ share=False,
343
+ auth=None
 
 
 
 
344
  )
345
 
346
  if __name__ == "__main__":