Spaces:

ramimu
/

voice_cloning

Running on Zero

App Files Files Community

ramimu committed 13 days ago

Commit

91d6893

verified ·

1 Parent(s): 7be21d2

Update app.py

Browse files

Files changed (1) hide show

app.py +189 -276

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 import os
 import traceback
 import torch
 from huggingface_hub import hf_hub_download
 import shutil
 import spaces
@@ -17,47 +18,109 @@ try:
     from chatterbox.tts import ChatterboxTTS
     chatterbox_available = True
     print("Chatterbox TTS imported successfully")
-    import inspect
-    print(f"ChatterboxTTS methods: {[method for method in dir(ChatterboxTTS) if not method.startswith('_')]}")
     try:
-        sig = inspect.signature(ChatterboxTTS.__init__)
-        print(f"ChatterboxTTS.__init__ signature: {sig}")
-    except:
-        pass
-    if hasattr(ChatterboxTTS, 'from_local'):
         try:
-            sig = inspect.signature(ChatterboxTTS.from_local)
-            print(f"ChatterboxTTS.from_local signature: {sig}")
-        except:
-            pass
-    if hasattr(ChatterboxTTS, 'from_pretrained'):
-        try:
-            sig = inspect.signature(ChatterboxTTS.from_pretrained)
-            print(f"ChatterboxTTS.from_pretrained signature: {sig}")
-        except:
-            pass
-except ImportError as e:
-    print(f"Failed to import ChatterboxTTS: {e}")
-    print("Trying alternative import...")
     try:
-        import chatterbox
-        from chatterbox import ChatterboxTTS
-        chatterbox_available = True
-        print("Chatterbox TTS imported with alternative method")
-    except ImportError as e2:
-        print(f"Alternative import also failed: {e2}")
-        chatterbox_available = False
-model = None
 def download_model_files():
     print(f"Checking for model files in {LOCAL_MODEL_PATH}...")
     os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)
     for filename in MODEL_FILES:
         local_path = os.path.join(LOCAL_MODEL_PATH, filename)
         if not os.path.exists(local_path):
@@ -78,97 +141,19 @@ def download_model_files():
             print(f"✓ {filename} already exists locally")
     print("All model files are ready!")
 if chatterbox_available:
-    print("Downloading model files from Hugging Face Hub...")
     try:
         download_model_files()
     except Exception as e:
-        print(f"ERROR: Failed to download model files: {e}")
-        print("Model loading will fail without these files.")
-    print(f"Attempting to load Chatterbox model from local directory: {LOCAL_MODEL_PATH}")
-    if not os.path.exists(LOCAL_MODEL_PATH):
-        print(f"ERROR: Local model directory not found at {LOCAL_MODEL_PATH}")
-        print("Please ensure the model files were downloaded successfully.")
-    else:
-        print(f"Contents of {LOCAL_MODEL_PATH}: {os.listdir(LOCAL_MODEL_PATH)}")
-        try:
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            print(f"Using device: {device}")
-            try:
-                model = ChatterboxTTS.from_local(LOCAL_MODEL_PATH, device)
-                print("Chatterbox model loaded successfully using from_local method.")
-            except Exception as e1:
-                print(f"from_local attempt failed: {e1}")
-                try:
-                    model = ChatterboxTTS.from_pretrained(device)
-                    print("Chatterbox model loaded successfully with from_pretrained.")
-                except Exception as e2:
-                    print(f"from_pretrained failed: {e2}")
-                    try:
-                        import pathlib
-                        import json
-                        model_path = pathlib.Path(LOCAL_MODEL_PATH)
-                        print(f"Manual loading with correct constructor signature...")
-                        s3gen_path = model_path / "s3gen.pt"
-                        ve_path = model_path / "ve.pt"
-                        tokenizer_path = model_path / "tokenizer.json"
-                        t3_cfg_path = model_path / "t3_cfg.pt"
-                        print(f"  Loading s3gen from: {s3gen_path}")
-                        s3gen = torch.load(s3gen_path, map_location=torch.device('cpu'))
-                        print(f"  Loading ve from: {ve_path}")
-                        ve = torch.load(ve_path, map_location=torch.device('cpu'))
-                        print(f"  Loading t3_cfg from: {t3_cfg_path}")
-                        t3_cfg = torch.load(t3_cfg_path, map_location=torch.device('cpu'))
-                        print(f"  Loading tokenizer from: {tokenizer_path}")
-                        with open(tokenizer_path, 'r') as f:
-                            tokenizer_data = json.load(f)
-                        try:
-                            from chatterbox.models.tokenizers.tokenizer import EnTokenizer
-                            tokenizer = EnTokenizer.from_dict(tokenizer_data)
-                            print("  Created EnTokenizer from JSON data")
-                        except Exception as tok_error:
-                            print(f"  Could not create EnTokenizer: {tok_error}")
-                            tokenizer = tokenizer_data
-                        print("  Creating ChatterboxTTS instance with correct signature...")
-                        model = ChatterboxTTS(
-                            t3=t3_cfg,
-                            s3gen=s3gen,
-                            ve=ve,
-                            tokenizer=tokenizer,
-                            device=device
-                        )
-                        print("Chatterbox model loaded successfully with manual constructor.")
-                    except Exception as e3:
-                        print(f"Manual loading failed: {e3}")
-                        print(f"Detailed error: {str(e3)}")
-                        try:
-                            print("Trying alternative parameter order...")
-                            model = ChatterboxTTS(
-                                s3gen, ve, tokenizer, t3_cfg, device
-                            )
-                            print("Chatterbox model loaded with alternative parameter order.")
-                        except Exception as e4:
-                            print(f"Alternative parameter order failed: {e4}")
-                            raise e3
-        except Exception as e:
-            print(f"ERROR: Failed to load Chatterbox model from local directory: {e}")
-            print("Detailed error trace:")
-            traceback.print_exc()
-            model = None
-else:
-    print("ERROR: Chatterbox TTS library not available")
 @spaces.GPU
 def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
     if not chatterbox_available:
         return None, "Error: Chatterbox TTS library not available. Please check installation."
     if model is None:
@@ -179,52 +164,99 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
         return None, "Error: Please upload a reference audio file (.wav or .mp3)."
     try:
-        print(f"Received request:")
-        print(f"  Text: '{text_to_speak}'")
         print(f"  Audio: '{reference_audio_path}'")
-        print(f"  Exaggeration: {exaggeration}")
-        print(f"  CFG/Pace: {cfg_pace}")
-        print(f"  Random Seed: {random_seed}")
-        print(f"  Temperature: {temperature}")
         if random_seed > 0:
-            import torch
             torch.manual_seed(random_seed)
             if torch.cuda.is_available():
                 torch.cuda.manual_seed(random_seed)
-        output_wav_data = model.generate(
-            text=text_to_speak,
-            audio_prompt_path=reference_audio_path,
-            exaggeration=exaggeration,
-            cfg_weight=cfg_pace,
-            temperature=temperature
-        )
         try:
             sample_rate = model.sr
         except:
             sample_rate = 24000
-        print(f"Audio generated successfully. Output data type: {type(output_wav_data)}, Sample rate: {sample_rate}")
         if isinstance(output_wav_data, str):
-            return output_wav_data, "Success: Audio generated successfully!"
         else:
             import numpy as np
             if hasattr(output_wav_data, 'cpu'):
                 output_wav_data = output_wav_data.cpu().numpy()
             if output_wav_data.ndim > 1:
                 output_wav_data = output_wav_data.squeeze()
-            return (sample_rate, output_wav_data), "Success: Audio generated successfully!"
     except Exception as e:
-        print(f"ERROR: Failed during audio generation: {e}")
-        print("Detailed error trace for audio generation:")
         traceback.print_exc()
-        return None, f"Error during audio generation: {str(e)}. Check logs for more details."
 def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
     import requests
     import tempfile
     import os
@@ -232,164 +264,45 @@ def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pa
     temp_audio_path = None
     try:
         if reference_audio_url.startswith('data:audio'):
             header, encoded = reference_audio_url.split(',', 1)
             audio_data = base64.b64decode(encoded)
-            if 'mp3' in header:
-                ext = '.mp3'
-            elif 'wav' in header:
-                ext = '.wav'
-            else:
-                ext = '.wav'
             with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
                 temp_file.write(audio_data)
                 temp_audio_path = temp_file.name
         elif reference_audio_url.startswith('http'):
-            response = requests.get(reference_audio_url)
             response.raise_for_status()
-            if reference_audio_url.endswith('.mp3'):
-                ext = '.mp3'
-            elif reference_audio_url.endswith('.wav'):
-                ext = '.wav'
-            else:
-                ext = '.wav'
             with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
                 temp_file.write(response.content)
                 temp_audio_path = temp_file.name
         else:
             temp_audio_path = reference_audio_url
         audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
-        if temp_audio_path and temp_audio_path != reference_audio_url:
-            try:
-                os.unlink(temp_audio_path)
-            except:
-                pass
         return audio_output, status
     except Exception as e:
         if temp_audio_path and temp_audio_path != reference_audio_url:
             try:
                 os.unlink(temp_audio_path)
             except:
                 pass
-        return None, f"API Error: {str(e)}"
 def main():
     print("Starting Advanced Gradio interface...")
-    # Create a Blocks interface with multiple functions
-    with gr.Blocks(title="🎙️ Advanced Chatterbox Voice Cloning") as demo:
-        gr.Markdown("# 🎙️ Advanced Chatterbox Voice Cloning")
-        gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
-        with gr.Row():
-            with gr.Column(scale=2):
-                # Main interface inputs
-                text_input = gr.Textbox(
-                    label="Text to Speak",
-                    placeholder="Enter the text you want the cloned voice to say...",
-                    lines=3
-                )
-                audio_input = gr.Audio(
-                    type="filepath",
-                    label="Reference Audio (Upload a short .wav or .mp3 clip)",
-                    sources=["upload", "microphone"]
-                )
-                with gr.Accordion("🔧 Advanced Settings", open=False):
-                    with gr.Row():
-                        exaggeration_input = gr.Slider(
-                            minimum=0.25,
-                            maximum=1.0,
-                            value=0.6,
-                            step=0.05,
-                            label="Exaggeration",
-                            info="Controls voice characteristic emphasis"
-                        )
-                        cfg_pace_input = gr.Slider(
-                            minimum=0.2,
-                            maximum=1.0,
-                            value=0.3,
-                            step=0.05,
-                            label="CFG/Pace",
-                            info="Classifier-free guidance weight"
-                        )
-                    with gr.Row():
-                        seed_input = gr.Number(
-                            value=0,
-                            label="Random Seed",
-                            info="Set to 0 for random results",
-                            precision=0
-                        )
-                        temperature_input = gr.Slider(
-                            minimum=0.05,
-                            maximum=2.0,
-                            value=0.6,
-                            step=0.05,
-                            label="Temperature",
-                            info="Controls randomness in generation"
-                        )
-                generate_btn = gr.Button("🎵 Generate Voice Clone", variant="primary", size="lg")
-            with gr.Column(scale=1):
-                # Outputs
-                audio_output = gr.Audio(label="Generated Audio", type="numpy")
-                status_output = gr.Textbox(label="Status", lines=2)
-        with gr.Accordion("📝 Examples", open=False):
-            gr.Examples(
-                examples=[
-                    ["Hello, this is a test of the voice cloning system.", None, 0.5, 0.5, 0, 0.8],
-                    ["The quick brown fox jumps over the lazy dog.", None, 0.7, 0.3, 42, 0.6],
-                    ["Welcome to our AI voice cloning service. We hope you enjoy the experience!", None, 0.4, 0.7, 123, 1.0]
-                ],
-                inputs=[text_input, audio_input, exaggeration_input, cfg_pace_input, seed_input, temperature_input]
-            )
-        # Main interface function (for file uploads)
-        generate_btn.click(
-            fn=clone_voice_api,
-            inputs=[text_input, audio_input, exaggeration_input, cfg_pace_input, seed_input, temperature_input],
-            outputs=[audio_output, status_output],
-            api_name="predict"
-        )
-        # API function for base64 data (for external API calls)
-        def clone_voice_base64_api(text_to_speak, reference_audio_b64, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
-            """API function that accepts base64 audio data directly."""
-            return clone_voice_api(text_to_speak, reference_audio_b64, exaggeration, cfg_pace, random_seed, temperature)
-        # Hidden inputs/outputs for the base64 API
-        with gr.Row(visible=False):
-            api_text_input = gr.Textbox()
-            api_audio_input = gr.Textbox()  # This will receive base64 data URL
-            api_exaggeration_input = gr.Slider(minimum=0.25, maximum=1.0, value=0.6)
-            api_cfg_pace_input = gr.Slider(minimum=0.2, maximum=1.0, value=0.3)
-            api_seed_input = gr.Number(value=0, precision=0)
-            api_temperature_input = gr.Slider(minimum=0.05, maximum=2.0, value=0.6)
-            api_audio_output = gr.Audio(type="numpy")
-            api_status_output = gr.Textbox()
-            api_btn = gr.Button()
-        # API endpoint for base64 data
-        api_btn.click(
-            fn=clone_voice_base64_api,
-            inputs=[api_text_input, api_audio_input, api_exaggeration_input, api_cfg_pace_input, api_seed_input, api_temperature_input],
-            outputs=[api_audio_output, api_status_output],
-            api_name="clone_voice"
-        )
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        show_error=True,
-        quiet=False,
-        favicon_path=None,
-        share=False,
-        auth=None
-    )
 if __name__ == "__main__":
-    main()

 import os
 import traceback
 import torch
+import gc
 from huggingface_hub import hf_hub_download
 import shutil
 import spaces
     from chatterbox.tts import ChatterboxTTS
     chatterbox_available = True
     print("Chatterbox TTS imported successfully")
+except ImportError as e:
+    print(f"Failed to import ChatterboxTTS: {e}")
+    chatterbox_available = False
+model = None
+def cleanup_gpu_memory():
+    """Release unreferenced Python objects and cached CUDA memory.
+
+    Runs the garbage collector first so that tensors kept alive only by
+    reference cycles are freed before the CUDA caching allocator is asked to
+    give its unused blocks back. Safe to call on CPU-only hosts: the CUDA
+    calls are skipped when no GPU is available.
+    """
+    # Collect before emptying the cache: empty_cache() can only release blocks
+    # that no live tensor occupies, so collecting afterwards is too late.
+    gc.collect()
+    if torch.cuda.is_available():
+        # Let queued GPU work finish before releasing cached blocks.
+        torch.cuda.synchronize()
+        torch.cuda.empty_cache()
+def safe_load_model():
+    """Load the Chatterbox model into the module-level ``model`` global.
+
+    Tries ``ChatterboxTTS.from_local`` first, then
+    ``ChatterboxTTS.from_pretrained``, then ``load_model_manually``; the
+    first loader that does not raise wins.
+
+    Returns:
+        True when a loader succeeded; False when the library is unavailable
+        or every loader raised (``model`` is reset to None in that case).
+    """
+    global model
+    if not chatterbox_available:
+        print("ERROR: Chatterbox TTS library not available")
+        return False
     try:
+        # Clean up any existing GPU memory
+        cleanup_gpu_memory()
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Loading model on device: {device}")
+        # Try different loading methods
         try:
+            model = ChatterboxTTS.from_local(LOCAL_MODEL_PATH, device)
+            print("✓ Model loaded successfully using from_local method.")
+        except Exception as e1:
+            print(f"from_local failed: {e1}")
+            try:
+                model = ChatterboxTTS.from_pretrained(device)
+                print("✓ Model loaded successfully with from_pretrained.")
+            except Exception as e2:
+                print(f"from_pretrained failed: {e2}")
+                # Manual loading as fallback
+                # If this raises too, the outer handler below reports the failure.
+                model = load_model_manually(device)
+        # Move model to device and set to eval mode
+        # The hasattr guards skip these calls when the loaded object lacks them.
+        if model and hasattr(model, 'to'):
+            model = model.to(device)
+        if model and hasattr(model, 'eval'):
+            model.eval()
+        # Clean up after loading
+        cleanup_gpu_memory()
+        return True
+    except Exception as e:
+        print(f"ERROR: Failed to load model: {e}")
+        traceback.print_exc()
+        # Leave the global unset so callers that check `model is None` fail cleanly.
+        model = None
+        cleanup_gpu_memory()
+        return False
+def load_model_manually(device):
+    """Build a ChatterboxTTS instance directly from the files in LOCAL_MODEL_PATH.
+
+    Last-resort loader used by safe_load_model when both ``from_local`` and
+    ``from_pretrained`` raise.
+
+    Args:
+        device: Device string forwarded to the ChatterboxTTS constructor.
+
+    Returns:
+        The constructed ChatterboxTTS instance.
+
+    Raises:
+        Exception: Anything raised while reading the files or constructing
+            the model propagates to the caller.
+    """
+    import pathlib
+    import json
+    model_path = pathlib.Path(LOCAL_MODEL_PATH)
+    print("Manual loading with correct constructor signature...")
+    # Load components to CPU first
+    s3gen_path = model_path / "s3gen.pt"
+    ve_path = model_path / "ve.pt"
+    tokenizer_path = model_path / "tokenizer.json"
+    t3_cfg_path = model_path / "t3_cfg.pt"
+    # NOTE(security): flagged, not changed - torch.load may unpickle arbitrary
+    # objects. Confirm the checkpoint source is trusted, or pass
+    # weights_only=True if these files turn out to be plain state dicts.
+    # NOTE(review): the loaded objects are handed to the constructor as-is;
+    # confirm ChatterboxTTS accepts whatever these .pt files contain.
+    s3gen = torch.load(s3gen_path, map_location='cpu')
+    ve = torch.load(ve_path, map_location='cpu')
+    t3_cfg = torch.load(t3_cfg_path, map_location='cpu')
+    # JSON is UTF-8 by specification; do not depend on the locale default.
+    with open(tokenizer_path, 'r', encoding='utf-8') as f:
+        tokenizer_data = json.load(f)
     try:
+        from chatterbox.models.tokenizers.tokenizer import EnTokenizer
+        tokenizer = EnTokenizer.from_dict(tokenizer_data)
+    except Exception as tok_error:
+        # Still a best-effort fallback to the raw JSON data, but log why the
+        # tokenizer object could not be built instead of hiding the error.
+        print(f"  Could not create EnTokenizer: {tok_error}")
+        tokenizer = tokenizer_data
+    # Create model instance
+    model = ChatterboxTTS(
+        t3=t3_cfg,
+        s3gen=s3gen,
+        ve=ve,
+        tokenizer=tokenizer,
+        device=device
+    )
+    print("✓ Model loaded successfully with manual constructor.")
+    return model
 def download_model_files():
+    """Download model files with error handling."""
     print(f"Checking for model files in {LOCAL_MODEL_PATH}...")
     os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)
     for filename in MODEL_FILES:
         local_path = os.path.join(LOCAL_MODEL_PATH, filename)
         if not os.path.exists(local_path):
             print(f"✓ {filename} already exists locally")
     print("All model files are ready!")
+# Initialize model at import time: make sure the model files are present, then
+# load the model into the module-level `model` global. Failures are only
+# printed, so `model` may still be None afterwards.
 if chatterbox_available:
     try:
         download_model_files()
+        # The True/False result is ignored; safe_load_model prints its own errors.
+        safe_load_model()
     except Exception as e:
+        print(f"ERROR during initialization: {e}")
 @spaces.GPU
 def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
+    """Main voice cloning function with improved error handling."""
+    # Input validation
     if not chatterbox_available:
         return None, "Error: Chatterbox TTS library not available. Please check installation."
     if model is None:
         return None, "Error: Please upload a reference audio file (.wav or .mp3)."
     try:
+        print(f"Processing request:")
+        print(f"  Text length: {len(text_to_speak)} characters")
         print(f"  Audio: '{reference_audio_path}'")
+        print(f"  Parameters: exag={exaggeration}, cfg={cfg_pace}, seed={random_seed}, temp={temperature}")
+        # Clean GPU memory before generation
+        cleanup_gpu_memory()
+        # Set random seed if specified
         if random_seed > 0:
             torch.manual_seed(random_seed)
             if torch.cuda.is_available():
                 torch.cuda.manual_seed(random_seed)
+        # Check CUDA availability before generation
+        if torch.cuda.is_available():
+            print(f"CUDA memory before generation: {torch.cuda.memory_allocated() / 1024**2:.1f} MB")
+        # Generate audio with error handling
+        try:
+            with torch.no_grad():  # Disable gradient computation
+                output_wav_data = model.generate(
+                    text=text_to_speak,
+                    audio_prompt_path=reference_audio_path,
+                    exaggeration=exaggeration,
+                    cfg_weight=cfg_pace,
+                    temperature=temperature
+                )
+        except RuntimeError as e:
+            if "CUDA" in str(e) or "out of memory" in str(e):
+                print(f"CUDA error during generation: {e}")
+                # Try to recover by cleaning memory and retrying
+                cleanup_gpu_memory()
+                try:
+                    with torch.no_grad():
+                        output_wav_data = model.generate(
+                            text=text_to_speak,
+                            audio_prompt_path=reference_audio_path,
+                            exaggeration=exaggeration,
+                            cfg_weight=cfg_pace,
+                            temperature=temperature
+                        )
+                    print("✓ Recovery successful after memory cleanup")
+                except Exception as retry_error:
+                    print(f"✗ Recovery failed: {retry_error}")
+                    return None, f"CUDA error: {str(e)}. GPU memory issue - please try again in a moment."
+            else:
+                raise e
+        # Get sample rate
         try:
             sample_rate = model.sr
         except:
             sample_rate = 24000
+        # Process output
         if isinstance(output_wav_data, str):
+            result = output_wav_data
         else:
             import numpy as np
             if hasattr(output_wav_data, 'cpu'):
                 output_wav_data = output_wav_data.cpu().numpy()
             if output_wav_data.ndim > 1:
                 output_wav_data = output_wav_data.squeeze()
+            result = (sample_rate, output_wav_data)
+        # Clean up GPU memory after generation
+        cleanup_gpu_memory()
+        if torch.cuda.is_available():
+            print(f"CUDA memory after generation: {torch.cuda.memory_allocated() / 1024**2:.1f} MB")
+        print("✓ Audio generated successfully")
+        return result, "Success: Audio generated successfully!"
     except Exception as e:
+        print(f"ERROR during audio generation: {e}")
         traceback.print_exc()
+        # Clean up on error
+        cleanup_gpu_memory()
+        # Provide specific error messages
+        error_msg = str(e)
+        if "CUDA" in error_msg or "device-side assert" in error_msg:
+            return None, f"CUDA error: {error_msg}. This is usually a temporary GPU issue. Please try again in a moment."
+        elif "out of memory" in error_msg:
+            return None, f"GPU memory error: {error_msg}. Please try with shorter text or try again later."
+        else:
+            return None, f"Error during audio generation: {error_msg}. Check logs for more details."
 def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
+    """API wrapper with improved error handling."""
     import requests
     import tempfile
     import os
     temp_audio_path = None
     try:
+        # Handle different audio input formats
         if reference_audio_url.startswith('data:audio'):
             header, encoded = reference_audio_url.split(',', 1)
             audio_data = base64.b64decode(encoded)
+            ext = '.mp3' if 'mp3' in header else '.wav'
             with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
                 temp_file.write(audio_data)
                 temp_audio_path = temp_file.name
         elif reference_audio_url.startswith('http'):
+            response = requests.get(reference_audio_url, timeout=30)
             response.raise_for_status()
+            ext = '.mp3' if reference_audio_url.endswith('.mp3') else '.wav'
             with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
                 temp_file.write(response.content)
                 temp_audio_path = temp_file.name
         else:
             temp_audio_path = reference_audio_url
+        # Generate audio
         audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
         return audio_output, status
     except Exception as e:
+        print(f"API Error: {e}")
+        return None, f"API Error: {str(e)}"
+    finally:
+        # Clean up temporary file
         if temp_audio_path and temp_audio_path != reference_audio_url:
             try:
                 os.unlink(temp_audio_path)
             except:
                 pass
+# Gradio interface: interactive form plus a hidden endpoint for base64 audio.
 def main():
+    """Build the Gradio Blocks interface and start the web server.
+
+    Registers two endpoints, both backed by clone_voice_api: ``predict``
+    (the visible form) and ``clone_voice`` (hidden components that accept the
+    reference audio as a base64 data URL). Serves on 0.0.0.0:7860.
+    """
     print("Starting Advanced Gradio interface...")
+    # Create a Blocks interface with multiple functions
+    with gr.Blocks(title="🎙️ Advanced Chatterbox Voice Cloning") as demo:
+        gr.Markdown("# 🎙️ Advanced Chatterbox Voice Cloning")
+        gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Main interface inputs
+                text_input = gr.Textbox(
+                    label="Text to Speak",
+                    placeholder="Enter the text you want the cloned voice to say...",
+                    lines=3
+                )
+                audio_input = gr.Audio(
+                    type="filepath",
+                    label="Reference Audio (Upload a short .wav or .mp3 clip)",
+                    sources=["upload", "microphone"]
+                )
+                with gr.Accordion("🔧 Advanced Settings", open=False):
+                    with gr.Row():
+                        exaggeration_input = gr.Slider(
+                            minimum=0.25,
+                            maximum=1.0,
+                            value=0.6,
+                            step=0.05,
+                            label="Exaggeration",
+                            info="Controls voice characteristic emphasis"
+                        )
+                        cfg_pace_input = gr.Slider(
+                            minimum=0.2,
+                            maximum=1.0,
+                            value=0.3,
+                            step=0.05,
+                            label="CFG/Pace",
+                            info="Classifier-free guidance weight"
+                        )
+                    with gr.Row():
+                        seed_input = gr.Number(
+                            value=0,
+                            label="Random Seed",
+                            info="Set to 0 for random results",
+                            precision=0
+                        )
+                        temperature_input = gr.Slider(
+                            minimum=0.05,
+                            maximum=2.0,
+                            value=0.6,
+                            step=0.05,
+                            label="Temperature",
+                            info="Controls randomness in generation"
+                        )
+                generate_btn = gr.Button("🎵 Generate Voice Clone", variant="primary", size="lg")
+            with gr.Column(scale=1):
+                # Outputs
+                audio_output = gr.Audio(label="Generated Audio", type="numpy")
+                status_output = gr.Textbox(label="Status", lines=2)
+        with gr.Accordion("📝 Examples", open=False):
+            gr.Examples(
+                examples=[
+                    ["Hello, this is a test of the voice cloning system.", None, 0.5, 0.5, 0, 0.8],
+                    ["The quick brown fox jumps over the lazy dog.", None, 0.7, 0.3, 42, 0.6],
+                    ["Welcome to our AI voice cloning service. We hope you enjoy the experience!", None, 0.4, 0.7, 123, 1.0]
+                ],
+                inputs=[text_input, audio_input, exaggeration_input, cfg_pace_input, seed_input, temperature_input]
+            )
+        # Main interface function (for file uploads)
+        generate_btn.click(
+            fn=clone_voice_api,
+            inputs=[text_input, audio_input, exaggeration_input, cfg_pace_input, seed_input, temperature_input],
+            outputs=[audio_output, status_output],
+            api_name="predict"
+        )
+        # API function for base64 data (for external API calls)
+        def clone_voice_base64_api(text_to_speak, reference_audio_b64, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
+            """API function that accepts base64 audio data directly."""
+            return clone_voice_api(text_to_speak, reference_audio_b64, exaggeration, cfg_pace, random_seed, temperature)
+        # Hidden inputs/outputs for the base64 API
+        with gr.Row(visible=False):
+            api_text_input = gr.Textbox()
+            api_audio_input = gr.Textbox()  # This will receive base64 data URL
+            api_exaggeration_input = gr.Slider(minimum=0.25, maximum=1.0, value=0.6)
+            api_cfg_pace_input = gr.Slider(minimum=0.2, maximum=1.0, value=0.3)
+            api_seed_input = gr.Number(value=0, precision=0)
+            api_temperature_input = gr.Slider(minimum=0.05, maximum=2.0, value=0.6)
+            api_audio_output = gr.Audio(type="numpy")
+            api_status_output = gr.Textbox()
+            api_btn = gr.Button()
+        # API endpoint for base64 data
+        api_btn.click(
+            fn=clone_voice_base64_api,
+            inputs=[api_text_input, api_audio_input, api_exaggeration_input, api_cfg_pace_input, api_seed_input, api_temperature_input],
+            outputs=[api_audio_output, api_status_output],
+            api_name="clone_voice"
+        )
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+        quiet=False,
+        favicon_path=None,
+        share=False,
+        auth=None
+    )
 if __name__ == "__main__":
+    main()