Spaces:

ramimu
/

voice_cloning

Running

App Files Files Community

ramimu committed on Jun 19

Commit

74dbc75

verified ·

1 Parent(s): 91d6893

Update app.py

Browse files

Files changed (1) hide show

app.py +154 -55

app.py CHANGED Viewed

@@ -22,27 +22,50 @@ except ImportError as e:
     print(f"Failed to import ChatterboxTTS: {e}")
     chatterbox_available = False
 model = None
-def cleanup_gpu_memory():
-    """Clean up GPU memory to prevent CUDA errors."""
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        torch.cuda.synchronize()
-        gc.collect()
-def safe_load_model():
-    """Safely load the model with proper error handling."""
-    global model
     if not chatterbox_available:
         print("ERROR: Chatterbox TTS library not available")
         return False
     try:
-        # Clean up any existing GPU memory
-        cleanup_gpu_memory()
         device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Loading model on device: {device}")
@@ -65,16 +88,16 @@ def safe_load_model():
             model = model.to(device)
         if model and hasattr(model, 'eval'):
             model.eval()
-        # Clean up after loading
-        cleanup_gpu_memory()
         return True
     except Exception as e:
-        print(f"ERROR: Failed to load model: {e}")
         traceback.print_exc()
         model = None
-        cleanup_gpu_memory()
         return False
 def load_model_manually(device):
@@ -85,7 +108,7 @@ def load_model_manually(device):
     model_path = pathlib.Path(LOCAL_MODEL_PATH)
     print("Manual loading with correct constructor signature...")
-    # Load components to CPU first
     s3gen_path = model_path / "s3gen.pt"
     ve_path = model_path / "ve.pt"
     tokenizer_path = model_path / "tokenizer.json"
@@ -116,54 +139,46 @@ def load_model_manually(device):
     print("✓ Model loaded successfully with manual constructor.")
     return model
-def download_model_files():
-    """Download model files with error handling."""
-    print(f"Checking for model files in {LOCAL_MODEL_PATH}...")
-    os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)
-    for filename in MODEL_FILES:
-        local_path = os.path.join(LOCAL_MODEL_PATH, filename)
-        if not os.path.exists(local_path):
-            print(f"Downloading {filename} from {MODEL_REPO_ID}...")
-            try:
-                downloaded_path = hf_hub_download(
-                    repo_id=MODEL_REPO_ID,
-                    filename=filename,
-                    cache_dir="./cache",
-                    force_download=False
-                )
-                shutil.copy2(downloaded_path, local_path)
-                print(f"✓ Downloaded and copied {filename}")
-            except Exception as e:
-                print(f"✗ Failed to download {filename}: {e}")
-                raise e
-        else:
-            print(f"✓ {filename} already exists locally")
-    print("All model files are ready!")
-# Initialize model
 if chatterbox_available:
     try:
         download_model_files()
-        safe_load_model()
     except Exception as e:
-        print(f"ERROR during initialization: {e}")
 @spaces.GPU
 def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
-    """Main voice cloning function with improved error handling."""
     # Input validation
     if not chatterbox_available:
         return None, "Error: Chatterbox TTS library not available. Please check installation."
-    if model is None:
-        return None, "Error: Model not loaded. Please check the logs for details."
     if not text_to_speak or text_to_speak.strip() == "":
         return None, "Error: Please enter some text to speak."
     if reference_audio_path is None:
         return None, "Error: Please upload a reference audio file (.wav or .mp3)."
     try:
         print(f"Processing request:")
         print(f"  Text length: {len(text_to_speak)} characters")
         print(f"  Audio: '{reference_audio_path}'")
@@ -178,13 +193,13 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
             if torch.cuda.is_available():
                 torch.cuda.manual_seed(random_seed)
-        # Check CUDA availability before generation
         if torch.cuda.is_available():
             print(f"CUDA memory before generation: {torch.cuda.memory_allocated() / 1024**2:.1f} MB")
         # Generate audio with error handling
         try:
-            with torch.no_grad():  # Disable gradient computation
                 output_wav_data = model.generate(
                     text=text_to_speak,
                     audio_prompt_path=reference_audio_path,
@@ -209,6 +224,7 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
                     print("✓ Recovery successful after memory cleanup")
                 except Exception as retry_error:
                     print(f"✗ Recovery failed: {retry_error}")
                     return None, f"CUDA error: {str(e)}. GPU memory issue - please try again in a moment."
             else:
                 raise e
@@ -244,7 +260,10 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
         traceback.print_exc()
         # Clean up on error
-        cleanup_gpu_memory()
         # Provide specific error messages
         error_msg = str(e)
@@ -256,7 +275,7 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
             return None, f"Error during audio generation: {error_msg}. Check logs for more details."
 def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
-    """API wrapper with improved error handling."""
     import requests
     import tempfile
     import os
@@ -282,7 +301,7 @@ def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pa
         else:
             temp_audio_path = reference_audio_url
-        # Generate audio
         audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
         return audio_output, status
@@ -298,11 +317,91 @@ def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pa
             except:
                 pass
-# Rest of your Gradio interface code remains the same...
 def main():
     print("Starting Advanced Gradio interface...")
-    # Your existing Gradio interface code here
-    pass
 if __name__ == "__main__":
     main()

     print(f"Failed to import ChatterboxTTS: {e}")
     chatterbox_available = False
+# Global model variable - will be loaded inside GPU function
 model = None
+model_loaded = False
+def download_model_files():
+    """Ensure every file in MODEL_FILES exists under LOCAL_MODEL_PATH.
+
+    Files already present locally are left untouched. Each missing file is
+    fetched from the MODEL_REPO_ID repository with ``hf_hub_download`` (using
+    ``./cache`` as the download cache) and then copied into LOCAL_MODEL_PATH.
+
+    Raises:
+        Exception: whatever the download or copy raised, re-raised after the
+            failure has been logged.
+    """
+    print(f"Checking for model files in {LOCAL_MODEL_PATH}...")
+    os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)
+    for filename in MODEL_FILES:
+        local_path = os.path.join(LOCAL_MODEL_PATH, filename)
+        if os.path.exists(local_path):
+            print(f"✓ {filename} already exists locally")
+            continue
+        print(f"Downloading {filename} from {MODEL_REPO_ID}...")
+        # Copy under a temporary name and rename at the end, so an interrupted
+        # copy never leaves a truncated file that the existence check above
+        # would accept on the next start.
+        partial_path = local_path + ".part"
+        try:
+            downloaded_path = hf_hub_download(
+                repo_id=MODEL_REPO_ID,
+                filename=filename,
+                cache_dir="./cache",
+                force_download=False,
+            )
+            shutil.copy2(downloaded_path, partial_path)
+            os.replace(partial_path, local_path)
+            print(f"✓ Downloaded and copied {filename}")
+        except Exception as e:
+            print(f"✗ Failed to download {filename}: {e}")
+            # Bare raise keeps the original traceback intact.
+            raise
+    print("All model files are ready!")
+def load_model_on_gpu():
+    """Load model inside GPU context - only called within @spaces.GPU decorated function."""
+    global model, model_loaded
+    if model_loaded and model is not None:
+        return True
     if not chatterbox_available:
         print("ERROR: Chatterbox TTS library not available")
         return False
     try:
+        print("Loading model inside GPU context...")
+        # Now we can safely use CUDA operations
         device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Loading model on device: {device}")
             model = model.to(device)
         if model and hasattr(model, 'eval'):
             model.eval()
+        model_loaded = True
+        print("✓ Model loaded successfully in GPU context")
         return True
     except Exception as e:
+        print(f"ERROR: Failed to load model in GPU context: {e}")
         traceback.print_exc()
         model = None
+        model_loaded = False
         return False
 def load_model_manually(device):
     model_path = pathlib.Path(LOCAL_MODEL_PATH)
     print("Manual loading with correct constructor signature...")
+    # Load components to CPU first, then move to device
     s3gen_path = model_path / "s3gen.pt"
     ve_path = model_path / "ve.pt"
     tokenizer_path = model_path / "tokenizer.json"
     print("✓ Model loaded successfully with manual constructor.")
     return model
+def cleanup_gpu_memory():
+    """Release unreferenced Python objects and cached CUDA memory.
+
+    Only call this from inside a GPU context.
+
+    Garbage collection runs first so that objects kept alive only by
+    reference cycles are freed before the CUDA caching allocator is asked to
+    give back its unused blocks; in the opposite order, memory freed by the
+    collector would stay cached until the next call.
+    """
+    # Collect unconditionally: this also helps when CUDA is not available.
+    gc.collect()
+    if torch.cuda.is_available():
+        # Let queued GPU work finish before releasing cached blocks.
+        torch.cuda.synchronize()
+        torch.cuda.empty_cache()
+# Download model files during startup (CPU only)
 if chatterbox_available:
     try:
         download_model_files()
+        print("Model files downloaded. Model will be loaded on first GPU request.")
     except Exception as e:
+        # A failed download is only logged here; startup continues.
+        print(f"ERROR during model file download: {e}")
 @spaces.GPU
 def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
+    """Main voice cloning function - runs on GPU."""
+    global model, model_loaded
     # Input validation
     if not chatterbox_available:
         return None, "Error: Chatterbox TTS library not available. Please check installation."
     if not text_to_speak or text_to_speak.strip() == "":
         return None, "Error: Please enter some text to speak."
     if reference_audio_path is None:
         return None, "Error: Please upload a reference audio file (.wav or .mp3)."
     try:
+        # Load model if not already loaded (inside GPU context)
+        if not model_loaded:
+            print("Loading model for the first time...")
+            if not load_model_on_gpu():
+                return None, "Error: Failed to load model. Please check the logs for details."
+        if model is None:
+            return None, "Error: Model not loaded. Please check the logs for details."
         print(f"Processing request:")
         print(f"  Text length: {len(text_to_speak)} characters")
         print(f"  Audio: '{reference_audio_path}'")
             if torch.cuda.is_available():
                 torch.cuda.manual_seed(random_seed)
+        # Check CUDA availability and memory
         if torch.cuda.is_available():
             print(f"CUDA memory before generation: {torch.cuda.memory_allocated() / 1024**2:.1f} MB")
         # Generate audio with error handling
         try:
+            with torch.no_grad():  # Disable gradient computation to save memory
                 output_wav_data = model.generate(
                     text=text_to_speak,
                     audio_prompt_path=reference_audio_path,
                     print("✓ Recovery successful after memory cleanup")
                 except Exception as retry_error:
                     print(f"✗ Recovery failed: {retry_error}")
+                    cleanup_gpu_memory()
                     return None, f"CUDA error: {str(e)}. GPU memory issue - please try again in a moment."
             else:
                 raise e
         traceback.print_exc()
         # Clean up on error
+        try:
+            cleanup_gpu_memory()
+        except:
+            pass
         # Provide specific error messages
         error_msg = str(e)
             return None, f"Error during audio generation: {error_msg}. Check logs for more details."
 def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
+    """API wrapper function - this will call the GPU function."""
     import requests
     import tempfile
     import os
         else:
             temp_audio_path = reference_audio_url
+        # Call the GPU function
         audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
         return audio_output, status
             except:
                 pass
+# Gradio interface definition and application entry point.
 def main():
+    """Build the Gradio Blocks interface and start the web server.
+
+    The visible form sends its inputs to ``clone_voice_api`` (exposed under
+    the API name ``predict``). A second, hidden set of components exposes
+    ``clone_voice_base64_api`` under the API name ``clone_voice``. The app is
+    then launched on 0.0.0.0:7860.
+    """
     print("Starting Advanced Gradio interface...")
+    # Lay out the UI; event handlers are registered inside this Blocks context.
+    with gr.Blocks(title="🎙️ Advanced Chatterbox Voice Cloning") as demo:
+        gr.Markdown("# 🎙️ Advanced Chatterbox Voice Cloning")
+        gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
+        with gr.Row():
+            with gr.Column(scale=2):
+                text_input = gr.Textbox(
+                    label="Text to Speak",
+                    placeholder="Enter the text you want the cloned voice to say...",
+                    lines=3
+                )
+                audio_input = gr.Audio(
+                    type="filepath",
+                    label="Reference Audio (Upload a short .wav or .mp3 clip)",
+                    sources=["upload", "microphone"]
+                )
+                with gr.Accordion("🔧 Advanced Settings", open=False):
+                    with gr.Row():
+                        exaggeration_input = gr.Slider(
+                            minimum=0.25, maximum=1.0, value=0.6, step=0.05,
+                            label="Exaggeration", info="Controls voice characteristic emphasis"
+                        )
+                        cfg_pace_input = gr.Slider(
+                            minimum=0.2, maximum=1.0, value=0.3, step=0.05,
+                            label="CFG/Pace", info="Classifier-free guidance weight"
+                        )
+                    with gr.Row():
+                        seed_input = gr.Number(
+                            value=0, label="Random Seed", info="Set to 0 for random results", precision=0
+                        )
+                        temperature_input = gr.Slider(
+                            minimum=0.05, maximum=2.0, value=0.6, step=0.05,
+                            label="Temperature", info="Controls randomness in generation"
+                        )
+                generate_btn = gr.Button("🎵 Generate Voice Clone", variant="primary", size="lg")
+            with gr.Column(scale=1):
+                audio_output = gr.Audio(label="Generated Audio", type="numpy")
+                status_output = gr.Textbox(label="Status", lines=2)
+        # Wire the visible form: the input order matches clone_voice_api's parameters.
+        generate_btn.click(
+            fn=clone_voice_api,
+            inputs=[text_input, audio_input, exaggeration_input, cfg_pace_input, seed_input, temperature_input],
+            outputs=[audio_output, status_output],
+            api_name="predict"
+        )
+        # Thin wrapper behind the hidden endpoint: it forwards its arguments
+        # unchanged to clone_voice_api (no base64 decoding happens here).
+        def clone_voice_base64_api(text_to_speak, reference_audio_b64, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
+            return clone_voice_api(text_to_speak, reference_audio_b64, exaggeration, cfg_pace, random_seed, temperature)
+        # Components in a non-visible row, used only to back the
+        # "clone_voice" endpoint registered below.
+        with gr.Row(visible=False):
+            api_text_input = gr.Textbox()
+            api_audio_input = gr.Textbox()
+            api_exaggeration_input = gr.Slider(minimum=0.25, maximum=1.0, value=0.6)
+            api_cfg_pace_input = gr.Slider(minimum=0.2, maximum=1.0, value=0.3)
+            api_seed_input = gr.Number(value=0, precision=0)
+            api_temperature_input = gr.Slider(minimum=0.05, maximum=2.0, value=0.6)
+            api_audio_output = gr.Audio(type="numpy")
+            api_status_output = gr.Textbox()
+            api_btn = gr.Button()
+        api_btn.click(
+            fn=clone_voice_base64_api,
+            inputs=[api_text_input, api_audio_input, api_exaggeration_input, api_cfg_pace_input, api_seed_input, api_temperature_input],
+            outputs=[api_audio_output, api_status_output],
+            api_name="clone_voice"
+        )
+    # Serve on all network interfaces at port 7860, with share disabled.
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+        quiet=False,
+        share=False
+    )
 if __name__ == "__main__":
+    # Start the interface only when this file is run as a script.
     main()