ramimu committed on
Commit
240a407
·
verified ·
1 Parent(s): 5cbfdab

Update app.py

Files changed (1)
  1. app.py +428 -13
app.py CHANGED
@@ -1,18 +1,433 @@
- #!/usr/bin/env python3
- """
- Chatterbox Voice Cloning - Hugging Face Space
- Main entry point for the application
- """
-
- import sys
+ import gradio as gr
  import os
+ import traceback # For detailed error logging
+ import torch
+ from huggingface_hub import hf_hub_download
+ import shutil
+
+ # Import configuration
+ try:
+     from config import MODEL_REPO_ID, MODEL_FILES, LOCAL_MODEL_PATH
+ except ImportError:
+     # Fallback configuration if config.py is not found
+     MODEL_REPO_ID = "ramimu/chatterbox-voice-cloning-model"
+     LOCAL_MODEL_PATH = "./chatterbox_model_files"
+     MODEL_FILES = ["s3gen.pt", "t3_cfg.pt", "ve.pt", "tokenizer.json"]
+
+ # Try importing chatterbox with better error handling
+ try:
+     from chatterbox.tts import ChatterboxTTS
+     chatterbox_available = True
+     print("Chatterbox TTS imported successfully")
+ except ImportError as e:
+     print(f"Failed to import ChatterboxTTS: {e}")
+     print("Trying alternative import...")
+     try:
+         import chatterbox
+         from chatterbox import ChatterboxTTS
+         chatterbox_available = True
+         print("Chatterbox TTS imported with alternative method")
+     except ImportError as e2:
+         print(f"Alternative import also failed: {e2}")
+         chatterbox_available = False
+
+ # --- Global Model Variable ---
+ model = None
+
+ def download_model_files():
+     """Download model files from Hugging Face Hub if they don't exist locally"""
+     print(f"Checking for model files in {LOCAL_MODEL_PATH}...")
+
+     # Create model directory if it doesn't exist
+     os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)
+
+     for filename in MODEL_FILES:
+         local_path = os.path.join(LOCAL_MODEL_PATH, filename)
+         if not os.path.exists(local_path):
+             print(f"Downloading {filename} from {MODEL_REPO_ID}...")
+             try:
+                 downloaded_path = hf_hub_download(
+                     repo_id=MODEL_REPO_ID,
+                     filename=filename,
+                     cache_dir="./cache",
+                     force_download=False # Use cache if available
+                 )
+                 # Copy to our local model path
+                 shutil.copy2(downloaded_path, local_path)
+                 print(f"✓ Downloaded and copied {filename}")
+             except Exception as e:
+                 print(f"✗ Failed to download {filename}: {e}")
+                 raise e
+         else:
+             print(f"✓ {filename} already exists locally")
+
+     print("All model files are ready!")
+
+ # --- Load the Model ---
+ if chatterbox_available:
+     print("Downloading model files from Hugging Face Hub...")
+     try:
+         download_model_files()
+     except Exception as e:
+         print(f"ERROR: Failed to download model files: {e}")
+         print("Model loading will fail without these files.")
+
+     print(f"Attempting to load Chatterbox model from local directory: {LOCAL_MODEL_PATH}")
+     if not os.path.exists(LOCAL_MODEL_PATH):
+         print(f"ERROR: Local model directory not found at {LOCAL_MODEL_PATH}")
+         print("Please ensure the model files were downloaded successfully.")
+     else:
+         print(f"Contents of {LOCAL_MODEL_PATH}: {os.listdir(LOCAL_MODEL_PATH)}")
+         try:
+             # Load the model from the specified local directory
+             # Set device to CPU or CUDA if available
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+             print(f"Using device: {device}")
+
+             # The correct method signature is from_local(model_path, device)
+             # based on the error message showing from_local is called internally
+             try:
+                 model = ChatterboxTTS.from_local(LOCAL_MODEL_PATH, device)
+                 print("Chatterbox model loaded successfully using from_local method.")
+             except Exception as e1:
+                 print(f"from_local attempt failed: {e1}")
+                 try:
+                     # Try the corrected from_pretrained with proper parameter order
+                     # It seems from_pretrained expects (local_path, device) not (device=device)
+                     model = ChatterboxTTS.from_pretrained(LOCAL_MODEL_PATH, device)
+                     print("Chatterbox model loaded successfully with corrected from_pretrained.")
+                 except Exception as e2:
+                     print(f"Corrected from_pretrained failed: {e2}")
+                     try:
+                         # Try loading individual components manually
+                         import torch
+                         s3gen_path = os.path.join(LOCAL_MODEL_PATH, "s3gen.pt")
+                         ve_path = os.path.join(LOCAL_MODEL_PATH, "ve.pt")
+                         tokenizer_path = os.path.join(LOCAL_MODEL_PATH, "tokenizer.json")
+                         t3_cfg_path = os.path.join(LOCAL_MODEL_PATH, "t3_cfg.pt")
+
+                         print(f"Loading components manually...")
+                         print(f"  s3gen: {s3gen_path}")
+                         print(f"  ve: {ve_path}")
+                         print(f"  tokenizer: {tokenizer_path}")
+                         print(f"  t3_cfg: {t3_cfg_path}")
+
+                         # Load the components
+                         s3gen = torch.load(s3gen_path, map_location=device)
+                         ve = torch.load(ve_path, map_location=device)
+
+                         # Load tokenizer
+                         import json
+                         with open(tokenizer_path, 'r') as f:
+                             tokenizer = json.load(f)
+
+                         # Create model instance with loaded components
+                         model = ChatterboxTTS(s3gen, ve, tokenizer, device)
+                         print("Chatterbox model loaded successfully with manual component loading.")
+                     except Exception as e3:
+                         print(f"Manual loading failed: {e3}")
+                         raise e3
+
+         except Exception as e:
+             print(f"ERROR: Failed to load Chatterbox model from local directory: {e}")
+             print("Detailed error trace:")
+             traceback.print_exc() # Prints the full traceback to the Hugging Face Space logs
+             model = None # Ensure model is None if loading fails
+ else:
+     print("ERROR: Chatterbox TTS library not available")
+
+ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
+     if not chatterbox_available:
+         return None, "Error: Chatterbox TTS library not available. Please check installation."
+     if model is None:
+         return None, "Error: Model not loaded. Please check the logs for details."
+     if not text_to_speak or text_to_speak.strip() == "":
+         return None, "Error: Please enter some text to speak."
+     if reference_audio_path is None:
+         return None, "Error: Please upload a reference audio file (.wav or .mp3)."
+
+     try:
+         print(f"Received request:")
+         print(f"  Text: '{text_to_speak}'")
+         print(f"  Audio: '{reference_audio_path}'")
+         print(f"  Exaggeration: {exaggeration}")
+         print(f"  CFG/Pace: {cfg_pace}")
+         print(f"  Random Seed: {random_seed}")
+         print(f"  Temperature: {temperature}")
+
+         # Set random seed if specified
+         if random_seed > 0:
+             import torch
+             torch.manual_seed(random_seed)
+             if torch.cuda.is_available():
+                 torch.cuda.manual_seed(random_seed)
+
+         # Use the correct ChatterboxTTS generate method signature with advanced parameters
+         output_wav_data = model.generate(
+             text=text_to_speak,
+             audio_prompt_path=reference_audio_path,
+             exaggeration=exaggeration, # Controls how much the voice characteristics are emphasized
+             cfg_weight=cfg_pace, # Classifier-free guidance weight (pace)
+             temperature=temperature # Controls randomness in generation
+         )
+
+         # Get the sample rate from the model
+         try:
+             sample_rate = model.sr # ChatterboxTTS uses 'sr' attribute
+         except:
+             sample_rate = 24000 # Default fallback
+
+         print(f"Audio generated successfully. Output data type: {type(output_wav_data)}, Sample rate: {sample_rate}")
+
+         # Handle different output formats
+         if isinstance(output_wav_data, str):
+             # If it's a file path, return the path
+             return output_wav_data, "Success: Audio generated successfully!"
+         else:
+             # If it's numpy array or tensor, return with sample rate
+             import numpy as np
+             if hasattr(output_wav_data, 'cpu'):
+                 # Convert tensor to numpy if needed
+                 output_wav_data = output_wav_data.cpu().numpy()
+
+             # Ensure it's the right shape for Gradio (1D array)
+             if output_wav_data.ndim > 1:
+                 output_wav_data = output_wav_data.squeeze()
+
+             return (sample_rate, output_wav_data), "Success: Audio generated successfully!"
+
+     except Exception as e:
+         print(f"ERROR: Failed during audio generation: {e}")
+         print("Detailed error trace for audio generation:")
+         traceback.print_exc() # Prints the full traceback
+         return None, f"Error during audio generation: {str(e)}. Check logs for more details."
+
+ # --- API Endpoint Function ---
+ def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
+     """
+     API version of clone_voice that accepts URL or base64 audio data
+     """
+     import requests
+     import tempfile
+     import os
+     import base64
+
+     # Handle different audio input formats
+     temp_audio_path = None
+     try:
+         if reference_audio_url.startswith('data:audio'):
+             # Handle base64 encoded audio
+             header, encoded = reference_audio_url.split(',', 1)
+             audio_data = base64.b64decode(encoded)
+
+             # Determine file extension from MIME type
+             if 'mp3' in header:
+                 ext = '.mp3'
+             elif 'wav' in header:
+                 ext = '.wav'
+             else:
+                 ext = '.wav' # Default
+
+             # Save to temporary file
+             with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
+                 temp_file.write(audio_data)
+                 temp_audio_path = temp_file.name
+
+         elif reference_audio_url.startswith('http'):
+             # Download audio from URL
+             response = requests.get(reference_audio_url)
+             response.raise_for_status()
+
+             # Determine extension from URL or content type
+             if reference_audio_url.endswith('.mp3'):
+                 ext = '.mp3'
+             elif reference_audio_url.endswith('.wav'):
+                 ext = '.wav'
+             else:
+                 ext = '.wav' # Default
+
+             with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
+                 temp_file.write(response.content)
+                 temp_audio_path = temp_file.name
+         else:
+             # Assume it's a local file path
+             temp_audio_path = reference_audio_url
+
+         # Call the main clone_voice function
+         audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
+
+         # Clean up temporary file if we created one
+         if temp_audio_path and temp_audio_path != reference_audio_url:
+             try:
+                 os.unlink(temp_audio_path)
+             except:
+                 pass
+
+         return audio_output, status
+
+     except Exception as e:
+         if temp_audio_path and temp_audio_path != reference_audio_url:
+             try:
+                 os.unlink(temp_audio_path)
+             except:
+                 pass
+         return None, f"API Error: {str(e)}"

- # Add the voice_cloning directory to the path
- sys.path.append(os.path.join(os.path.dirname(__file__), 'voice_cloning'))
+ # --- Define Gradio Interface ---
+ # --- Define Gradio Interface ---
+ with gr.Blocks(title="Advanced Chatterbox Voice Cloning", theme=gr.themes.Soft()) as iface:
+     gr.Markdown("# 🎙️ Advanced Chatterbox Voice Cloning")
+     gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             # Main inputs
+             text_input = gr.Textbox(
+                 label="Text to Speak",
+                 placeholder="Enter the text you want the cloned voice to say...",
+                 lines=3
+             )
+             audio_input = gr.Audio(
+                 type="filepath",
+                 label="Reference Audio (Upload a short .wav or .mp3 clip)",
+                 sources=["upload", "microphone"]
+             )
+
+             # Advanced controls in an accordion
+             with gr.Accordion("🔧 Advanced Settings", open=False):
+                 with gr.Row():
+                     exaggeration = gr.Slider(
+                         minimum=0.25,
+                         maximum=1.0,
+                         value=0.6,
+                         step=0.05,
+                         label="Exaggeration",
+                         info="Controls voice characteristic emphasis (0.5 = neutral, higher = more exaggerated)"
+                     )
+                     cfg_pace = gr.Slider(
+                         minimum=0.2,
+                         maximum=1.0,
+                         value=0.3,
+                         step=0.05,
+                         label="CFG/Pace",
+                         info="Classifier-free guidance weight (affects generation quality and pace)"
+                     )
+
+                 with gr.Row():
+                     random_seed = gr.Number(
+                         value=0,
+                         label="Random Seed",
+                         info="Set to 0 for random results, or use a specific number for reproducible outputs",
+                         precision=0
+                     )
+                     temperature = gr.Slider(
+                         minimum=0.05,
+                         maximum=2.0,
+                         value=0.6,
+                         step=0.05,
+                         label="Temperature",
+                         info="Controls randomness in generation (lower = more consistent, higher = more varied)"
+                     )
+
+             # Generate button
+             generate_btn = gr.Button("🎵 Generate Voice Clone", variant="primary", size="lg")
+
+         with gr.Column(scale=1):
+             # Outputs
+             audio_output = gr.Audio(
+                 label="Generated Audio",
+                 type="numpy",
+                 interactive=False
+             )
+             status_output = gr.Textbox(
+                 label="Status",
+                 interactive=False,
+                 lines=2
+             )
+
+     # API Information
+     with gr.Accordion("🔌 API Usage", open=False):
+         gr.Markdown("""
+         ### Using this as an API endpoint
+
+         You can use this Hugging Face Space as an API endpoint in your applications:
+
+         **Endpoint URL:** `https://your-username-voice-cloning.hf.space/api/predict`
+
+         **Example Python code:**
+         ```python
+         import requests
+         import base64
+
+         # Encode your audio file
+         with open("reference_audio.wav", "rb") as f:
+             audio_data = base64.b64encode(f.read()).decode()
+         audio_url = f"data:audio/wav;base64,{audio_data}"
+
+         # API request
+         response = requests.post(
+             "https://your-username-voice-cloning.hf.space/api/predict",
+             json={
+                 "data": [
+                     "Hello, this is my cloned voice!", # text
+                     audio_url, # reference audio (base64 or URL)
+                     0.6, # exaggeration
+                     0.3, # cfg_pace
+                     0, # random_seed
+                     0.6 # temperature
+                 ]
+             }
+         )
+         ```
+
+         **Parameters:**
+         - `text_to_speak`: Text to synthesize
+         - `reference_audio`: Base64 encoded audio or URL
+         - `exaggeration`: Voice emphasis (0.25-1.0, default: 0.6)
+         - `cfg_pace`: Generation guidance (0.2-1.0, default: 0.3)
+         - `random_seed`: Reproducibility seed (0 for random, default: 0)
+         - `temperature`: Generation randomness (0.05-2.0, default: 0.6)
+         """)
+
+     # Examples
+     with gr.Accordion("📝 Examples", open=False):
+         gr.Examples(
+             examples=[
+                 ["Hello, this is a test of the voice cloning system.", None, 0.5, 0.5, 0, 0.8],
+                 ["The quick brown fox jumps over the lazy dog.", None, 0.7, 0.3, 42, 0.6],
+                 ["Welcome to our AI voice cloning service. We hope you enjoy the experience!", None, 0.4, 0.7, 123, 1.0]
+             ],
+             inputs=[text_input, audio_input, exaggeration, cfg_pace, random_seed, temperature],
+             outputs=[audio_output, status_output],
+             fn=clone_voice,
+             cache_examples=False
+         )
+
+     # Connect the generate button
+     generate_btn.click(
+         fn=clone_voice,
+         inputs=[text_input, audio_input, exaggeration, cfg_pace, random_seed, temperature],
+         outputs=[audio_output, status_output],
+         api_name="clone_voice" # This enables API access
+     )

- # Import and run the main app
- from voice_cloning.app import *
+ # --- Launch the Gradio App ---
+ def main():
+     print("Starting Advanced Gradio interface...")
+     # Launch with specific configuration for API access and avoid manifest issues
+     iface.launch(
+         server_name="0.0.0.0", # Allow external connections
+         server_port=7860, # Explicit port
+         show_error=True, # Show detailed errors
+         quiet=False, # Show startup logs
+         favicon_path=None, # Disable favicon to avoid 404
+         share=False, # Set to True if you want a public link
+         auth=None, # Add authentication if needed: ("username", "password")
+         app_kwargs={
+             "docs_url": "/docs", # Enable API docs at /docs
+             "redoc_url": "/redoc" # Enable alternative docs at /redoc
+         }
+     )

  if __name__ == "__main__":
-     # Run the app
-     main()
+     main()
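
Because the `generate_btn.click(...)` call above sets `api_name="clone_voice"`, the Space can also be called through Gradio's client library rather than raw `requests`. The following is a minimal sketch, not part of this commit: the Space id `your-username/voice-cloning` is a placeholder, and `handle_file` assumes a recent `gradio_client` release.

```python
# Hypothetical client-side call to the /clone_voice endpoint exposed by api_name="clone_voice".
from gradio_client import Client, handle_file

client = Client("your-username/voice-cloning")  # placeholder Space id

# Inputs are passed in the same order as the Gradio inputs list:
# text, reference audio, exaggeration, cfg_pace, random_seed, temperature.
audio_path, status = client.predict(
    "Hello, this is my cloned voice!",
    handle_file("reference_audio.wav"),
    0.6,   # exaggeration
    0.3,   # cfg_pace
    0,     # random_seed
    0.6,   # temperature
    api_name="/clone_voice",
)

print(status)      # status message from the second output component
print(audio_path)  # local path to the downloaded generated audio
```

The two return values mirror the endpoint's `outputs=[audio_output, status_output]` ordering.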