Spaces: Running on Zero

Commit · 597cecf ("pt 1")
1 Parent(s): 39d2f14

This view is limited to 50 files because it contains too many changes.
- app.py +143 -248
- ctcmodel.py +73 -75
- discriminator_conformer.py +100 -63
- dmd_trainer.py +338 -151
- duration_predictor.py +38 -17
- duration_trainer.py +181 -116
- duration_trainer_with_prompt.py +164 -94
- ecapa_tdnn.py +155 -50
- f5_tts/api.py +29 -18
- f5_tts/eval/ecapa_tdnn.py +91 -19
- f5_tts/eval/eval_infer_batch.py +34 -15
- f5_tts/eval/eval_librispeech_test_clean.py +20 -8
- f5_tts/eval/eval_seedtts_testset.py +17 -7
- f5_tts/eval/eval_utmos.py +9 -3
- f5_tts/eval/utils_eval.py +43 -13
- f5_tts/infer/infer_cli.py +53 -32
- f5_tts/infer/infer_gradio.py +178 -57
- f5_tts/infer/speech_edit.py +30 -14
- f5_tts/infer/utils_infer.py +106 -33
- f5_tts/model/__init__.py +2 -5
- f5_tts/model/backbones/dit.py +63 -30
- f5_tts/model/backbones/mmdit.py +82 -65
- f5_tts/model/backbones/unett.py +45 -23
- f5_tts/model/cfm.py +61 -25
- f5_tts/model/dataset.py +100 -82
- f5_tts/model/modules.py +105 -38
- f5_tts/model/trainer.py +163 -78
- f5_tts/model/utils.py +31 -18
- f5_tts/model_new/__init__.py +0 -1
- f5_tts/model_new/backbones/dit.py +65 -26
- f5_tts/model_new/backbones/mmdit.py +42 -20
- f5_tts/model_new/backbones/unett.py +68 -27
- f5_tts/model_new/cfm.py +41 -18
- f5_tts/model_new/dataset.py +31 -9
- f5_tts/model_new/modules.py +126 -39
- f5_tts/model_new/trainer.py +142 -43
- f5_tts/model_new/utils.py +29 -13
- f5_tts/runtime/triton_trtllm/benchmark.py +106 -28
- f5_tts/runtime/triton_trtllm/client_grpc.py +42 -13
- f5_tts/runtime/triton_trtllm/client_http.py +26 -5
- f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/f5_tts_trtllm.py +125 -33
- f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py +62 -17
- f5_tts/runtime/triton_trtllm/patch/__init__.py +7 -11
- f5_tts/runtime/triton_trtllm/patch/f5tts/model.py +13 -4
- f5_tts/runtime/triton_trtllm/patch/f5tts/modules.py +98 -43
- f5_tts/runtime/triton_trtllm/scripts/conv_stft.py +3 -2
- f5_tts/runtime/triton_trtllm/scripts/convert_checkpoint.py +76 -22
- f5_tts/runtime/triton_trtllm/scripts/export_vocoder_to_onnx.py +12 -5
- f5_tts/runtime/triton_trtllm/scripts/fill_template.py +6 -2
- f5_tts/scripts/count_max_epoch.py +6 -2
app.py CHANGED
@@ -1,130 +1,51 @@
## IMPORTS ##
import os
import tempfile
import time
from pathlib import Path

import gradio as gr
import numpy as np
import spaces
import torch
import torchaudio
from cached_path import cached_path
from huggingface_hub import hf_hub_download
from transformers import pipeline

from infer import DMOInference

## CUDA DEVICE ##
device = "cuda" if torch.cuda.is_available() else "cpu"

## LOAD MODELS ##
asr_pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large-v3-turbo", device=device
)
model = DMOInference(
    student_checkpoint_path=str(cached_path("hf://yl4579/DMOSpeech2/model_85000.pt")),
    duration_predictor_path=str(cached_path("hf://yl4579/DMOSpeech2/model_1500.pt")),
    device=device,
    model_type="F5TTS_Base",
)


def transcribe(ref_audio, language=None):
    """Transcribe audio using the pre-loaded ASR pipeline."""
    return asr_pipe(
        ref_audio,
        chunk_length_s=30,
        batch_size=128,
        generate_kwargs=(
            {"task": "transcribe", "language": language}
            if language
            else {"task": "transcribe"}
        ),
        return_timestamps=False,
    )["text"].strip()


@spaces.GPU(duration=120)
def generate_speech(
    prompt_audio,
    prompt_text,
@@ -134,128 +55,115 @@ def generate_speech(
    custom_teacher_steps,
    custom_teacher_stopping_time,
    custom_student_start_step,
    verbose,
):
    if prompt_audio is None:
        raise gr.Error("Please upload a reference audio!")

    if not target_text:
        raise gr.Error("Please enter text to generate!")

    if not prompt_text and prompt_text != "":
        prompt_text = transcribe(prompt_audio)


    if mode == "Student Only (4 steps)":
        teacher_steps = 0
        student_start_step = 0
        teacher_stopping_time = 1.0
    elif mode == "Teacher-Guided (8 steps)":
        teacher_steps = 16
        teacher_stopping_time = 0.07
        student_start_step = 1
    elif mode == "High Diversity (16 steps)":
        teacher_steps = 24
        teacher_stopping_time = 0.3
        student_start_step = 2
    else:  # Custom
        teacher_steps = custom_teacher_steps
        teacher_stopping_time = custom_teacher_stopping_time
        student_start_step = custom_student_start_step

    # Generate speech
    generated_audio = model.generate(
        gen_text=target_text,
        audio_path=prompt_audio,
        prompt_text=prompt_text if prompt_text else None,
        teacher_steps=teacher_steps,
        teacher_stopping_time=teacher_stopping_time,
        student_start_step=student_start_step,
        temperature=temperature,
        verbose=verbose,
    )


    # Save audio
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        output_path = tmp_file.name

    if isinstance(generated_audio, np.ndarray):
        generated_audio = torch.from_numpy(generated_audio)

    if generated_audio.dim() == 1:
        generated_audio = generated_audio.unsqueeze(0)

    torchaudio.save(output_path, generated_audio, 24000)

    return (
        output_path,
        "Success!",
        (
            f"Mode: {mode} | Transcribed: {prompt_text[:50]}..."
            if not prompt_text
            else f"Mode: {mode}"
        ),
    )


# Create Gradio interface
with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS") as demo:
    gr.Markdown(
        f"""
    # 🎙️ DMOSpeech 2: Zero-Shot Text-to-Speech

    Generate natural speech in any voice with just a short reference audio!
    """
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Reference audio input
            prompt_audio = gr.Audio(
                label="📎 Reference Audio",
                type="filepath",
                sources=["upload", "microphone"],
            )

            prompt_text = gr.Textbox(
                label="📝 Reference Text (leave empty for auto-transcription)",
                placeholder="The text spoken in the reference audio...",
                lines=2,
            )

            target_text = gr.Textbox(
                label="✍️ Text to Generate",
                placeholder="Enter the text you want to synthesize...",
                lines=4,
            )

            # Generation mode
            mode = gr.Radio(
                choices=[
                    "Student Only (4 steps)",
                    "Teacher-Guided (8 steps)",
                    "High Diversity (16 steps)",
                    "Custom",
                ],
                value="Teacher-Guided (8 steps)",
                label="🚀 Generation Mode",
                info="Choose speed vs quality/diversity tradeoff",
            )

            # Advanced settings (collapsible)
            with gr.Accordion("⚙️ Advanced Settings", open=False):
                temperature = gr.Slider(
@@ -264,9 +172,9 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS") as demo:
                    value=0.0,
                    step=0.1,
                    label="Duration Temperature",
                    info="0 = deterministic, >0 = more variation in speech rhythm",
                )

                with gr.Group(visible=False) as custom_settings:
                    gr.Markdown("### Custom Mode Settings")
                    custom_teacher_steps = gr.Slider(
@@ -275,60 +183,50 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS") as demo:
                        value=16,
                        step=1,
                        label="Teacher Steps",
                        info="More steps = higher quality",
                    )

                    custom_teacher_stopping_time = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.07,
                        step=0.01,
                        label="Teacher Stopping Time",
                        info="When to switch to student",
                    )

                    custom_student_start_step = gr.Slider(
                        minimum=0,
                        maximum=4,
                        value=1,
                        step=1,
                        label="Student Start Step",
                        info="Which student step to start from",
                    )

                    verbose = gr.Checkbox(
                        value=False,
                        label="Verbose Output",
                        info="Show detailed generation steps",
                    )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            # Output
            output_audio = gr.Audio(
                label="🔊 Generated Speech", type="filepath", autoplay=True
            )

            status = gr.Textbox(label="Status", interactive=False)

            metrics = gr.Textbox(label="Performance Metrics", interactive=False)

            info = gr.Textbox(label="Generation Info", interactive=False)

    # Tips
    gr.Markdown(
        """
    ### 💡 Quick Tips:

    - **Auto-transcription**: Leave reference text empty to auto-transcribe
@@ -341,8 +239,9 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS") as demo:
    - Student Only: ~0.05x (20x faster than real-time)
    - Teacher-Guided: ~0.10x (10x faster)
    - High Diversity: ~0.20x (5x faster)
    """
    )

    # Event handler
    generate_btn.click(
        generate_speech,
@@ -355,21 +254,17 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS") as demo:
            custom_teacher_steps,
            custom_teacher_stopping_time,
            custom_student_start_step,
            verbose,
        ],
        outputs=[output_audio, status, metrics, info],
    )

    # Update visibility of custom settings based on mode
    def update_custom_visibility(mode):
        is_custom = mode == "Custom"
        return gr.update(visible=is_custom)

    mode.change(update_custom_visibility, inputs=[mode], outputs=[custom_settings])

# Launch the app
if __name__ == "__main__":
@@ -377,5 +272,5 @@ if __name__ == "__main__":
        print(f"Warning: Model failed to load - {status_message}")
    if not asr_pipe:
        print("Warning: ASR pipeline not available - auto-transcription disabled")

    demo.launch()
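Aside from the UI changes, the refactored app.py makes the whole inference path importable: the ASR pipeline and the DMOInference model are built at module load, and generate_speech() only maps the radio-button presets onto generate() arguments. The sketch below (not part of the commit) drives the same path without Gradio; the reference-audio and output paths are placeholders, and the keyword arguments are only those that appear in the diff.

# Sketch only (not part of the commit): run the refactored inference path without the UI.
# "ref.wav" and "out.wav" are illustrative paths.
import torch
import torchaudio
from cached_path import cached_path

from infer import DMOInference

device = "cuda" if torch.cuda.is_available() else "cpu"
tts = DMOInference(
    student_checkpoint_path=str(cached_path("hf://yl4579/DMOSpeech2/model_85000.pt")),
    duration_predictor_path=str(cached_path("hf://yl4579/DMOSpeech2/model_1500.pt")),
    device=device,
    model_type="F5TTS_Base",
)

# "Teacher-Guided (8 steps)" preset from generate_speech()
wav = tts.generate(
    gen_text="Hello from DMOSpeech 2!",
    audio_path="ref.wav",
    prompt_text=None,  # None lets the model handle the prompt text, as the app does
    teacher_steps=16,
    teacher_stopping_time=0.07,
    student_start_step=1,
    temperature=0.0,
    verbose=False,
)
if not torch.is_tensor(wav):
    wav = torch.from_numpy(wav)
torchaudio.save("out.wav", wav.unsqueeze(0) if wav.dim() == 1 else wav, 24000)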
ctcmodel.py CHANGED
@@ -1,36 +1,24 @@
import copy
from pathlib import Path

import torch
from torch import nn
from torchaudio.models import Conformer

from f5_tts.model.utils import (default, exists, lens_to_mask, list_str_to_idx,
                                list_str_to_tensor, mask_from_frac_lengths)


class ResBlock(nn.Module):
    def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2):
        super().__init__()
        self._n_groups = 8
        self.blocks = nn.ModuleList(
            [
                self._get_conv(hidden_dim, dilation=3**i, dropout_p=dropout_p)
                for i in range(n_conv)
            ]
        )

    def forward(self, x):
        for block in self.blocks:
@@ -41,70 +29,71 @@ class ResBlock(nn.Module):

    def _get_conv(self, hidden_dim, dilation, dropout_p=0.2):
        layers = [
            nn.Conv1d(
                hidden_dim,
                hidden_dim,
                kernel_size=3,
                padding=dilation,
                dilation=dilation,
            ),
            nn.ReLU(),
            nn.GroupNorm(num_groups=self._n_groups, num_channels=hidden_dim),
            nn.Dropout(p=dropout_p),
            nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1, dilation=1),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
        ]
        return nn.Sequential(*layers)


class ConformerCTC(nn.Module):
    def __init__(self, vocab_size, mel_dim=100, num_heads=8, d_hid=512, nlayers=6):
        super().__init__()

        self.mel_proj = nn.Conv1d(mel_dim, d_hid, kernel_size=3, padding=1)

        self.d_hid = d_hid

        self.resblock1 = nn.Sequential(
            ResBlock(d_hid), nn.GroupNorm(num_groups=1, num_channels=d_hid)
        )

        self.resblock2 = nn.Sequential(
            ResBlock(d_hid), nn.GroupNorm(num_groups=1, num_channels=d_hid)
        )

        self.conf_pre = torch.nn.ModuleList(
            [
                Conformer(
                    input_dim=d_hid,
                    num_heads=num_heads,
                    ffn_dim=d_hid * 2,
                    num_layers=1,
                    depthwise_conv_kernel_size=15,
                    use_group_norm=True,
                )
                for _ in range(nlayers // 2)
            ]
        )

        self.conf_after = torch.nn.ModuleList(
            [
                Conformer(
                    input_dim=d_hid,
                    num_heads=num_heads,
                    ffn_dim=d_hid * 2,
                    num_layers=1,
                    depthwise_conv_kernel_size=7,
                    use_group_norm=True,
                )
                for _ in range(nlayers // 2)
            ]
        )

        self.out = nn.Linear(d_hid, 1 + vocab_size)  # 1 for blank

        self.ctc_loss = nn.CTCLoss(blank=vocab_size, zero_infinity=True).cuda()

    def forward(self, latent, text=None, text_lens=None):
        layers = []

@@ -125,20 +114,24 @@ class ConformerCTC(nn.Module):

        batch_size, time_steps, _ = x.shape
        # Create a dummy lengths tensor (all sequences are assumed to be full length).
        input_lengths = torch.full(
            (batch_size,), time_steps, device=x.device, dtype=torch.int64
        )

        for layer in self.conf_pre:
            x, _ = layer(x, input_lengths)
            layers.append(x.transpose(1, 2))

        for layer in self.conf_after:
            x, _ = layer(x, input_lengths)
            layers.append(x.transpose(1, 2))

        x = self.out(x)

        if text_lens is not None and text is not None:
            loss = self.ctc_loss(
                x.log_softmax(dim=2).transpose(0, 1), text, input_lengths, text_lens
            )
            return x, layers, loss
        else:
            return x, layers
@@ -147,9 +140,8 @@ class ConformerCTC(nn.Module):
if __name__ == "__main__":
    from f5_tts.model.utils import get_tokenizer

    bsz = 16

    tokenizer = "pinyin"  # 'pinyin', 'char', or 'custom'
    tokenizer_path = None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
    dataset_name = "Emilia_ZH_EN"
@@ -158,15 +150,17 @@
    else:
        tokenizer_path = dataset_name
    vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)

    model = ConformerCTC(
        vocab_size, mel_dim=80, num_heads=8, d_hid=512, nlayers=6
    ).cuda()

    text = ["hello world"] * bsz
    lens = torch.randint(1, 1000, (bsz,)).cuda()
    inp = torch.randn(bsz, lens.max(), 80).cuda()

    batch, seq_len, dtype, device = *inp.shape[:2], inp.dtype, inp.device

    # handle text as string
    text_lens = torch.tensor([len(t) for t in text], device=device)
    if isinstance(text, list):
@@ -198,7 +192,6 @@

    char_vocab_map = list(vocab_char_map.keys())

    for batch in best_path:
        decoded_sequence = []
        previous_token = None
@@ -212,10 +205,15 @@
        decoded_sequences.append(decoded_sequence)

    # Convert token indices to characters
    decoded_texts = [
        "".join([char_vocab_map[token] for token in sequence])
        for sequence in decoded_sequences
    ]
    gt_texts = []
    for i in range(text_lens.size(0)):
        gt_texts.append(
            "".join([char_vocab_map[token] for token in text[i, : text_lens[i]]])
        )

    print(decoded_texts)
    print(gt_texts)
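Besides the CTC objective, forward() exposes the per-block Conformer features (layers), and it is these intermediate features that the discriminator in the next file consumes: it keeps the last three blocks and upsamples them 4x along time. A minimal sketch of that feature-extraction use, with random mel features standing in for a real batch and an illustrative vocabulary size, is:

# Sketch only: the random input and vocab size are placeholders, not values from the commit.
import torch
import torch.nn.functional as F

from ctcmodel import ConformerCTC

ctc = ConformerCTC(vocab_size=2546, mel_dim=80, num_heads=8, d_hid=512, nlayers=6).cuda()
mel = torch.randn(4, 400, 80).cuda()  # (batch, frames, mel bins)

logits, feats = ctc(mel)  # logits: (B, T', 1 + vocab_size); feats: list of (B, d_hid, T')
feats = feats[-3:]        # last three Conformer blocks
feats = [F.interpolate(f, mode="nearest", scale_factor=4).transpose(-1, -2) for f in feats]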
discriminator_conformer.py CHANGED
@@ -2,30 +2,28 @@

from __future__ import annotations

from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.transforms as trans
from torchaudio.models import Conformer

from f5_tts.model.utils import (default, exists, lens_to_mask, list_str_to_idx,
                                list_str_to_tensor, mask_from_frac_lengths)


class ResBlock(nn.Module):
    def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2):
        super().__init__()
        self._n_groups = 8
        self.blocks = nn.ModuleList(
            [
                self._get_conv(hidden_dim, dilation=3**i, dropout_p=dropout_p)
                for i in range(n_conv)
            ]
        )

    def forward(self, x):
        for block in self.blocks:
@@ -36,46 +34,67 @@ class ResBlock(nn.Module):

    def _get_conv(self, hidden_dim, dilation, dropout_p=0.2):
        layers = [
            nn.Conv1d(
                hidden_dim,
                hidden_dim,
                kernel_size=3,
                padding=dilation,
                dilation=dilation,
            ),
            nn.ReLU(),
            nn.GroupNorm(num_groups=self._n_groups, num_channels=hidden_dim),
            nn.Dropout(p=dropout_p),
            nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1, dilation=1),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
        ]
        return nn.Sequential(*layers)


class ConformerDiscirminator(nn.Module):
    def __init__(
        self,
        input_dim,
        channels=512,
        num_layers=3,
        num_heads=8,
        depthwise_conv_kernel_size=15,
        use_group_norm=True,
    ):
        super().__init__()

        self.input_layer = nn.Conv1d(input_dim, channels, kernel_size=3, padding=1)

        self.resblock1 = nn.Sequential(
            ResBlock(channels), nn.GroupNorm(num_groups=1, num_channels=channels)
        )

        self.resblock2 = nn.Sequential(
            ResBlock(channels), nn.GroupNorm(num_groups=1, num_channels=channels)
        )

        self.conformer1 = Conformer(
            **{
                "input_dim": channels,
                "num_heads": num_heads,
                "ffn_dim": channels * 2,
                "num_layers": 1,
                "depthwise_conv_kernel_size": depthwise_conv_kernel_size // 2,
                "use_group_norm": use_group_norm,
            }
        )

        self.conformer2 = Conformer(
            **{
                "input_dim": channels,
                "num_heads": num_heads,
                "ffn_dim": channels * 2,
                "num_layers": num_layers - 1,
                "depthwise_conv_kernel_size": depthwise_conv_kernel_size,
                "use_group_norm": use_group_norm,
            }
        )

        self.linear = nn.Conv1d(channels, 1, kernel_size=1)

    def forward(self, x):
@@ -89,12 +108,14 @@ class ConformerDiscirminator(nn.Module):
        x = nn.functional.avg_pool1d(x, 2)
        x = self.resblock2(x)
        x = nn.functional.avg_pool1d(x, 2)

        # Transpose to (B, T, C) for the conformer.
        x = x.transpose(1, 2)
        batch_size, time_steps, _ = x.shape
        # Create a dummy lengths tensor (all sequences are assumed to be full length).
        lengths = torch.full(
            (batch_size,), time_steps, device=x.device, dtype=torch.int64
        )
        # The built-in Conformer returns (output, output_lengths); we discard lengths.

        x, _ = self.conformer1(x, lengths)
@@ -107,12 +128,13 @@ class ConformerDiscirminator(nn.Module):

        return out


if __name__ == "__main__":
    from f5_tts.model import DiT
    from f5_tts.model.utils import get_tokenizer

    bsz = 2

    tokenizer = "pinyin"  # 'pinyin', 'char', or 'custom'
    tokenizer_path = None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
    dataset_name = "Emilia_ZH_EN"
@@ -121,20 +143,28 @@ if __name__ == "__main__":
    else:
        tokenizer_path = dataset_name
    vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)

    fake_unet = DiT(
        dim=1024,
        depth=22,
        heads=16,
        ff_mult=2,
        text_dim=512,
        conv_layers=4,
        text_num_embeds=vocab_size,
        mel_dim=80,
    )

    fake_unet = fake_unet.cuda()

    text = ["hello world"] * bsz
    lens = torch.randint(1, 1000, (bsz,)).cuda()
    inp = torch.randn(bsz, lens.max(), 80).cuda()

    batch, seq_len, dtype, device = *inp.shape[:2], inp.dtype, inp.device

    batch, seq_len, dtype, device = *inp.shape[:2], inp.dtype, inp.device

    # handle text as string
    if isinstance(text, list):
        if exists(vocab_char_map):
@@ -147,13 +177,17 @@ if __name__ == "__main__":
    if not exists(lens):
        lens = torch.full((batch,), seq_len, device=device)

    mask = lens_to_mask(
        lens, length=seq_len
    )  # useless here, as collate_fn will pad to max length in batch
    frac_lengths_mask = (0.7, 1.0)

    # get a random span to mask out for training conditionally
    frac_lengths = (
        torch.zeros((batch,), device=device).float().uniform_(*frac_lengths_mask)
    )
    rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)

    if exists(mask):
        rand_span_mask &= mask

@@ -163,38 +197,41 @@ if __name__ == "__main__":
    x1 = inp
    x0 = torch.randn_like(x1)
    t = time.unsqueeze(-1).unsqueeze(-1)

    phi = (1 - t) * x0 + t * x1
    flow = x1 - x0
    cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1)

    layers = fake_unet(
        x=phi,
        cond=cond,
        text=text,
        time=time,
        drop_audio_cond=False,
        drop_text=False,
        classify_mode=True,
    )

    # layers = torch.stack(layers, dim=1).transpose(-1, -2).flatten(start_dim=1, end_dim=2)
    # print(layers.shape)

    from ctcmodel import ConformerCTC

    ctcmodel = ConformerCTC(
        vocab_size=vocab_size, mel_dim=80, num_heads=8, d_hid=512, nlayers=6
    ).cuda()
    real_out, layer = ctcmodel(inp)
    layer = layer[-3:]  # only use the last 3 layers
    layer = [
        F.interpolate(l, mode="nearest", scale_factor=4).transpose(-1, -2)
        for l in layer
    ]
    if layer[0].size(1) < layers[0].size(1):
        layer = [F.pad(l, (0, 0, 0, layers[0].size(1) - l.size(1))) for l in layer]

    layers = layer + layers

    model = ConformerDiscirminator(input_dim=23 * 1024 + 3 * 512, channels=512)

    model = model.cuda()
    print(model)
dmd_trainer.py CHANGED
@@ -1,28 +1,26 @@
from __future__ import annotations

import gc
import math
import os

import torch
import torch.nn as nn
import wandb
from accelerate import Accelerator
from accelerate.utils import DistributedDataParallelKwargs
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR, SequentialLR
from torch.utils.data import DataLoader, Dataset, SequentialSampler
from tqdm import tqdm

from f5_tts.model import CFM
from f5_tts.model.dataset import DynamicBatchSampler, collate_fn
from f5_tts.model.utils import default, exists
from unimodel import UniModel

# trainer


class RunningStats:
    def __init__(self):
@@ -41,7 +39,7 @@ class RunningStats:
    @property
    def variance(self):
        """Return the sample variance. Returns NaN if fewer than two samples."""
        return self.M2 / (self.count - 1) if self.count > 1 else float("nan")

    @property
    def std(self):
@@ -49,7 +47,6 @@ class RunningStats:
        return math.sqrt(self.variance)


class Trainer:
    def __init__(
        self,
@@ -74,7 +71,6 @@ class Trainer:
        accelerate_kwargs: dict = dict(),
        bnb_optimizer: bool = False,
        scale: float = 1.0,
        # training parameters for DMDSpeech
        num_student_step: int = 1,
        gen_update_ratio: int = 5,
@@ -82,7 +78,6 @@ class Trainer:
        lambda_generator_loss: float = 1.0,
        lambda_ctc_loss: float = 1.0,
        lambda_sim_loss: float = 1.0,
        num_GAN: int = 5000,
        num_D: int = 500,
        num_ctc: int = 5000,
@@ -103,7 +98,13 @@ class Trainer:

        if logger == "wandb":
            if exists(wandb_resume_id):
                init_kwargs = {
                    "wandb": {
                        "resume": "allow",
                        "name": wandb_run_name,
                        "id": wandb_resume_id,
                    }
                }
            else:
                init_kwargs = {"wandb": {"resume": "allow", "name": wandb_run_name}}
|
110 |
self.accelerator.init_trackers(
|
|
|
131 |
self.epochs = epochs
|
132 |
self.num_warmup_updates = num_warmup_updates
|
133 |
self.save_per_updates = save_per_updates
|
134 |
+
self.last_per_steps = default(
|
135 |
+
last_per_steps, save_per_updates * grad_accumulation_steps
|
136 |
+
)
|
137 |
self.checkpoint_path = default(checkpoint_path, "ckpts/test_e2-tts")
|
138 |
|
139 |
self.batch_size = batch_size
|
|
|
145 |
self.noise_scheduler = noise_scheduler
|
146 |
|
147 |
self.duration_predictor = duration_predictor
|
148 |
+
|
149 |
self.log_step = log_step
|
150 |
|
151 |
+
self.gen_update_ratio = gen_update_ratio # number of generator updates per guidance (fake score function and discriminator) update
|
152 |
+
self.lambda_discriminator_loss = (
|
153 |
+
lambda_discriminator_loss # weight for discriminator loss (L_adv)
|
154 |
+
)
|
155 |
+
self.lambda_generator_loss = (
|
156 |
+
lambda_generator_loss # weight for generator loss (L_adv)
|
157 |
+
)
|
158 |
+
self.lambda_ctc_loss = lambda_ctc_loss # weight for ctc loss
|
159 |
+
self.lambda_sim_loss = lambda_sim_loss # weight for similarity loss
|
160 |
+
|
161 |
# create distillation schedule for student model
|
162 |
+
self.student_steps = torch.linspace(0.0, 1.0, num_student_step + 1)[:-1]
|
163 |
+
|
164 |
+
self.GAN = model.guidance_model.gen_cls_loss # whether to use GAN training
|
165 |
+
self.num_GAN = num_GAN # number of steps before adversarial training
|
166 |
+
self.num_D = num_D # number of steps to train the discriminator before adversarial training
|
167 |
+
self.num_ctc = num_ctc # number of steps before CTC training
|
168 |
+
self.num_sim = num_sim # number of steps before similarity training
|
169 |
+
self.num_simu = num_simu # number of steps before using simulated data
|
|
|
170 |
|
171 |
# Assuming `self.model.fake_unet.parameters()` and `self.model.guidance_model.parameters()` are accessible
|
172 |
if bnb_optimizer:
|
173 |
import bitsandbytes as bnb
|
174 |
+
|
175 |
+
self.optimizer_generator = bnb.optim.AdamW8bit(
|
176 |
+
self.model.feedforward_model.parameters(), lr=learning_rate
|
177 |
+
)
|
178 |
+
self.optimizer_guidance = bnb.optim.AdamW8bit(
|
179 |
+
self.model.guidance_model.parameters(), lr=learning_rate
|
180 |
+
)
|
181 |
else:
|
182 |
+
self.optimizer_generator = AdamW(
|
183 |
+
self.model.feedforward_model.parameters(), lr=learning_rate, eps=1e-7
|
184 |
+
)
|
185 |
+
self.optimizer_guidance = AdamW(
|
186 |
+
self.model.guidance_model.parameters(), lr=learning_rate, eps=1e-7
|
187 |
+
)
|
188 |
|
189 |
+
self.model, self.optimizer_generator, self.optimizer_guidance = (
|
190 |
+
self.accelerator.prepare(
|
191 |
+
self.model, self.optimizer_generator, self.optimizer_guidance
|
192 |
+
)
|
193 |
+
)
|
194 |
|
195 |
self.generator_norm = RunningStats()
|
196 |
self.guidance_norm = RunningStats()
|
197 |
|
|
|
198 |
@property
|
199 |
def is_main(self):
|
200 |
return self.accelerator.is_main_process
|
|
|
204 |
if self.is_main:
|
205 |
checkpoint = dict(
|
206 |
model_state_dict=self.accelerator.unwrap_model(self.model).state_dict(),
|
207 |
+
optimizer_generator_state_dict=self.accelerator.unwrap_model(
|
208 |
+
self.optimizer_generator
|
209 |
+
).state_dict(),
|
210 |
+
optimizer_guidance_state_dict=self.accelerator.unwrap_model(
|
211 |
+
self.optimizer_guidance
|
212 |
+
).state_dict(),
|
213 |
scheduler_generator_state_dict=self.scheduler_generator.state_dict(),
|
214 |
scheduler_guidance_state_dict=self.scheduler_guidance.state_dict(),
|
215 |
step=step,
|
|
|
218 |
if not os.path.exists(self.checkpoint_path):
|
219 |
os.makedirs(self.checkpoint_path)
|
220 |
if last:
|
221 |
+
self.accelerator.save(
|
222 |
+
checkpoint, f"{self.checkpoint_path}/model_last.pt"
|
223 |
+
)
|
224 |
print(f"Saved last checkpoint at step {step}")
|
225 |
else:
|
226 |
+
self.accelerator.save(
|
227 |
+
checkpoint, f"{self.checkpoint_path}/model_{step}.pt"
|
228 |
+
)
|
229 |
|
230 |
def load_checkpoint(self):
|
231 |
if (
|
|
|
244 |
key=lambda x: int("".join(filter(str.isdigit, x))),
|
245 |
)[-1]
|
246 |
# checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", map_location=self.accelerator.device) # rather use accelerator.load_state ಥ_ಥ
|
247 |
+
checkpoint = torch.load(
|
248 |
+
f"{self.checkpoint_path}/{latest_checkpoint}",
|
249 |
+
weights_only=True,
|
250 |
+
map_location="cpu",
|
251 |
+
)
|
252 |
|
253 |
+
self.accelerator.unwrap_model(self.model).load_state_dict(
|
254 |
+
checkpoint["model_state_dict"], strict=False
|
255 |
+
)
|
256 |
# self.accelerator.unwrap_model(self.optimizer_generator).load_state_dict(checkpoint["optimizer_generator_state_dict"])
|
257 |
# self.accelerator.unwrap_model(self.optimizer_guidance).load_state_dict(checkpoint["optimizer_guidance_state_dict"])
|
258 |
# if self.scheduler_guidance:
|
|
|
264 |
del checkpoint
|
265 |
gc.collect()
|
266 |
return step
|
|
|
267 |
|
268 |
+
def train(
|
269 |
+
self,
|
270 |
+
train_dataset: Dataset,
|
271 |
+
num_workers=64,
|
272 |
+
resumable_with_seed: int = None,
|
273 |
+
vocoder: nn.Module = None,
|
274 |
+
):
|
275 |
if exists(resumable_with_seed):
|
276 |
generator = torch.Generator()
|
277 |
generator.manual_seed(resumable_with_seed)
|
|
|
293 |
self.accelerator.even_batches = False
|
294 |
sampler = SequentialSampler(train_dataset)
|
295 |
batch_sampler = DynamicBatchSampler(
|
296 |
+
sampler,
|
297 |
+
self.batch_size,
|
298 |
+
max_samples=self.max_samples,
|
299 |
+
random_seed=resumable_with_seed,
|
300 |
+
drop_last=False,
|
301 |
)
|
302 |
train_dataloader = DataLoader(
|
303 |
train_dataset,
|
|
|
308 |
batch_sampler=batch_sampler,
|
309 |
)
|
310 |
else:
|
311 |
+
raise ValueError(
|
312 |
+
f"batch_size_type must be either 'sample' or 'frame', but received {self.batch_size_type}"
|
313 |
+
)
|
314 |
|
315 |
# accelerator.prepare() dispatches batches to devices;
|
316 |
# which means the length of dataloader calculated before, should consider the number of devices
|
317 |
+
warmup_steps = self.num_warmup_updates * self.accelerator.num_processes
|
318 |
+
|
|
|
|
|
319 |
# consider a fixed warmup steps while using accelerate multi-gpu ddp
|
320 |
# otherwise by default with split_batches=False, warmup steps change with num_processes
|
321 |
total_steps = len(train_dataloader) * self.epochs / self.grad_accumulation_steps
|
322 |
decay_steps = total_steps - warmup_steps
|
|
|
|
|
|
|
|
|
323 |
|
324 |
+
warmup_scheduler_generator = LinearLR(
|
325 |
+
self.optimizer_generator,
|
326 |
+
start_factor=1e-8,
|
327 |
+
end_factor=1.0,
|
328 |
+
total_iters=warmup_steps
|
329 |
+
// (self.gen_update_ratio * self.grad_accumulation_steps),
|
330 |
+
)
|
331 |
+
decay_scheduler_generator = LinearLR(
|
332 |
+
self.optimizer_generator,
|
333 |
+
start_factor=1.0,
|
334 |
+
end_factor=1e-8,
|
335 |
+
total_iters=decay_steps
|
336 |
+
// (self.gen_update_ratio * self.grad_accumulation_steps),
|
337 |
+
)
|
338 |
+
self.scheduler_generator = SequentialLR(
|
339 |
+
self.optimizer_generator,
|
340 |
+
schedulers=[warmup_scheduler_generator, decay_scheduler_generator],
|
341 |
+
milestones=[
|
342 |
+
warmup_steps // (self.gen_update_ratio * self.grad_accumulation_steps)
|
343 |
+
],
|
344 |
+
)
|
345 |
+
|
346 |
+
warmup_scheduler_guidance = LinearLR(
|
347 |
+
self.optimizer_guidance,
|
348 |
+
start_factor=1e-8,
|
349 |
+
end_factor=1.0,
|
350 |
+
total_iters=warmup_steps,
|
351 |
+
)
|
352 |
+
decay_scheduler_guidance = LinearLR(
|
353 |
+
self.optimizer_guidance,
|
354 |
+
start_factor=1.0,
|
355 |
+
end_factor=1e-8,
|
356 |
+
total_iters=decay_steps,
|
357 |
+
)
|
358 |
+
self.scheduler_guidance = SequentialLR(
|
359 |
+
self.optimizer_guidance,
|
360 |
+
schedulers=[warmup_scheduler_guidance, decay_scheduler_guidance],
|
361 |
+
milestones=[warmup_steps],
|
362 |
+
)
|
363 |
|
364 |
+
train_dataloader, self.scheduler_generator, self.scheduler_guidance = (
|
365 |
+
self.accelerator.prepare(
|
366 |
+
train_dataloader, self.scheduler_generator, self.scheduler_guidance
|
367 |
+
)
|
368 |
) # actual steps = 1 gpu steps / gpus
|
369 |
start_step = self.load_checkpoint()
|
370 |
global_step = start_step
|
|
|
373 |
orig_epoch_step = len(train_dataloader)
|
374 |
skipped_epoch = int(start_step // orig_epoch_step)
|
375 |
skipped_batch = start_step % orig_epoch_step
|
376 |
+
skipped_dataloader = self.accelerator.skip_first_batches(
|
377 |
+
train_dataloader, num_batches=skipped_batch
|
378 |
+
)
|
379 |
else:
|
380 |
skipped_epoch = 0
|
381 |
|
|
|
400 |
|
401 |
for batch in progress_bar:
|
402 |
update_generator = global_step % self.gen_update_ratio == 0
|
403 |
+
|
404 |
with self.accelerator.accumulate(self.model):
|
405 |
metrics = {}
|
406 |
text_inputs = batch["text"]
|
407 |
mel_spec = batch["mel"].permute(0, 2, 1)
|
408 |
mel_lengths = batch["mel_lengths"]
|
409 |
+
|
410 |
mel_spec = mel_spec / self.scale
|
411 |
+
|
412 |
+
guidance_loss_dict, guidance_log_dict = self.model(
|
413 |
+
inp=mel_spec,
|
414 |
+
text=text_inputs,
|
415 |
+
lens=mel_lengths,
|
416 |
+
student_steps=self.student_steps,
|
417 |
+
update_generator=False,
|
418 |
+
use_simulated=global_step >= self.num_simu,
|
419 |
+
)
|
420 |
|
421 |
# if self.GAN and update_generator:
|
422 |
# # only add discriminator loss if GAN is enabled and generator is being updated
|
423 |
# guidance_cls_loss = guidance_loss_dict["guidance_cls_loss"] * (self.lambda_discriminator_loss if global_step >= self.num_GAN and update_generator else 0)
|
424 |
# metrics['loss/discriminator_loss'] = guidance_loss_dict["guidance_cls_loss"]
|
425 |
# self.accelerator.backward(guidance_cls_loss, retain_graph=True)
|
426 |
+
|
427 |
# if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
|
428 |
# metrics['grad_norm_guidance'] = self.accelerator.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
|
429 |
|
430 |
guidance_loss = 0
|
431 |
guidance_loss += guidance_loss_dict["loss_fake_mean"]
|
432 |
+
metrics["loss/fake_score"] = guidance_loss_dict["loss_fake_mean"]
|
433 |
metrics["loss/guidance_loss"] = guidance_loss
|
434 |
|
435 |
if self.GAN and update_generator:
|
436 |
# only add discriminator loss if GAN is enabled and generator is being updated
|
437 |
+
guidance_cls_loss = guidance_loss_dict["guidance_cls_loss"] * (
|
438 |
+
self.lambda_discriminator_loss
|
439 |
+
if global_step >= self.num_GAN and update_generator
|
440 |
+
else 0
|
441 |
+
)
|
442 |
+
metrics["loss/discriminator_loss"] = guidance_loss_dict[
|
443 |
+
"guidance_cls_loss"
|
444 |
+
]
|
445 |
|
446 |
guidance_loss += guidance_cls_loss
|
447 |
+
|
448 |
self.accelerator.backward(guidance_loss)
|
449 |
|
450 |
if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
|
451 |
+
metrics["grad_norm_guidance"] = (
|
452 |
+
self.accelerator.clip_grad_norm_(
|
453 |
+
self.model.parameters(), self.max_grad_norm
|
454 |
+
)
|
455 |
+
)
|
456 |
|
457 |
# if self.guidance_norm.count < 100:
|
458 |
# self.guidance_norm.update(metrics['grad_norm_guidance'])
|
|
|
464 |
# elif self.guidance_norm.count >= 100:
|
465 |
# self.guidance_norm.update(metrics['grad_norm_guidance'])
|
466 |
|
|
|
467 |
self.optimizer_guidance.step()
|
468 |
self.scheduler_guidance.step()
|
469 |
self.optimizer_guidance.zero_grad()
|
470 |
self.optimizer_generator.zero_grad() # zero out the generator's gradient as well
|
471 |
+
|
472 |
if update_generator:
|
473 |
+
generator_loss_dict, generator_log_dict = self.model(
|
474 |
+
inp=mel_spec,
|
475 |
+
text=text_inputs,
|
476 |
+
lens=mel_lengths,
|
477 |
+
student_steps=self.student_steps,
|
478 |
+
update_generator=True,
|
479 |
+
use_simulated=global_step >= self.num_ctc,
|
480 |
+
)
|
481 |
# if self.GAN:
|
482 |
# gen_cls_loss = generator_loss_dict["gen_cls_loss"] * (self.lambda_generator_loss if global_step >= (self.num_GAN + self.num_D) and update_generator else 0)
|
483 |
# metrics["loss/gen_cls_loss"] = generator_loss_dict["gen_cls_loss"]
|
|
|
490 |
generator_loss = 0
|
491 |
generator_loss += generator_loss_dict["loss_dm"]
|
492 |
if "loss_mse" in generator_loss_dict:
|
493 |
+
generator_loss += generator_loss_dict["loss_mse"]
|
494 |
+
generator_loss += generator_loss_dict["loss_ctc"] * (
|
495 |
+
self.lambda_ctc_loss if global_step >= self.num_ctc else 0
|
496 |
+
)
|
497 |
+
generator_loss += generator_loss_dict["loss_sim"] * (
|
498 |
+
self.lambda_sim_loss if global_step >= self.num_sim else 0
|
499 |
+
)
|
500 |
+
generator_loss += generator_loss_dict["loss_kl"] * (
|
501 |
+
self.lambda_ctc_loss if global_step >= self.num_ctc else 0
|
502 |
+
)
|
503 |
if self.GAN:
|
504 |
+
gen_cls_loss = generator_loss_dict["gen_cls_loss"] * (
|
505 |
+
self.lambda_generator_loss
|
506 |
+
if global_step >= (self.num_GAN + self.num_D)
|
507 |
+
and update_generator
|
508 |
+
else 0
|
509 |
+
)
|
510 |
+
metrics["loss/gen_cls_loss"] = generator_loss_dict[
|
511 |
+
"gen_cls_loss"
|
512 |
+
]
|
513 |
generator_loss += gen_cls_loss
|
514 |
|
515 |
+
metrics["loss/dm_loss"] = generator_loss_dict["loss_dm"]
|
516 |
+
metrics["loss/ctc_loss"] = generator_loss_dict["loss_ctc"]
|
517 |
+
|
518 |
+
metrics["loss/similarity_loss"] = generator_loss_dict[
|
519 |
+
"loss_sim"
|
520 |
+
]
|
521 |
+
metrics["loss/generator_loss"] = generator_loss
|
522 |
+
|
523 |
+
if (
|
524 |
+
"loss_mse" in generator_loss_dict
|
525 |
+
and generator_loss_dict["loss_mse"] != 0
|
526 |
+
):
|
527 |
+
metrics["loss/mse_loss"] = generator_loss_dict["loss_mse"]
|
528 |
+
if (
|
529 |
+
"loss_kl" in generator_loss_dict
|
530 |
+
and generator_loss_dict["loss_kl"] != 0
|
531 |
+
):
|
532 |
+
metrics["loss/kl_loss"] = generator_loss_dict["loss_kl"]
|
533 |
|
534 |
self.accelerator.backward(generator_loss)
|
535 |
|
536 |
if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
|
537 |
+
metrics["grad_norm_generator"] = (
|
538 |
+
self.accelerator.clip_grad_norm_(
|
539 |
+
self.model.parameters(), self.max_grad_norm
|
540 |
+
)
|
541 |
+
)
|
542 |
# self.generator_norm.update(metrics['grad_norm_generator'])
|
543 |
+
|
544 |
# if metrics['grad_norm_generator'] > self.generator_norm.mean + 15 * self.generator_norm.std:
|
545 |
# self.optimizer_generator.zero_grad()
|
546 |
# self.optimizer_guidance.zero_grad()
|
|
|
553 |
self.optimizer_generator.zero_grad()
|
554 |
self.optimizer_guidance.zero_grad() # zero out the guidance's gradient as well
|
555 |
|
|
|
556 |
global_step += 1
|
557 |
|
558 |
if self.accelerator.is_local_main_process:
|
559 |
+
self.accelerator.log(
|
560 |
+
{
|
561 |
+
**metrics,
|
562 |
+
"lr_generator": self.scheduler_generator.get_last_lr()[0],
|
563 |
+
"lr_guidance": self.scheduler_guidance.get_last_lr()[0],
|
564 |
+
},
|
565 |
+
step=global_step,
|
566 |
+
)
|
567 |
+
|
568 |
+
if (
|
569 |
+
global_step % self.log_step == 0
|
570 |
+
and self.accelerator.is_local_main_process
|
571 |
+
and vocoder is not None
|
572 |
+
):
|
573 |
# log the first batch of the epoch
|
574 |
with torch.no_grad():
|
575 |
+
generator_input = (
|
576 |
+
generator_log_dict["generator_input"][0]
|
577 |
+
.unsqueeze(0)
|
578 |
+
.permute(0, 2, 1)
|
579 |
+
* self.scale
|
580 |
+
)
|
581 |
generator_input = vocoder.decode(generator_input.float().cpu())
|
582 |
generator_input = wandb.Audio(
|
583 |
generator_input.float().numpy().squeeze(),
|
584 |
sample_rate=24000,
|
585 |
+
caption="time: "
|
586 |
+
+ str(generator_log_dict["time"][0].float().cpu().numpy()),
|
587 |
)
|
588 |
|
589 |
+
generator_output = (
|
590 |
+
generator_log_dict["generator_output"][0]
|
591 |
+
.unsqueeze(0)
|
592 |
+
.permute(0, 2, 1)
|
593 |
+
* self.scale
|
594 |
+
)
|
595 |
+
generator_output = vocoder.decode(
|
596 |
+
generator_output.float().cpu()
|
597 |
+
)
|
598 |
generator_output = wandb.Audio(
|
599 |
generator_output.float().numpy().squeeze(),
|
600 |
sample_rate=24000,
|
601 |
+
caption="time: "
|
602 |
+
+ str(generator_log_dict["time"][0].float().cpu().numpy()),
|
603 |
+
)
|
604 |
+
|
605 |
+
generator_cond = (
|
606 |
+
generator_log_dict["generator_cond"][0]
|
607 |
+
.unsqueeze(0)
|
608 |
+
.permute(0, 2, 1)
|
609 |
+
* self.scale
|
610 |
)
|
|
|
|
|
611 |
generator_cond = vocoder.decode(generator_cond.float().cpu())
|
612 |
generator_cond = wandb.Audio(
|
613 |
generator_cond.float().numpy().squeeze(),
|
614 |
sample_rate=24000,
|
615 |
+
caption="time: "
|
616 |
+
+ str(generator_log_dict["time"][0].float().cpu().numpy()),
|
617 |
+
)
|
618 |
+
|
619 |
+
ground_truth = (
|
620 |
+
generator_log_dict["ground_truth"][0]
|
621 |
+
.unsqueeze(0)
|
622 |
+
.permute(0, 2, 1)
|
623 |
+
* self.scale
|
624 |
)
|
|
|
|
|
625 |
ground_truth = vocoder.decode(ground_truth.float().cpu())
|
626 |
ground_truth = wandb.Audio(
|
627 |
ground_truth.float().numpy().squeeze(),
|
628 |
sample_rate=24000,
|
629 |
+
caption="time: "
|
630 |
+
+ str(generator_log_dict["time"][0].float().cpu().numpy()),
|
631 |
+
)
|
632 |
+
|
633 |
+
dmtrain_noisy_inp = (
|
634 |
+
generator_log_dict["dmtrain_noisy_inp"][0]
|
635 |
+
.unsqueeze(0)
|
636 |
+
.permute(0, 2, 1)
|
637 |
+
* self.scale
|
638 |
+
)
|
639 |
+
dmtrain_noisy_inp = vocoder.decode(
|
640 |
+
dmtrain_noisy_inp.float().cpu()
|
641 |
)
|
|
|
|
|
|
|
642 |
dmtrain_noisy_inp = wandb.Audio(
|
643 |
dmtrain_noisy_inp.float().numpy().squeeze(),
|
644 |
sample_rate=24000,
|
645 |
+
caption="dmtrain_time: "
|
646 |
+
+ str(
|
647 |
+
generator_log_dict["dmtrain_time"][0]
|
648 |
+
.float()
|
649 |
+
.cpu()
|
650 |
+
.numpy()
|
651 |
+
),
|
652 |
+
)
|
653 |
+
|
654 |
+
dmtrain_pred_real_image = (
|
655 |
+
generator_log_dict["dmtrain_pred_real_image"][0]
|
656 |
+
.unsqueeze(0)
|
657 |
+
.permute(0, 2, 1)
|
658 |
+
* self.scale
|
659 |
+
)
|
660 |
+
dmtrain_pred_real_image = vocoder.decode(
|
661 |
+
dmtrain_pred_real_image.float().cpu()
|
662 |
)
|
|
|
|
|
|
|
663 |
dmtrain_pred_real_image = wandb.Audio(
|
664 |
dmtrain_pred_real_image.float().numpy().squeeze(),
|
665 |
sample_rate=24000,
|
666 |
+
caption="dmtrain_time: "
|
667 |
+
+ str(
|
668 |
+
generator_log_dict["dmtrain_time"][0]
|
669 |
+
.float()
|
670 |
+
.cpu()
|
671 |
+
.numpy()
|
672 |
+
),
|
673 |
+
)
|
674 |
+
|
675 |
+
dmtrain_pred_fake_image = (
|
676 |
+
generator_log_dict["dmtrain_pred_fake_image"][0]
|
677 |
+
.unsqueeze(0)
|
678 |
+
.permute(0, 2, 1)
|
679 |
+
* self.scale
|
680 |
+
)
|
681 |
+
dmtrain_pred_fake_image = vocoder.decode(
|
682 |
+
dmtrain_pred_fake_image.float().cpu()
|
683 |
)
|
|
|
|
|
|
|
684 |
dmtrain_pred_fake_image = wandb.Audio(
|
685 |
dmtrain_pred_fake_image.float().numpy().squeeze(),
|
686 |
sample_rate=24000,
|
687 |
+
caption="dmtrain_time: "
|
688 |
+
+ str(
|
689 |
+
generator_log_dict["dmtrain_time"][0]
|
690 |
+
.float()
|
691 |
+
.cpu()
|
692 |
+
.numpy()
|
693 |
+
),
|
694 |
+
)
|
695 |
+
|
696 |
+
self.accelerator.log(
|
697 |
+
{
|
698 |
+
"noisy_input": generator_input,
|
699 |
+
"output": generator_output,
|
700 |
+
"cond": generator_cond,
|
701 |
+
"ground_truth": ground_truth,
|
702 |
+
"dmtrain_noisy_inp": dmtrain_noisy_inp,
|
703 |
+
"dmtrain_pred_real_image": dmtrain_pred_real_image,
|
704 |
+
"dmtrain_pred_fake_image": dmtrain_pred_fake_image,
|
705 |
+
},
|
706 |
+
step=global_step,
|
707 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
708 |
|
709 |
progress_bar.set_postfix(step=str(global_step), metrics=metrics)
|
710 |
|
711 |
+
if (
|
712 |
+
global_step % (self.save_per_updates * self.grad_accumulation_steps)
|
713 |
+
== 0
|
714 |
+
):
|
715 |
self.save_checkpoint(global_step)
|
716 |
|
717 |
if global_step % self.last_per_steps == 0:
|
|
|
720 |
self.save_checkpoint(global_step, last=True)
|
721 |
|
722 |
self.accelerator.end_training()
|
|
|
|
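As a reading aid for the step-gated loss schedule above, here is a minimal standalone sketch (not part of the repository; function name and defaults are illustrative) of how the generator losses are combined, with the CTC, similarity, KL, and adversarial terms switched on only after their respective step thresholds.

import torch


def combine_generator_losses(loss_dict, global_step, *,
                             lambda_ctc=1.0, lambda_sim=1.0, lambda_gen=1.0,
                             num_ctc=5000, num_sim=5000, num_GAN=5000, num_D=500,
                             use_gan=True):
    # Distribution-matching loss is always on.
    total = loss_dict["loss_dm"]
    # Optional MSE term, if the guidance model returned one.
    total = total + loss_dict.get("loss_mse", torch.tensor(0.0))
    # Auxiliary terms are gated by step thresholds, mirroring the Trainer above.
    total = total + loss_dict["loss_ctc"] * (lambda_ctc if global_step >= num_ctc else 0)
    total = total + loss_dict["loss_sim"] * (lambda_sim if global_step >= num_sim else 0)
    total = total + loss_dict["loss_kl"] * (lambda_ctc if global_step >= num_ctc else 0)
    if use_gan:
        # Adversarial loss joins only after GAN warm-up plus discriminator pretraining.
        total = total + loss_dict["gen_cls_loss"] * (
            lambda_gen if global_step >= (num_GAN + num_D) else 0
        )
    return total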
duration_predictor.py
CHANGED
@@ -3,6 +3,7 @@ import torch.nn as nn

# from tts_encode import tts_encode

+
def calculate_remaining_lengths(mel_lengths):
    B = mel_lengths.shape[0]
    max_L = mel_lengths.max().item()  # Get the maximum length in the batch
@@ -21,64 +22,84 @@ class PositionalEncoding(nn.Module):
        super().__init__()
        pe = torch.zeros(max_len, hidden_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, hidden_dim, 2).float()
+            * (-torch.log(torch.tensor(10000.0)) / hidden_dim)
+        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)  # Shape: (1, max_len, hidden_dim)

    def forward(self, x):
+        x = x + self.pe[:, : x.size(1)].to(x.device)
        return x


class SpeechLengthPredictor(nn.Module):

+    def __init__(
+        self,
+        vocab_size=2545,
+        n_mel=100,
+        hidden_dim=256,
+        n_text_layer=4,
+        n_cross_layer=4,
+        n_head=8,
        output_dim=1,
    ):
        super().__init__()

        # Text Encoder: Embedding + Transformer Layers
+        self.text_embedder = nn.Embedding(
+            vocab_size + 1, hidden_dim, padding_idx=vocab_size
+        )
        self.text_pe = PositionalEncoding(hidden_dim)
        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=hidden_dim,
+            nhead=n_head,
+            dim_feedforward=hidden_dim * 2,
+            batch_first=True,
+        )
+        self.text_encoder = nn.TransformerEncoder(
+            encoder_layer, num_layers=n_text_layer
        )

        # Mel Spectrogram Embedder
        self.mel_embedder = nn.Linear(n_mel, hidden_dim)
        self.mel_pe = PositionalEncoding(hidden_dim)

        # Transformer Decoder Layers with Cross-Attention in Every Layer
        decoder_layer = nn.TransformerDecoderLayer(
+            d_model=hidden_dim,
+            nhead=n_head,
+            dim_feedforward=hidden_dim * 2,
+            batch_first=True,
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=n_cross_layer)

        # Final Classification Layer
        self.predictor = nn.Linear(hidden_dim, output_dim)

    def forward(self, text_ids, mel):
        # Encode text
        text_embedded = self.text_pe(self.text_embedder(text_ids))
+        text_features = self.text_encoder(text_embedded)  # (B, L_text, D)

        # Encode Mel spectrogram
        mel_features = self.mel_pe(self.mel_embedder(mel))  # (B, L_mel, D)

        # Causal Masking for Decoder
        seq_len = mel_features.size(1)
+        causal_mask = (
+            torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(mel.device)
+        )
        # causal_mask = torch.triu(
        #     torch.full((seq_len, seq_len), float('-inf'), device=mel.device), diagonal=1
        # )

        # Transformer Decoder with Cross-Attention in Each Layer
        decoder_out = self.decoder(mel_features, text_features, tgt_mask=causal_mask)

        # Length Prediction
        length_logits = self.predictor(decoder_out).squeeze(-1)
        return length_logits
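A hypothetical usage sketch for the class above (shapes and hyperparameters taken from the constructor defaults, not from the repository's own training scripts): it shows that the model maps padded text token ids and a mel spectrogram to one remaining-length logit per mel frame.

import torch

from duration_predictor import SpeechLengthPredictor

model = SpeechLengthPredictor(vocab_size=2545, n_mel=100, hidden_dim=256)
model.eval()

text_ids = torch.randint(0, 2545, (2, 32))  # (B, L_text) token ids
mel = torch.randn(2, 200, 100)              # (B, L_mel, n_mel) mel frames

with torch.no_grad():
    length_logits = model(text_ids=text_ids, mel=mel)

print(length_logits.shape)  # torch.Size([2, 200]): one estimate per mel frame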
duration_trainer.py
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
from __future__ import annotations
|
2 |
|
3 |
import gc
|
4 |
-
import os
|
5 |
-
|
6 |
import math
|
|
|
7 |
|
8 |
import torch
|
|
|
9 |
import torchaudio
|
10 |
import wandb
|
11 |
from accelerate import Accelerator
|
@@ -13,37 +13,28 @@ from accelerate.utils import DistributedDataParallelKwargs
|
|
13 |
from ema_pytorch import EMA
|
14 |
from torch.optim import AdamW
|
15 |
from torch.optim.lr_scheduler import LinearLR, SequentialLR
|
16 |
-
from torch.utils.data import
|
|
|
17 |
from tqdm import tqdm
|
18 |
|
19 |
-
import torch.nn.functional as F
|
20 |
-
|
21 |
-
from f5_tts.model import CFM
|
22 |
-
from f5_tts.model.dataset import collate_fn, DynamicBatchSampler
|
23 |
-
from f5_tts.model.utils import default, exists
|
24 |
-
|
25 |
from duration_predictor import calculate_remaining_lengths
|
|
|
|
|
|
|
|
|
26 |
|
27 |
# trainer
|
28 |
|
29 |
-
from f5_tts.model.utils import (
|
30 |
-
default,
|
31 |
-
exists,
|
32 |
-
list_str_to_idx,
|
33 |
-
list_str_to_tensor,
|
34 |
-
lens_to_mask,
|
35 |
-
mask_from_frac_lengths,
|
36 |
-
)
|
37 |
|
38 |
SAMPLE_RATE = 24_000
|
39 |
|
40 |
|
41 |
def masked_l1_loss(est_lengths, tar_lengths):
|
42 |
-
first_zero_idx = (tar_lengths == 0).int().argmax(dim=1)
|
43 |
B, L = tar_lengths.shape
|
44 |
-
range_tensor = torch.arange(L, device=tar_lengths.device).expand(B, L)
|
45 |
mask = range_tensor <= first_zero_idx[:, None] # Include the first 0
|
46 |
-
loss = F.l1_loss(est_lengths, tar_lengths, reduction=
|
47 |
loss = loss * mask # Zero out ignored positions
|
48 |
loss = loss.sum() / mask.sum() # Normalize by valid elements
|
49 |
return loss
|
@@ -55,9 +46,9 @@ def masked_cross_entropy_loss(est_length_logits, tar_length_labels):
|
|
55 |
range_tensor = torch.arange(L, device=tar_length_labels.device).expand(B, L)
|
56 |
mask = range_tensor <= first_zero_idx[:, None] # Include the first 0
|
57 |
loss = F.cross_entropy(
|
58 |
-
est_length_logits.reshape(-1, est_length_logits.size(-1)),
|
59 |
-
tar_length_labels.reshape(-1),
|
60 |
-
reduction=
|
61 |
).reshape(B, L)
|
62 |
loss = loss * mask
|
63 |
loss = loss.sum() / mask.sum()
|
@@ -71,7 +62,7 @@ class Trainer:
|
|
71 |
vocab_size,
|
72 |
vocab_char_map,
|
73 |
process_token_to_id=True,
|
74 |
-
loss_fn=
|
75 |
lambda_L1=1,
|
76 |
gumbel_tau=0.5,
|
77 |
n_class=301,
|
@@ -110,7 +101,13 @@ class Trainer:
|
|
110 |
self.logger = logger
|
111 |
if self.logger == "wandb":
|
112 |
if exists(wandb_resume_id):
|
113 |
-
init_kwargs = {
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
else:
|
115 |
init_kwargs = {"wandb": {"resume": "allow", "name": wandb_run_name}}
|
116 |
|
@@ -139,7 +136,7 @@ class Trainer:
|
|
139 |
self.vocab_size = vocab_size
|
140 |
self.vocab_char_map = vocab_char_map
|
141 |
self.process_token_to_id = process_token_to_id
|
142 |
-
assert loss_fn in [
|
143 |
self.loss_fn = loss_fn
|
144 |
self.lambda_L1 = lambda_L1
|
145 |
self.n_class = n_class
|
@@ -149,7 +146,9 @@ class Trainer:
|
|
149 |
self.epochs = epochs
|
150 |
self.num_warmup_updates = num_warmup_updates
|
151 |
self.save_per_updates = save_per_updates
|
152 |
-
self.last_per_steps = default(
|
|
|
|
|
153 |
self.checkpoint_path = default(checkpoint_path, "ckpts/test_e2-tts")
|
154 |
|
155 |
self.batch_size = batch_size
|
@@ -164,33 +163,44 @@ class Trainer:
|
|
164 |
self.optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate)
|
165 |
else:
|
166 |
self.optimizer = AdamW(model.parameters(), lr=learning_rate)
|
167 |
-
self.model, self.optimizer = self.accelerator.prepare(
|
168 |
-
|
|
|
|
|
169 |
@property
|
170 |
def is_main(self):
|
171 |
return self.accelerator.is_main_process
|
172 |
|
173 |
def save_checkpoint(self, step, last=False):
|
174 |
self.accelerator.wait_for_everyone()
|
175 |
-
if self.is_main:
|
176 |
checkpoint = dict(
|
177 |
model_state_dict=self.accelerator.unwrap_model(self.model).state_dict(),
|
178 |
-
optimizer_state_dict=self.accelerator.unwrap_model(
|
|
|
|
|
179 |
scheduler_state_dict=self.scheduler.state_dict(),
|
180 |
step=step,
|
181 |
)
|
182 |
if not os.path.exists(self.checkpoint_path):
|
183 |
os.makedirs(self.checkpoint_path)
|
184 |
if last:
|
185 |
-
self.accelerator.save(
|
|
|
|
|
186 |
else:
|
187 |
-
self.accelerator.save(
|
|
|
|
|
188 |
|
189 |
def load_checkpoint(self):
|
190 |
if (
|
191 |
not exists(self.checkpoint_path)
|
192 |
or not os.path.exists(self.checkpoint_path)
|
193 |
-
or not any(
|
|
|
|
|
|
|
194 |
):
|
195 |
return 0
|
196 |
|
@@ -203,21 +213,32 @@ class Trainer:
|
|
203 |
key=lambda x: int("".join(filter(str.isdigit, x))),
|
204 |
)[-1]
|
205 |
|
206 |
-
print(f
|
207 |
|
208 |
# checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", map_location=self.accelerator.device) # rather use accelerator.load_state ಥ_ಥ
|
209 |
-
checkpoint = torch.load(
|
|
|
|
|
|
|
|
|
210 |
|
211 |
-
print(f
|
212 |
|
213 |
if "step" in checkpoint:
|
214 |
# patch for backward compatibility, 305e3ea
|
215 |
-
for key in [
|
|
|
|
|
|
|
216 |
if key in checkpoint["model_state_dict"]:
|
217 |
del checkpoint["model_state_dict"][key]
|
218 |
|
219 |
-
self.accelerator.unwrap_model(self.model).load_state_dict(
|
220 |
-
|
|
|
|
|
|
|
|
|
221 |
if self.scheduler:
|
222 |
self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
|
223 |
step = checkpoint["step"]
|
@@ -227,17 +248,18 @@ class Trainer:
|
|
227 |
for k, v in checkpoint["ema_model_state_dict"].items()
|
228 |
if k not in ["initted", "step"]
|
229 |
}
|
230 |
-
self.accelerator.unwrap_model(self.model).load_state_dict(
|
|
|
|
|
231 |
step = 0
|
232 |
-
|
233 |
del checkpoint
|
234 |
gc.collect()
|
235 |
|
236 |
-
print(f
|
237 |
|
238 |
return step
|
239 |
|
240 |
-
|
241 |
def validate(self, valid_dataloader, global_step):
|
242 |
"""
|
243 |
Runs evaluation on the validation set, computes the average loss,
|
@@ -251,54 +273,61 @@ class Trainer:
|
|
251 |
with torch.no_grad():
|
252 |
for batch in valid_dataloader:
|
253 |
# Inputs
|
254 |
-
mel = batch[
|
255 |
-
text = batch[
|
256 |
|
257 |
if self.process_token_to_id:
|
258 |
text_ids = list_str_to_idx(text, self.vocab_char_map).to(mel.device)
|
259 |
-
text_ids = text_ids.masked_fill(text_ids
|
260 |
else:
|
261 |
text_ids = text
|
262 |
|
263 |
# Targets
|
264 |
-
mel_lengths = batch[
|
265 |
tar_lengths = calculate_remaining_lengths(mel_lengths)
|
266 |
predictions = self.model(text_ids=text_ids, mel=mel)
|
267 |
|
268 |
-
if self.loss_fn ==
|
269 |
est_lengths = predictions
|
270 |
loss = masked_l1_loss(
|
271 |
est_lengths=est_lengths, tar_lengths=tar_lengths
|
272 |
)
|
273 |
frame_error = loss
|
274 |
|
275 |
-
elif self.loss_fn ==
|
276 |
-
tar_length_labels = (tar_lengths // self.n_frame_per_class)
|
277 |
-
|
|
|
278 |
est_length_logtis = predictions
|
279 |
est_length_labels = torch.argmax(est_length_logtis, dim=-1)
|
280 |
loss = masked_cross_entropy_loss(
|
281 |
-
est_length_logits=est_length_logtis,
|
|
|
282 |
)
|
283 |
est_lengths = est_length_labels * self.n_frame_per_class
|
284 |
frame_error = masked_l1_loss(
|
285 |
est_lengths=est_lengths, tar_lengths=tar_lengths
|
286 |
)
|
287 |
|
288 |
-
elif self.loss_fn ==
|
289 |
-
tar_length_labels = (tar_lengths // self.n_frame_per_class)
|
290 |
-
|
|
|
291 |
est_length_logtis = predictions
|
292 |
est_length_1hots = F.gumbel_softmax(
|
293 |
est_length_logtis, tau=self.gumbel_tau, hard=True, dim=-1
|
294 |
)
|
295 |
-
length_values =
|
296 |
-
|
297 |
-
|
|
|
|
|
|
|
298 |
est_lengths = (est_length_1hots * length_values).sum(-1)
|
299 |
|
300 |
loss_CE = masked_cross_entropy_loss(
|
301 |
-
est_length_logits=est_length_logtis,
|
|
|
302 |
)
|
303 |
|
304 |
loss_L1 = masked_l1_loss(
|
@@ -321,18 +350,19 @@ class Trainer:
|
|
321 |
avg_valid_sec_error = total_sec_error / count if count > 0 else 0.0
|
322 |
# Log validation metrics
|
323 |
self.accelerator.log(
|
324 |
-
{
|
325 |
-
|
326 |
-
f"valid_sec_error": avg_valid_sec_error
|
327 |
-
},
|
328 |
-
step=global_step
|
329 |
)
|
330 |
-
|
331 |
-
self.model.train()
|
332 |
|
|
|
333 |
|
334 |
-
def train(
|
335 |
-
|
|
|
|
|
|
|
|
|
|
|
336 |
if exists(resumable_with_seed):
|
337 |
generator = torch.Generator()
|
338 |
generator.manual_seed(resumable_with_seed)
|
@@ -366,7 +396,11 @@ class Trainer:
|
|
366 |
|
367 |
sampler = SequentialSampler(train_dataset)
|
368 |
batch_sampler = DynamicBatchSampler(
|
369 |
-
sampler,
|
|
|
|
|
|
|
|
|
370 |
)
|
371 |
train_dataloader = DataLoader(
|
372 |
train_dataset,
|
@@ -379,20 +413,26 @@ class Trainer:
|
|
379 |
|
380 |
sampler = SequentialSampler(valid_dataset)
|
381 |
batch_sampler = DynamicBatchSampler(
|
382 |
-
sampler,
|
|
|
|
|
|
|
|
|
383 |
)
|
384 |
# Create validation dataloader (always sequential, no shuffling)
|
385 |
valid_dataloader = DataLoader(
|
386 |
valid_dataset,
|
387 |
collate_fn=collate_fn,
|
388 |
num_workers=num_workers,
|
389 |
-
pin_memory=True,
|
390 |
persistent_workers=True,
|
391 |
batch_sampler=batch_sampler,
|
392 |
)
|
393 |
else:
|
394 |
-
raise ValueError(
|
395 |
-
|
|
|
|
|
396 |
# accelerator.prepare() dispatches batches to devices;
|
397 |
# which means the length of dataloader calculated before, should consider the number of devices
|
398 |
warmup_steps = (
|
@@ -401,10 +441,16 @@ class Trainer:
|
|
401 |
# otherwise by default with split_batches=False, warmup steps change with num_processes
|
402 |
total_steps = len(train_dataloader) * self.epochs / self.grad_accumulation_steps
|
403 |
decay_steps = total_steps - warmup_steps
|
404 |
-
warmup_scheduler = LinearLR(
|
405 |
-
|
|
|
|
|
|
|
|
|
406 |
self.scheduler = SequentialLR(
|
407 |
-
self.optimizer,
|
|
|
|
|
408 |
)
|
409 |
train_dataloader, self.scheduler = self.accelerator.prepare(
|
410 |
train_dataloader, self.scheduler
|
@@ -418,7 +464,9 @@ class Trainer:
|
|
418 |
orig_epoch_step = len(train_dataloader)
|
419 |
skipped_epoch = int(start_step // orig_epoch_step)
|
420 |
skipped_batch = start_step % orig_epoch_step
|
421 |
-
skipped_dataloader = self.accelerator.skip_first_batches(
|
|
|
|
|
422 |
else:
|
423 |
skipped_epoch = 0
|
424 |
|
@@ -444,21 +492,23 @@ class Trainer:
|
|
444 |
for batch in progress_bar:
|
445 |
with self.accelerator.accumulate(self.model):
|
446 |
# Inputs
|
447 |
-
mel = batch[
|
448 |
-
text = batch[
|
449 |
|
450 |
if self.process_token_to_id:
|
451 |
-
text_ids = list_str_to_idx(text, self.vocab_char_map).to(
|
452 |
-
|
|
|
|
|
453 |
else:
|
454 |
text_ids = text
|
455 |
|
456 |
# Targets
|
457 |
-
mel_lengths = batch[
|
458 |
tar_lengths = calculate_remaining_lengths(mel_lengths)
|
459 |
predictions = self.model(text_ids=text_ids, mel=mel)
|
460 |
|
461 |
-
if self.loss_fn ==
|
462 |
est_lengths = predictions
|
463 |
loss = masked_l1_loss(
|
464 |
est_lengths=est_lengths, tar_lengths=tar_lengths
|
@@ -469,19 +519,23 @@ class Trainer:
|
|
469 |
sec_error = frame_error * 256 / 24000
|
470 |
|
471 |
log_dict = {
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
elif self.loss_fn ==
|
479 |
-
tar_length_labels = (
|
480 |
-
|
|
|
|
|
|
|
481 |
est_length_logtis = predictions
|
482 |
est_length_labels = torch.argmax(est_length_logtis, dim=-1)
|
483 |
loss = masked_cross_entropy_loss(
|
484 |
-
est_length_logits=est_length_logtis,
|
|
|
485 |
)
|
486 |
with torch.no_grad():
|
487 |
est_lengths = est_length_labels * self.n_frame_per_class
|
@@ -491,29 +545,36 @@ class Trainer:
|
|
491 |
sec_error = frame_error * 256 / 24000
|
492 |
|
493 |
log_dict = {
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
elif self.loss_fn ==
|
501 |
-
tar_length_labels = (
|
502 |
-
|
|
|
|
|
|
|
503 |
est_length_logtis = predictions
|
504 |
est_length_1hots = F.gumbel_softmax(
|
505 |
est_length_logtis, tau=self.gumbel_tau, hard=True, dim=-1
|
506 |
)
|
507 |
-
length_values =
|
508 |
-
|
509 |
-
|
|
|
|
|
|
|
510 |
est_lengths = (est_length_1hots * length_values).sum(-1)
|
511 |
|
512 |
loss_CE = masked_cross_entropy_loss(
|
513 |
-
est_length_logits=est_length_logtis,
|
|
|
514 |
)
|
515 |
|
516 |
-
loss_L1 =
|
517 |
est_lengths=est_lengths, tar_lengths=tar_lengths
|
518 |
)
|
519 |
|
@@ -524,21 +585,22 @@ class Trainer:
|
|
524 |
sec_error = frame_error * 256 / 24000
|
525 |
|
526 |
log_dict = {
|
527 |
-
|
528 |
-
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
}
|
533 |
|
534 |
else:
|
535 |
raise NotImplementedError(self.loss_fn)
|
536 |
|
537 |
-
|
538 |
self.accelerator.backward(loss)
|
539 |
|
540 |
if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
|
541 |
-
self.accelerator.clip_grad_norm_(
|
|
|
|
|
542 |
|
543 |
self.optimizer.step()
|
544 |
self.scheduler.step()
|
@@ -550,7 +612,10 @@ class Trainer:
|
|
550 |
self.accelerator.log(log_dict, step=global_step)
|
551 |
progress_bar.set_postfix(step=str(global_step), loss=loss.item())
|
552 |
|
553 |
-
if
|
|
|
|
|
|
|
554 |
self.save_checkpoint(global_step)
|
555 |
# if self.log_samples and self.accelerator.is_local_main_process:
|
556 |
# Run validation at the end of each epoch (only on the main process)
|
|
|
1 |
from __future__ import annotations
|
2 |
|
3 |
import gc
|
|
|
|
|
4 |
import math
|
5 |
+
import os
|
6 |
|
7 |
import torch
|
8 |
+
import torch.nn.functional as F
|
9 |
import torchaudio
|
10 |
import wandb
|
11 |
from accelerate import Accelerator
|
|
|
13 |
from ema_pytorch import EMA
|
14 |
from torch.optim import AdamW
|
15 |
from torch.optim.lr_scheduler import LinearLR, SequentialLR
|
16 |
+
from torch.utils.data import Dataset # <-- Added Subset import
|
17 |
+
from torch.utils.data import DataLoader, SequentialSampler, Subset
|
18 |
from tqdm import tqdm
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
from duration_predictor import calculate_remaining_lengths
|
21 |
+
from f5_tts.model import CFM
|
22 |
+
from f5_tts.model.dataset import DynamicBatchSampler, collate_fn
|
23 |
+
from f5_tts.model.utils import (default, exists, lens_to_mask, list_str_to_idx,
|
24 |
+
list_str_to_tensor, mask_from_frac_lengths)
|
25 |
|
26 |
# trainer
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
SAMPLE_RATE = 24_000
|
30 |
|
31 |
|
32 |
def masked_l1_loss(est_lengths, tar_lengths):
|
33 |
+
first_zero_idx = (tar_lengths == 0).int().argmax(dim=1)
|
34 |
B, L = tar_lengths.shape
|
35 |
+
range_tensor = torch.arange(L, device=tar_lengths.device).expand(B, L)
|
36 |
mask = range_tensor <= first_zero_idx[:, None] # Include the first 0
|
37 |
+
loss = F.l1_loss(est_lengths, tar_lengths, reduction="none") # (B, L)
|
38 |
loss = loss * mask # Zero out ignored positions
|
39 |
loss = loss.sum() / mask.sum() # Normalize by valid elements
|
40 |
return loss
|
|
|
46 |
range_tensor = torch.arange(L, device=tar_length_labels.device).expand(B, L)
|
47 |
mask = range_tensor <= first_zero_idx[:, None] # Include the first 0
|
48 |
loss = F.cross_entropy(
|
49 |
+
est_length_logits.reshape(-1, est_length_logits.size(-1)),
|
50 |
+
tar_length_labels.reshape(-1),
|
51 |
+
reduction="none",
|
52 |
).reshape(B, L)
|
53 |
loss = loss * mask
|
54 |
loss = loss.sum() / mask.sum()
|
|
|
62 |
vocab_size,
|
63 |
vocab_char_map,
|
64 |
process_token_to_id=True,
|
65 |
+
loss_fn="L1",
|
66 |
lambda_L1=1,
|
67 |
gumbel_tau=0.5,
|
68 |
n_class=301,
|
|
|
101 |
self.logger = logger
|
102 |
if self.logger == "wandb":
|
103 |
if exists(wandb_resume_id):
|
104 |
+
init_kwargs = {
|
105 |
+
"wandb": {
|
106 |
+
"resume": "allow",
|
107 |
+
"name": wandb_run_name,
|
108 |
+
"id": wandb_resume_id,
|
109 |
+
}
|
110 |
+
}
|
111 |
else:
|
112 |
init_kwargs = {"wandb": {"resume": "allow", "name": wandb_run_name}}
|
113 |
|
|
|
136 |
self.vocab_size = vocab_size
|
137 |
self.vocab_char_map = vocab_char_map
|
138 |
self.process_token_to_id = process_token_to_id
|
139 |
+
assert loss_fn in ["L1", "CE", "L1_and_CE"]
|
140 |
self.loss_fn = loss_fn
|
141 |
self.lambda_L1 = lambda_L1
|
142 |
self.n_class = n_class
|
|
|
146 |
self.epochs = epochs
|
147 |
self.num_warmup_updates = num_warmup_updates
|
148 |
self.save_per_updates = save_per_updates
|
149 |
+
self.last_per_steps = default(
|
150 |
+
last_per_steps, save_per_updates * grad_accumulation_steps
|
151 |
+
)
|
152 |
self.checkpoint_path = default(checkpoint_path, "ckpts/test_e2-tts")
|
153 |
|
154 |
self.batch_size = batch_size
|
|
|
163 |
self.optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate)
|
164 |
else:
|
165 |
self.optimizer = AdamW(model.parameters(), lr=learning_rate)
|
166 |
+
self.model, self.optimizer = self.accelerator.prepare(
|
167 |
+
self.model, self.optimizer
|
168 |
+
)
|
169 |
+
|
170 |
@property
|
171 |
def is_main(self):
|
172 |
return self.accelerator.is_main_process
|
173 |
|
174 |
def save_checkpoint(self, step, last=False):
|
175 |
self.accelerator.wait_for_everyone()
|
176 |
+
if self.is_main:
|
177 |
checkpoint = dict(
|
178 |
model_state_dict=self.accelerator.unwrap_model(self.model).state_dict(),
|
179 |
+
optimizer_state_dict=self.accelerator.unwrap_model(
|
180 |
+
self.optimizer
|
181 |
+
).state_dict(),
|
182 |
scheduler_state_dict=self.scheduler.state_dict(),
|
183 |
step=step,
|
184 |
)
|
185 |
if not os.path.exists(self.checkpoint_path):
|
186 |
os.makedirs(self.checkpoint_path)
|
187 |
if last:
|
188 |
+
self.accelerator.save(
|
189 |
+
checkpoint, f"{self.checkpoint_path}/model_last.pt"
|
190 |
+
)
|
191 |
else:
|
192 |
+
self.accelerator.save(
|
193 |
+
checkpoint, f"{self.checkpoint_path}/model_{step}.pt"
|
194 |
+
)
|
195 |
|
196 |
def load_checkpoint(self):
|
197 |
if (
|
198 |
not exists(self.checkpoint_path)
|
199 |
or not os.path.exists(self.checkpoint_path)
|
200 |
+
or not any(
|
201 |
+
filename.endswith(".pt")
|
202 |
+
for filename in os.listdir(self.checkpoint_path)
|
203 |
+
)
|
204 |
):
|
205 |
return 0
|
206 |
|
|
|
213 |
key=lambda x: int("".join(filter(str.isdigit, x))),
|
214 |
)[-1]
|
215 |
|
216 |
+
print(f"To load from {latest_checkpoint}.")
|
217 |
|
218 |
# checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", map_location=self.accelerator.device) # rather use accelerator.load_state ಥ_ಥ
|
219 |
+
checkpoint = torch.load(
|
220 |
+
f"{self.checkpoint_path}/{latest_checkpoint}",
|
221 |
+
weights_only=True,
|
222 |
+
map_location="cpu",
|
223 |
+
)
|
224 |
|
225 |
+
print(f"Loaded from {latest_checkpoint}.")
|
226 |
|
227 |
if "step" in checkpoint:
|
228 |
# patch for backward compatibility, 305e3ea
|
229 |
+
for key in [
|
230 |
+
"mel_spec.mel_stft.mel_scale.fb",
|
231 |
+
"mel_spec.mel_stft.spectrogram.window",
|
232 |
+
]:
|
233 |
if key in checkpoint["model_state_dict"]:
|
234 |
del checkpoint["model_state_dict"][key]
|
235 |
|
236 |
+
self.accelerator.unwrap_model(self.model).load_state_dict(
|
237 |
+
checkpoint["model_state_dict"]
|
238 |
+
)
|
239 |
+
self.accelerator.unwrap_model(self.optimizer).load_state_dict(
|
240 |
+
checkpoint["optimizer_state_dict"]
|
241 |
+
)
|
242 |
if self.scheduler:
|
243 |
self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
|
244 |
step = checkpoint["step"]
|
|
|
248 |
for k, v in checkpoint["ema_model_state_dict"].items()
|
249 |
if k not in ["initted", "step"]
|
250 |
}
|
251 |
+
self.accelerator.unwrap_model(self.model).load_state_dict(
|
252 |
+
checkpoint["model_state_dict"]
|
253 |
+
)
|
254 |
step = 0
|
255 |
+
|
256 |
del checkpoint
|
257 |
gc.collect()
|
258 |
|
259 |
+
print(f"Exit load_checkpoint.")
|
260 |
|
261 |
return step
|
262 |
|
|
|
263 |
def validate(self, valid_dataloader, global_step):
|
264 |
"""
|
265 |
Runs evaluation on the validation set, computes the average loss,
|
|
|
273 |
with torch.no_grad():
|
274 |
for batch in valid_dataloader:
|
275 |
# Inputs
|
276 |
+
mel = batch["mel"].permute(0, 2, 1) # (B, L_mel, D)
|
277 |
+
text = batch["text"]
|
278 |
|
279 |
if self.process_token_to_id:
|
280 |
text_ids = list_str_to_idx(text, self.vocab_char_map).to(mel.device)
|
281 |
+
text_ids = text_ids.masked_fill(text_ids == -1, self.vocab_size)
|
282 |
else:
|
283 |
text_ids = text
|
284 |
|
285 |
# Targets
|
286 |
+
mel_lengths = batch["mel_lengths"]
|
287 |
tar_lengths = calculate_remaining_lengths(mel_lengths)
|
288 |
predictions = self.model(text_ids=text_ids, mel=mel)
|
289 |
|
290 |
+
if self.loss_fn == "L1":
|
291 |
est_lengths = predictions
|
292 |
loss = masked_l1_loss(
|
293 |
est_lengths=est_lengths, tar_lengths=tar_lengths
|
294 |
)
|
295 |
frame_error = loss
|
296 |
|
297 |
+
elif self.loss_fn == "CE":
|
298 |
+
tar_length_labels = (tar_lengths // self.n_frame_per_class).clamp(
|
299 |
+
min=0, max=self.n_class - 1
|
300 |
+
) # [0, 1, ..., n_class-1]
|
301 |
est_length_logtis = predictions
|
302 |
est_length_labels = torch.argmax(est_length_logtis, dim=-1)
|
303 |
loss = masked_cross_entropy_loss(
|
304 |
+
est_length_logits=est_length_logtis,
|
305 |
+
tar_length_labels=tar_length_labels,
|
306 |
)
|
307 |
est_lengths = est_length_labels * self.n_frame_per_class
|
308 |
frame_error = masked_l1_loss(
|
309 |
est_lengths=est_lengths, tar_lengths=tar_lengths
|
310 |
)
|
311 |
|
312 |
+
elif self.loss_fn == "L1_and_CE":
|
313 |
+
tar_length_labels = (tar_lengths // self.n_frame_per_class).clamp(
|
314 |
+
min=0, max=self.n_class - 1
|
315 |
+
) # [0, 1, ..., n_class-1]
|
316 |
est_length_logtis = predictions
|
317 |
est_length_1hots = F.gumbel_softmax(
|
318 |
est_length_logtis, tau=self.gumbel_tau, hard=True, dim=-1
|
319 |
)
|
320 |
+
length_values = (
|
321 |
+
torch.arange(
|
322 |
+
self.n_class, device=est_length_1hots.device
|
323 |
+
).float()
|
324 |
+
* self.n_frame_per_class
|
325 |
+
)
|
326 |
est_lengths = (est_length_1hots * length_values).sum(-1)
|
327 |
|
328 |
loss_CE = masked_cross_entropy_loss(
|
329 |
+
est_length_logits=est_length_logtis,
|
330 |
+
tar_length_labels=tar_length_labels,
|
331 |
)
|
332 |
|
333 |
loss_L1 = masked_l1_loss(
|
|
|
350 |
avg_valid_sec_error = total_sec_error / count if count > 0 else 0.0
|
351 |
# Log validation metrics
|
352 |
self.accelerator.log(
|
353 |
+
{f"valid_loss": avg_valid_loss, f"valid_sec_error": avg_valid_sec_error},
|
354 |
+
step=global_step,
|
|
|
|
|
|
|
355 |
)
|
|
|
|
|
356 |
|
357 |
+
self.model.train()
|
358 |
|
359 |
+
def train(
|
360 |
+
self,
|
361 |
+
train_dataset: Dataset,
|
362 |
+
valid_dataset: Dataset,
|
363 |
+
num_workers=64,
|
364 |
+
resumable_with_seed: int = None,
|
365 |
+
):
|
366 |
if exists(resumable_with_seed):
|
367 |
generator = torch.Generator()
|
368 |
generator.manual_seed(resumable_with_seed)
|
|
|
396 |
|
397 |
sampler = SequentialSampler(train_dataset)
|
398 |
batch_sampler = DynamicBatchSampler(
|
399 |
+
sampler,
|
400 |
+
self.batch_size,
|
401 |
+
max_samples=self.max_samples,
|
402 |
+
random_seed=resumable_with_seed,
|
403 |
+
drop_last=False,
|
404 |
)
|
405 |
train_dataloader = DataLoader(
|
406 |
train_dataset,
|
|
|
413 |
|
414 |
sampler = SequentialSampler(valid_dataset)
|
415 |
batch_sampler = DynamicBatchSampler(
|
416 |
+
sampler,
|
417 |
+
self.batch_size,
|
418 |
+
max_samples=self.max_samples,
|
419 |
+
random_seed=resumable_with_seed,
|
420 |
+
drop_last=False,
|
421 |
)
|
422 |
# Create validation dataloader (always sequential, no shuffling)
|
423 |
valid_dataloader = DataLoader(
|
424 |
valid_dataset,
|
425 |
collate_fn=collate_fn,
|
426 |
num_workers=num_workers,
|
427 |
+
pin_memory=True,
|
428 |
persistent_workers=True,
|
429 |
batch_sampler=batch_sampler,
|
430 |
)
|
431 |
else:
|
432 |
+
raise ValueError(
|
433 |
+
f"batch_size_type must be either 'sample' or 'frame', but received {self.batch_size_type}"
|
434 |
+
)
|
435 |
+
|
436 |
# accelerator.prepare() dispatches batches to devices;
|
437 |
# which means the length of dataloader calculated before, should consider the number of devices
|
438 |
warmup_steps = (
|
|
|
441 |
# otherwise by default with split_batches=False, warmup steps change with num_processes
|
442 |
total_steps = len(train_dataloader) * self.epochs / self.grad_accumulation_steps
|
443 |
decay_steps = total_steps - warmup_steps
|
444 |
+
warmup_scheduler = LinearLR(
|
445 |
+
self.optimizer, start_factor=1e-8, end_factor=1.0, total_iters=warmup_steps
|
446 |
+
)
|
447 |
+
decay_scheduler = LinearLR(
|
448 |
+
self.optimizer, start_factor=1.0, end_factor=1e-8, total_iters=decay_steps
|
449 |
+
)
|
450 |
self.scheduler = SequentialLR(
|
451 |
+
self.optimizer,
|
452 |
+
schedulers=[warmup_scheduler, decay_scheduler],
|
453 |
+
milestones=[warmup_steps],
|
454 |
)
|
455 |
         train_dataloader, self.scheduler = self.accelerator.prepare(
             train_dataloader, self.scheduler
 ...
             orig_epoch_step = len(train_dataloader)
             skipped_epoch = int(start_step // orig_epoch_step)
             skipped_batch = start_step % orig_epoch_step
+            skipped_dataloader = self.accelerator.skip_first_batches(
+                train_dataloader, num_batches=skipped_batch
+            )
         else:
             skipped_epoch = 0
 ...
             for batch in progress_bar:
                 with self.accelerator.accumulate(self.model):
                     # Inputs
+                    mel = batch["mel"].permute(0, 2, 1)  # (B, L_mel, D)
+                    text = batch["text"]

                     if self.process_token_to_id:
+                        text_ids = list_str_to_idx(text, self.vocab_char_map).to(
+                            mel.device
+                        )
+                        text_ids = text_ids.masked_fill(text_ids == -1, self.vocab_size)
                     else:
                         text_ids = text

                     # Targets
+                    mel_lengths = batch["mel_lengths"]
                     tar_lengths = calculate_remaining_lengths(mel_lengths)
                     predictions = self.model(text_ids=text_ids, mel=mel)

+                    if self.loss_fn == "L1":
                         est_lengths = predictions
                         loss = masked_l1_loss(
                             est_lengths=est_lengths, tar_lengths=tar_lengths
 ...
                         sec_error = frame_error * 256 / 24000

                         log_dict = {
+                            "loss": loss.item(),
+                            "loss_L1": loss.item(),
+                            "sec_error": sec_error.item(),
+                            "lr": self.scheduler.get_last_lr()[0],
+                        }
+
+                    elif self.loss_fn == "CE":
+                        tar_length_labels = (
+                            tar_lengths // self.n_frame_per_class
+                        ).clamp(
+                            min=0, max=self.n_class - 1
+                        )  # [0, 1, ..., n_class-1]
                         est_length_logtis = predictions
                         est_length_labels = torch.argmax(est_length_logtis, dim=-1)
                         loss = masked_cross_entropy_loss(
+                            est_length_logits=est_length_logtis,
+                            tar_length_labels=tar_length_labels,
                         )
                         with torch.no_grad():
                             est_lengths = est_length_labels * self.n_frame_per_class
 ...
                         sec_error = frame_error * 256 / 24000

                         log_dict = {
+                            "loss": loss.item(),
+                            "loss_CE": loss.item(),
+                            "sec_error": sec_error.item(),
+                            "lr": self.scheduler.get_last_lr()[0],
+                        }
+
+                    elif self.loss_fn == "L1_and_CE":
+                        tar_length_labels = (
+                            tar_lengths // self.n_frame_per_class
+                        ).clamp(
+                            min=0, max=self.n_class - 1
+                        )  # [0, 1, ..., n_class-1]
                         est_length_logtis = predictions
                         est_length_1hots = F.gumbel_softmax(
                             est_length_logtis, tau=self.gumbel_tau, hard=True, dim=-1
                         )
+                        length_values = (
+                            torch.arange(
+                                self.n_class, device=est_length_1hots.device
+                            ).float()
+                            * self.n_frame_per_class
+                        )
                         est_lengths = (est_length_1hots * length_values).sum(-1)

                         loss_CE = masked_cross_entropy_loss(
+                            est_length_logits=est_length_logtis,
+                            tar_length_labels=tar_length_labels,
                         )

+                        loss_L1 = masked_l1_loss(
                             est_lengths=est_lengths, tar_lengths=tar_lengths
                         )
 ...
                         sec_error = frame_error * 256 / 24000

                         log_dict = {
+                            "loss": loss.item(),
+                            "loss_L1": loss_L1.item(),
+                            "loss_CE": loss_CE.item(),
+                            "sec_error": sec_error.item(),
+                            "lr": self.scheduler.get_last_lr()[0],
                         }

                     else:
                         raise NotImplementedError(self.loss_fn)

                 self.accelerator.backward(loss)

                 if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
+                    self.accelerator.clip_grad_norm_(
+                        self.model.parameters(), self.max_grad_norm
+                    )

                 self.optimizer.step()
                 self.scheduler.step()
 ...
                 self.accelerator.log(log_dict, step=global_step)
                 progress_bar.set_postfix(step=str(global_step), loss=loss.item())

+                if (
+                    global_step % (self.save_per_updates * self.grad_accumulation_steps)
+                    == 0
+                ):
                     self.save_checkpoint(global_step)
                 # if self.log_samples and self.accelerator.is_local_main_process:
                 # Run validation at the end of each epoch (only on the main process)
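The CE branch above quantizes target mel lengths into duration classes and reports the error back in seconds. A minimal standalone sketch of that bookkeeping (not part of the commit; `n_frame_per_class`, the 256-sample hop and 24 kHz rate follow the values used in the diff, and the function name is illustrative):

# Standalone sketch of the CE duration bookkeeping, under the assumptions above.
import torch
import torch.nn.functional as F

def ce_duration_loss(logits: torch.Tensor, tar_lengths: torch.Tensor,
                     n_class: int = 301, n_frame_per_class: int = 10):
    # logits: (B, n_class) class scores; tar_lengths: (B,) target lengths in mel frames
    labels = (tar_lengths // n_frame_per_class).clamp(min=0, max=n_class - 1)
    loss = F.cross_entropy(logits, labels)
    # map the predicted class back to a frame count, then to seconds (hop 256 @ 24 kHz)
    est_lengths = torch.argmax(logits, dim=-1) * n_frame_per_class
    sec_error = (est_lengths.float() - tar_lengths.float()).abs().mean() * 256 / 24000
    return loss, sec_error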
duration_trainer_with_prompt.py
CHANGED
@@ -1,11 +1,11 @@
 from __future__ import annotations

 import gc
-import os
-
 import math
+import os

 import torch
+import torch.nn.functional as F
 import torchaudio
 import wandb
 from accelerate import Accelerator
@@ -13,25 +13,17 @@ from accelerate.utils import DistributedDataParallelKwargs
 from ema_pytorch import EMA
 from torch.optim import AdamW
 from torch.optim.lr_scheduler import LinearLR, SequentialLR
-from torch.utils.data import
+from torch.utils.data import Dataset  # <-- Added Subset import
+from torch.utils.data import DataLoader, SequentialSampler, Subset
 from tqdm import tqdm

-import torch.nn.functional as F
-
 from f5_tts.model import CFM
-from f5_tts.model.dataset import
-from f5_tts.model.utils import default, exists
+from f5_tts.model.dataset import DynamicBatchSampler, collate_fn
+from f5_tts.model.utils import (default, exists, lens_to_mask, list_str_to_idx,
+                                list_str_to_tensor, mask_from_frac_lengths)

 # trainer

-from f5_tts.model.utils import (
-    default,
-    exists,
-    list_str_to_idx,
-    list_str_to_tensor,
-    lens_to_mask,
-    mask_from_frac_lengths,
-)
-
 SAMPLE_RATE = 24_000
@@ -43,7 +35,7 @@ class Trainer:
         vocab_size,
         vocab_char_map,
         process_token_to_id=True,
-        loss_fn=
+        loss_fn="L1",
         lambda_L1=1,
         gumbel_tau=0.5,
         n_class=301,
@@ -83,7 +75,13 @@ class Trainer:
         self.logger = logger
         if self.logger == "wandb":
             if exists(wandb_resume_id):
-                init_kwargs = {
+                init_kwargs = {
+                    "wandb": {
+                        "resume": "allow",
+                        "name": wandb_run_name,
+                        "id": wandb_resume_id,
+                    }
+                }
             else:
                 init_kwargs = {"wandb": {"resume": "allow", "name": wandb_run_name}}
@@ -112,7 +110,7 @@ class Trainer:
         self.vocab_size = vocab_size
         self.vocab_char_map = vocab_char_map
         self.process_token_to_id = process_token_to_id
-        assert loss_fn in [
+        assert loss_fn in ["L1", "CE", "L1_and_CE"]
         self.loss_fn = loss_fn
         self.lambda_L1 = lambda_L1
         self.n_class = n_class
@@ -122,7 +120,9 @@ class Trainer:
         self.epochs = epochs
         self.num_warmup_updates = num_warmup_updates
         self.save_per_updates = save_per_updates
-        self.last_per_steps = default(
+        self.last_per_steps = default(
+            last_per_steps, save_per_updates * grad_accumulation_steps
+        )
         self.checkpoint_path = default(checkpoint_path, "ckpts/test_e2-tts")

         self.batch_size = batch_size
@@ -137,33 +137,44 @@ class Trainer:
             self.optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate)
         else:
             self.optimizer = AdamW(model.parameters(), lr=learning_rate)
-        self.model, self.optimizer = self.accelerator.prepare(
+        self.model, self.optimizer = self.accelerator.prepare(
+            self.model, self.optimizer
+        )
+
     @property
     def is_main(self):
         return self.accelerator.is_main_process

     def save_checkpoint(self, step, last=False):
         self.accelerator.wait_for_everyone()
         if self.is_main:
             checkpoint = dict(
                 model_state_dict=self.accelerator.unwrap_model(self.model).state_dict(),
-                optimizer_state_dict=self.accelerator.unwrap_model(
+                optimizer_state_dict=self.accelerator.unwrap_model(
+                    self.optimizer
+                ).state_dict(),
                 scheduler_state_dict=self.scheduler.state_dict(),
                 step=step,
             )
             if not os.path.exists(self.checkpoint_path):
                 os.makedirs(self.checkpoint_path)
             if last:
-                self.accelerator.save(
+                self.accelerator.save(
+                    checkpoint, f"{self.checkpoint_path}/model_last.pt"
+                )
             else:
-                self.accelerator.save(
+                self.accelerator.save(
+                    checkpoint, f"{self.checkpoint_path}/model_{step}.pt"
+                )

     def load_checkpoint(self):
         if (
             not exists(self.checkpoint_path)
             or not os.path.exists(self.checkpoint_path)
-            or not any(
+            or not any(
+                filename.endswith(".pt")
+                for filename in os.listdir(self.checkpoint_path)
+            )
         ):
             return 0
@@ -176,21 +187,32 @@ class Trainer:
             key=lambda x: int("".join(filter(str.isdigit, x))),
         )[-1]

-        print(f
+        print(f"To load from {latest_checkpoint}.")

         # checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", map_location=self.accelerator.device)  # rather use accelerator.load_state ಥ_ಥ
-        checkpoint = torch.load(
+        checkpoint = torch.load(
+            f"{self.checkpoint_path}/{latest_checkpoint}",
+            weights_only=True,
+            map_location="cpu",
+        )

-        print(f
+        print(f"Loaded from {latest_checkpoint}.")

         if "step" in checkpoint:
             # patch for backward compatibility, 305e3ea
-            for key in [
+            for key in [
+                "mel_spec.mel_stft.mel_scale.fb",
+                "mel_spec.mel_stft.spectrogram.window",
+            ]:
                 if key in checkpoint["model_state_dict"]:
                     del checkpoint["model_state_dict"][key]

-            self.accelerator.unwrap_model(self.model).load_state_dict(
+            self.accelerator.unwrap_model(self.model).load_state_dict(
+                checkpoint["model_state_dict"]
+            )
+            self.accelerator.unwrap_model(self.optimizer).load_state_dict(
+                checkpoint["optimizer_state_dict"]
+            )
             if self.scheduler:
                 self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
             step = checkpoint["step"]
@@ -200,17 +222,18 @@ class Trainer:
                 for k, v in checkpoint["ema_model_state_dict"].items()
                 if k not in ["initted", "step"]
             }
-            self.accelerator.unwrap_model(self.model).load_state_dict(
+            self.accelerator.unwrap_model(self.model).load_state_dict(
+                checkpoint["model_state_dict"]
+            )
             step = 0

         del checkpoint
         gc.collect()

-        print(f
+        print(f"Exit load_checkpoint.")

         return step

     def validate(self, valid_dataloader, global_step):
         """
         Runs evaluation on the validation set, computes the average loss,
@@ -226,31 +249,40 @@ class Trainer:
         for batch in valid_dataloader:

             # Inputs
-            prompt_mel = batch[
-            prompt_text = batch[
-            text = batch[
+            prompt_mel = batch["pmt_mel_specs"].permute(0, 2, 1)  # (B, L_mel, D)
+            prompt_text = batch["pmt_text"]
+            text = batch["text"]

-            target_ids = list_str_to_idx(text, self.vocab_char_map).to(
+            target_ids = list_str_to_idx(text, self.vocab_char_map).to(
+                prompt_mel.device
+            )
+            target_ids = target_ids.masked_fill(target_ids == -1, vocab_size)

-            prompt_ids = list_str_to_idx(prompt_text, self.vocab_char_map).to(
+            prompt_ids = list_str_to_idx(prompt_text, self.vocab_char_map).to(
+                prompt_mel.device
+            )
+            prompt_ids = prompt_ids.masked_fill(prompt_ids == -1, vocab_size)

             # Targets
-            tar_lengths = batch[
+            tar_lengths = batch["mel_lengths"]

             # Forward
-            predictions = SLP(
+            predictions = SLP(
+                target_ids=target_ids, prompt_ids=prompt_ids, prompt_mel=prompt_mel
+            )  # (B, C)
+
+            if self.loss_fn == "CE":
+                tar_length_labels = (tar_lengths // self.n_frame_per_class).clamp(
+                    min=0, max=self.n_class - 1
+                )  # [0, 1, ..., n_class-1]
                 est_length_logtis = predictions
                 est_length_labels = torch.argmax(est_length_logtis, dim=-1)
                 loss = F.cross_entropy(est_length_logtis, tar_length_labels)

                 est_lengths = est_length_labels * self.n_frame_per_class
-                frame_error = (
+                frame_error = (
+                    (est_lengths.float() - tar_lengths.float()).abs().mean()
+                )
                 sec_error = frame_error * 256 / 24000

                 total_sec_error += sec_error.item()
@@ -262,18 +294,19 @@ class Trainer:

         # Log validation metrics
         self.accelerator.log(
-            {
-            f"valid_sec_error": avg_valid_sec_error
-            },
-            step=global_step
+            {f"valid_loss": avg_valid_loss, f"valid_sec_error": avg_valid_sec_error},
+            step=global_step,
         )

         self.model.train()

-    def train(
+    def train(
+        self,
+        train_dataset: Dataset,
+        valid_dataset: Dataset,
+        num_workers=64,
+        resumable_with_seed: int = None,
+    ):
         if exists(resumable_with_seed):
             generator = torch.Generator()
             generator.manual_seed(resumable_with_seed)
@@ -307,7 +340,11 @@ class Trainer:
             sampler = SequentialSampler(train_dataset)
             batch_sampler = DynamicBatchSampler(
-                sampler,
+                sampler,
+                self.batch_size,
+                max_samples=self.max_samples,
+                random_seed=resumable_with_seed,
+                drop_last=False,
             )
             train_dataloader = DataLoader(
                 train_dataset,
@@ -320,20 +357,26 @@ class Trainer:
             sampler = SequentialSampler(valid_dataset)
             batch_sampler = DynamicBatchSampler(
-                sampler,
+                sampler,
+                self.batch_size,
+                max_samples=self.max_samples,
+                random_seed=resumable_with_seed,
+                drop_last=False,
             )
             # Create validation dataloader (always sequential, no shuffling)
             valid_dataloader = DataLoader(
                 valid_dataset,
                 collate_fn=collate_fn,
                 num_workers=num_workers,
                 pin_memory=True,
                 persistent_workers=True,
                 batch_sampler=batch_sampler,
             )
         else:
-            raise ValueError(
+            raise ValueError(
+                f"batch_size_type must be either 'sample' or 'frame', but received {self.batch_size_type}"
+            )
+
         # accelerator.prepare() dispatches batches to devices;
         # which means the length of dataloader calculated before, should consider the number of devices
         warmup_steps = (
@@ -342,10 +385,16 @@ class Trainer:
         # otherwise by default with split_batches=False, warmup steps change with num_processes
         total_steps = len(train_dataloader) * self.epochs / self.grad_accumulation_steps
         decay_steps = total_steps - warmup_steps
-        warmup_scheduler = LinearLR(
+        warmup_scheduler = LinearLR(
+            self.optimizer, start_factor=1e-8, end_factor=1.0, total_iters=warmup_steps
+        )
+        decay_scheduler = LinearLR(
+            self.optimizer, start_factor=1.0, end_factor=1e-8, total_iters=decay_steps
+        )
         self.scheduler = SequentialLR(
-            self.optimizer,
+            self.optimizer,
+            schedulers=[warmup_scheduler, decay_scheduler],
+            milestones=[warmup_steps],
         )
         train_dataloader, self.scheduler = self.accelerator.prepare(
             train_dataloader, self.scheduler
@@ -359,7 +408,9 @@ class Trainer:
             orig_epoch_step = len(train_dataloader)
             skipped_epoch = int(start_step // orig_epoch_step)
             skipped_batch = start_step % orig_epoch_step
-            skipped_dataloader = self.accelerator.skip_first_batches(
+            skipped_dataloader = self.accelerator.skip_first_batches(
+                train_dataloader, num_batches=skipped_batch
+            )
         else:
             skipped_epoch = 0
@@ -385,49 +436,65 @@ class Trainer:
             for batch in progress_bar:
                 with self.accelerator.accumulate(self.model):
                     # Inputs
-                    prompt_mel = batch[
+                    prompt_mel = batch["pmt_mel_specs"].permute(
+                        0, 2, 1
+                    )  # (B, L_mel, D)
+                    prompt_text = batch["pmt_text"]
+                    text = batch["text"]
+
+                    target_ids = list_str_to_idx(text, self.vocab_char_map).to(
+                        prompt_mel.device
+                    )
+                    target_ids = target_ids.masked_fill(target_ids == -1, vocab_size)
+
+                    prompt_ids = list_str_to_idx(prompt_text, self.vocab_char_map).to(
+                        prompt_mel.device
+                    )
+                    prompt_ids = prompt_ids.masked_fill(prompt_ids == -1, vocab_size)

                     # Targets
-                    tar_lengths = batch[
+                    tar_lengths = batch["mel_lengths"]

                     # Forward
-                    predictions = SLP(
+                    predictions = SLP(
+                        target_ids=target_ids,
+                        prompt_ids=prompt_ids,
+                        prompt_mel=prompt_mel,
+                    )  # (B, C)
+
+                    if self.loss_fn == "CE":
+                        tar_length_labels = (
+                            tar_lengths // self.n_frame_per_class
+                        ).clamp(
+                            min=0, max=self.n_class - 1
+                        )  # [0, 1, ..., n_class-1]
                         est_length_logtis = predictions
                         est_length_labels = torch.argmax(est_length_logtis, dim=-1)
                         loss = F.cross_entropy(est_length_logtis, tar_length_labels)

                         with torch.no_grad():
                             est_lengths = est_length_labels * self.n_frame_per_class
-                            frame_error = (
+                            frame_error = (
+                                (est_lengths.float() - tar_lengths.float()).abs().mean()
+                            )
                             sec_error = frame_error * 256 / 24000

                         log_dict = {
+                            "loss": loss.item(),
+                            "loss_CE": loss.item(),
+                            "sec_error": sec_error.item(),
+                            "lr": self.scheduler.get_last_lr()[0],
+                        }

                     else:
                         raise NotImplementedError(self.loss_fn)

                 self.accelerator.backward(loss)

                 if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
-                    self.accelerator.clip_grad_norm_(
+                    self.accelerator.clip_grad_norm_(
+                        self.model.parameters(), self.max_grad_norm
+                    )

                 self.optimizer.step()
                 self.scheduler.step()
@@ -439,7 +506,10 @@ class Trainer:
                 self.accelerator.log(log_dict, step=global_step)
                 progress_bar.set_postfix(step=str(global_step), loss=loss.item())

-                if
+                if (
+                    global_step % (self.save_per_updates * self.grad_accumulation_steps)
+                    == 0
+                ):
                     self.save_checkpoint(global_step)
                 # if self.log_samples and self.accelerator.is_local_main_process:
                 # Run validation at the end of each epoch (only on the main process)
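The `masked_fill(ids == -1, vocab_size)` calls above assume a particular padding convention for the character-id tensors. An illustrative sketch of that convention (not the project's own `list_str_to_idx`; the helper name and use of `pad_sequence` are assumptions):

# Illustrative sketch: pad variable-length id lists with -1, then remap the padding
# index to vocab_size so it selects a dedicated filler embedding row.
import torch
from torch.nn.utils.rnn import pad_sequence

def to_padded_ids(token_lists, vocab_char_map, vocab_size):
    ids = [torch.tensor([vocab_char_map[c] for c in seq], dtype=torch.long)
           for seq in token_lists]
    batch = pad_sequence(ids, batch_first=True, padding_value=-1)  # (B, L_max)
    return batch.masked_fill(batch == -1, vocab_size)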
ecapa_tdnn.py
CHANGED
@@ -1,23 +1,34 @@
 # part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN

+# from ctcmodel_nopool import ConformerCTC as ConformerCTCNoPool
+from pathlib import Path
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torchaudio.transforms as trans
+
 from ctcmodel import ConformerCTC
-# from ctcmodel_nopool import ConformerCTC as ConformerCTCNoPool
-from pathlib import Path

+""" Res2Conv1d + BatchNorm1d + ReLU
+"""

 class Res2Conv1dReluBn(nn.Module):
+    """
     in_channels == out_channels == channels
+    """
+
+    def __init__(
+        self,
+        channels,
+        kernel_size=1,
+        stride=1,
+        padding=0,
+        dilation=1,
+        bias=True,
+        scale=4,
+    ):
         super().__init__()
         assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
         self.scale = scale
@@ -27,7 +38,17 @@ class Res2Conv1dReluBn(nn.Module):
         self.convs = []
         self.bns = []
         for i in range(self.nums):
-            self.convs.append(
+            self.convs.append(
+                nn.Conv1d(
+                    self.width,
+                    self.width,
+                    kernel_size,
+                    stride,
+                    padding,
+                    dilation,
+                    bias=bias,
+                )
+            )
             self.bns.append(nn.BatchNorm1d(self.width))
         self.convs = nn.ModuleList(self.convs)
         self.bns = nn.ModuleList(self.bns)
@@ -51,22 +72,33 @@ class Res2Conv1dReluBn(nn.Module):
         return out

+""" Conv1d + BatchNorm1d + ReLU
+"""

 class Conv1dReluBn(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=1,
+        stride=1,
+        padding=0,
+        dilation=1,
+        bias=True,
+    ):
         super().__init__()
-        self.conv = nn.Conv1d(
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias
+        )
         self.bn = nn.BatchNorm1d(out_channels)

     def forward(self, x):
         return self.bn(F.relu(self.conv(x)))

+""" The SE connection of 1D case.
+"""

 class SE_Connect(nn.Module):
@@ -84,15 +116,32 @@ class SE_Connect(nn.Module):
         return out

+""" SE-Res2Block of the ECAPA-TDNN architecture.
+"""

 class SE_Res2Block(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        scale,
+        se_bottleneck_dim,
+    ):
         super().__init__()
-        self.Conv1dReluBn1 = Conv1dReluBn(
+        self.Conv1dReluBn1 = Conv1dReluBn(
+            in_channels, out_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.Res2Conv1dReluBn = Res2Conv1dReluBn(
+            out_channels, kernel_size, stride, padding, dilation, scale=scale
+        )
+        self.Conv1dReluBn2 = Conv1dReluBn(
+            out_channels, out_channels, kernel_size=1, stride=1, padding=0
+        )
         self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)

         self.shortcut = None
@@ -116,8 +165,9 @@ class SE_Res2Block(nn.Module):
         return x + residual

+""" Attentive weighted mean and standard deviation pooling.
+"""

 class AttentiveStatsPool(nn.Module):
     def __init__(self, in_dim, attention_channels=128, global_context_att=False):
@@ -126,16 +176,24 @@ class AttentiveStatsPool(nn.Module):
         # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
         if global_context_att:
-            self.linear1 = nn.Conv1d(
+            self.linear1 = nn.Conv1d(
+                in_dim * 3, attention_channels, kernel_size=1
+            )  # equals W and b in the paper
         else:
-            self.linear1 = nn.Conv1d(
+            self.linear1 = nn.Conv1d(
+                in_dim, attention_channels, kernel_size=1
+            )  # equals W and b in the paper
+        self.linear2 = nn.Conv1d(
+            attention_channels, in_dim, kernel_size=1
+        )  # equals V and k in the paper

     def forward(self, x):

         if self.global_context_att:
             context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
-            context_std = torch.sqrt(
+            context_std = torch.sqrt(
+                torch.var(x, dim=-1, keepdim=True) + 1e-10
+            ).expand_as(x)
             x_in = torch.cat((x, context_mean, context_std), dim=1)
         else:
             x_in = x
@@ -145,42 +203,52 @@ class AttentiveStatsPool(nn.Module):
         # alpha = F.relu(self.linear1(x_in))
         alpha = torch.softmax(self.linear2(alpha), dim=2)
         mean = torch.sum(alpha * x, dim=2)
-        residuals = torch.sum(alpha * (x
+        residuals = torch.sum(alpha * (x**2), dim=2) - mean**2
         std = torch.sqrt(residuals.clamp(min=1e-9))
         return torch.cat([mean, std], dim=1)

 class ECAPA_TDNN(nn.Module):
-    def __init__(
+    def __init__(
+        self,
+        channels=512,
+        emb_dim=512,
+        global_context_att=False,
+        use_fp16=True,
         ctc_cls=ConformerCTC,
-        ctc_path=
-        ctc_args={
+        ctc_path="/data4/F5TTS/ckpts/F5TTS_norm_ASR_vocos_pinyin_Emilia_ZH_EN/model_last.pt",
+        ctc_args={
+            "vocab_size": 2545,
+            "mel_dim": 100,
+            "num_heads": 8,
+            "d_hid": 512,
+            "nlayers": 6,
+        },
+        ctc_no_grad=False,
     ):
         super().__init__()
         if ctc_path != None:
             ctc_path = Path(ctc_path)
             model = ctc_cls(**ctc_args)
-            state_dict = torch.load(ctc_path, map_location=
-            model.load_state_dict(state_dict[
+            state_dict = torch.load(ctc_path, map_location="cpu")
+            model.load_state_dict(state_dict["model_state_dict"])
             print(f"Initialized pretrained ConformerCTC backbone from {ctc_path}.")
         else:
             raise ValueError(ctc_path)

         self.ctc_model = model
         self.ctc_model.out.requires_grad_(False)
+
         if ctc_cls == ConformerCTC:
-            self.feat_num = ctc_args[
+            self.feat_num = ctc_args["nlayers"] + 2 + 1
         # elif ctc_cls == ConformerCTCNoPool:
         #     self.feat_num = ctc_args['nlayers'] + 1
         else:
             raise ValueError(ctc_cls)
-        feat_dim = ctc_args[
+        feat_dim = ctc_args["d_hid"]

         self.emb_dim = emb_dim
+
         self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))
         self.instance_norm = nn.InstanceNorm1d(feat_dim)
@@ -188,14 +256,45 @@ class ECAPA_TDNN(nn.Module):
         self.channels = [channels] * 4 + [1536]

         self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
-        self.layer2 = SE_Res2Block(
+        self.layer2 = SE_Res2Block(
+            self.channels[0],
+            self.channels[1],
+            kernel_size=3,
+            stride=1,
+            padding=2,
+            dilation=2,
+            scale=8,
+            se_bottleneck_dim=128,
+        )
+        self.layer3 = SE_Res2Block(
+            self.channels[1],
+            self.channels[2],
+            kernel_size=3,
+            stride=1,
+            padding=3,
+            dilation=3,
+            scale=8,
+            se_bottleneck_dim=128,
+        )
+        self.layer4 = SE_Res2Block(
+            self.channels[2],
+            self.channels[3],
+            kernel_size=3,
+            stride=1,
+            padding=4,
+            dilation=4,
+            scale=8,
+            se_bottleneck_dim=128,
+        )

         # self.conv = nn.Conv1d(self.channels[-1], self.channels[-1], kernel_size=1)
         cat_channels = channels * 3
         self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
-        self.pooling = AttentiveStatsPool(
+        self.pooling = AttentiveStatsPool(
+            self.channels[-1],
+            attention_channels=128,
+            global_context_att=global_context_att,
+        )
         self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
         self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)
@@ -206,21 +305,26 @@ class ECAPA_TDNN(nn.Module):
         else:
             self.ctc_model = self.ctc_model.train()
         self.ctc_no_grad = ctc_no_grad
-        print(
+        print("ctc_no_grad: ", self.ctc_no_grad)

-    def forward(self, latent, input_lengths,
+    def forward(self, latent, input_lengths, return_asr=False):
         if self.ctc_no_grad:
             with torch.no_grad():
                 asr, h = self.ctc_model(latent, input_lengths)
         else:
             asr, h = self.ctc_model(latent, input_lengths)
+
         x = torch.stack(h, dim=0)
-        norm_weights =
+        norm_weights = (
+            F.softmax(self.feature_weight, dim=-1)
+            .unsqueeze(-1)
+            .unsqueeze(-1)
+            .unsqueeze(-1)
+        )
         x = (norm_weights * x).sum(dim=0)
         x = x + 1e-6
         # x = torch.transpose(x, 1, 2) + 1e-6
+
         x = self.instance_norm(x)
         # x = torch.transpose(x, 1, 2)
@@ -238,9 +342,10 @@ class ECAPA_TDNN(nn.Module):
             return out, asr
         return out

+
 if __name__ == "__main__":
-    from diffspeech.ldm.model import DiT
     from diffspeech.data.collate import get_mask_from_lengths
+    from diffspeech.ldm.model import DiT
     from diffspeech.tools.text.vocab import IPA

     bsz = 3
@@ -265,4 +370,4 @@ if __name__ == "__main__":

     emb = model(latent, latent_mask.sum(axis=-1))

-    print(emb.shape)
+    print(emb.shape)
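The forward pass above fuses the per-layer Conformer features with a learnable softmax weighting before the TDNN layers. A short sketch of that layer-weighting step in isolation (assumes `h` is a list of per-layer features shaped (B, D, T); names here are illustrative):

# Sketch of the learnable layer-weighted sum used in ECAPA_TDNN.forward above.
import torch
import torch.nn.functional as F

def weighted_layer_sum(h, feature_weight):
    x = torch.stack(h, dim=0)              # (L, B, D, T)
    w = F.softmax(feature_weight, dim=-1)  # (L,) learnable per-layer weights
    w = w.view(-1, 1, 1, 1)                # broadcast over (B, D, T)
    return (w * x).sum(dim=0)              # (B, D, T)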
f5_tts/api.py
CHANGED
@@ -8,15 +8,10 @@ from cached_path import cached_path
 from hydra.utils import get_class
 from omegaconf import OmegaConf

-from f5_tts.infer.utils_infer import (
-    preprocess_ref_audio_text,
-    remove_silence_for_generated_wav,
-    save_spectrogram,
-    transcribe,
-)
+from f5_tts.infer.utils_infer import (infer_process, load_model, load_vocoder,
+                                      preprocess_ref_audio_text,
+                                      remove_silence_for_generated_wav,
+                                      save_spectrogram, transcribe)
 from f5_tts.model.utils import seed_everything

@@ -32,7 +27,9 @@ class F5TTS:
         device=None,
         hf_cache_dir=None,
     ):
-        model_cfg = OmegaConf.load(
+        model_cfg = OmegaConf.load(
+            str(files("f5_tts").joinpath(f"configs/{model}.yaml"))
+        )
         model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
         model_arc = model_cfg.model.arch

@@ -50,16 +47,20 @@ class F5TTS:
         self.device = (
             "cuda"
             if torch.cuda.is_available()
-            else
+            else (
+                "xpu"
+                if torch.xpu.is_available()
+                else "mps" if torch.backends.mps.is_available() else "cpu"
+            )
         )

         # Load models
         self.vocoder = load_vocoder(
-            self.mel_spec_type,
+            self.mel_spec_type,
+            vocoder_local_path is not None,
+            vocoder_local_path,
+            self.device,
+            hf_cache_dir,
         )

         repo_name, ckpt_step, ckpt_type = "F5-TTS", 1250000, "safetensors"
@@ -77,10 +78,20 @@ class F5TTS:

         if not ckpt_file:
             ckpt_file = str(
-                cached_path(
+                cached_path(
+                    f"hf://SWivid/{repo_name}/{model}/model_{ckpt_step}.{ckpt_type}",
+                    cache_dir=hf_cache_dir,
+                )
             )
         self.ema_model = load_model(
-            model_cls,
+            model_cls,
+            model_arc,
+            ckpt_file,
+            self.mel_spec_type,
+            vocab_file,
+            self.ode_method,
+            self.use_ema,
+            self.device,
         )

     def transcribe(self, ref_audio, language=None):
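The nested conditional expression above just walks a fallback chain of backends. A minimal sketch of the same logic written as a plain function (the getattr guard for `torch.xpu`, which only exists in recent PyTorch builds, is an added assumption):

# Sketch of the cuda -> xpu -> mps -> cpu device fallback used in F5TTS.__init__ above.
import torch

def pick_device() -> str:
    if torch.cuda.is_available():
        return "cuda"
    if getattr(torch, "xpu", None) is not None and torch.xpu.is_available():
        return "xpu"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"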
f5_tts/eval/ecapa_tdnn.py
CHANGED
@@ -9,7 +9,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

-
 """ Res2Conv1d + BatchNorm1d + ReLU
 """

@@ -19,7 +18,16 @@ class Res2Conv1dReluBn(nn.Module):
     in_channels == out_channels == channels
     """

-    def __init__(
+    def __init__(
+        self,
+        channels,
+        kernel_size=1,
+        stride=1,
+        padding=0,
+        dilation=1,
+        bias=True,
+        scale=4,
+    ):
         super().__init__()
         assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
         self.scale = scale
@@ -29,7 +37,17 @@ class Res2Conv1dReluBn(nn.Module):
         self.convs = []
         self.bns = []
         for i in range(self.nums):
-            self.convs.append(
+            self.convs.append(
+                nn.Conv1d(
+                    self.width,
+                    self.width,
+                    kernel_size,
+                    stride,
+                    padding,
+                    dilation,
+                    bias=bias,
+                )
+            )
             self.bns.append(nn.BatchNorm1d(self.width))
         self.convs = nn.ModuleList(self.convs)
         self.bns = nn.ModuleList(self.bns)
@@ -58,9 +76,20 @@ class Res2Conv1dReluBn(nn.Module):

 class Conv1dReluBn(nn.Module):
-    def __init__(
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=1,
+        stride=1,
+        padding=0,
+        dilation=1,
+        bias=True,
+    ):
         super().__init__()
-        self.conv = nn.Conv1d(
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias
+        )
         self.bn = nn.BatchNorm1d(out_channels)

     def forward(self, x):
@@ -99,11 +128,27 @@ class SE_Connect(nn.Module):

 class SE_Res2Block(nn.Module):
-    def __init__(
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        scale,
+        se_bottleneck_dim,
+    ):
         super().__init__()
-        self.Conv1dReluBn1 = Conv1dReluBn(
+        self.Conv1dReluBn1 = Conv1dReluBn(
+            in_channels, out_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.Res2Conv1dReluBn = Res2Conv1dReluBn(
+            out_channels, kernel_size, stride, padding, dilation, scale=scale
+        )
+        self.Conv1dReluBn2 = Conv1dReluBn(
+            out_channels, out_channels, kernel_size=1, stride=1, padding=0
+        )
         self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)

         self.shortcut = None
@@ -138,15 +183,23 @@ class AttentiveStatsPool(nn.Module):

         # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
         if global_context_att:
-            self.linear1 = nn.Conv1d(
+            self.linear1 = nn.Conv1d(
+                in_dim * 3, attention_channels, kernel_size=1
+            )  # equals W and b in the paper
         else:
-            self.linear1 = nn.Conv1d(
+            self.linear1 = nn.Conv1d(
+                in_dim, attention_channels, kernel_size=1
+            )  # equals W and b in the paper
+        self.linear2 = nn.Conv1d(
+            attention_channels, in_dim, kernel_size=1
+        )  # equals V and k in the paper

     def forward(self, x):
         if self.global_context_att:
             context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
-            context_std = torch.sqrt(
+            context_std = torch.sqrt(
+                torch.var(x, dim=-1, keepdim=True) + 1e-10
+            ).expand_as(x)
             x_in = torch.cat((x, context_mean, context_std), dim=1)
         else:
             x_in = x
@@ -184,24 +237,36 @@ class ECAPA_TDNN(nn.Module):
         torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
         try:
             local_s3prl_path = os.path.expanduser("~/.cache/torch/hub/s3prl_s3prl_main")
-            self.feature_extract = torch.hub.load(
+            self.feature_extract = torch.hub.load(
+                local_s3prl_path, feat_type, source="local", config_path=config_path
+            )
         except:  # noqa: E722
             self.feature_extract = torch.hub.load("s3prl/s3prl", feat_type)

         if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
             self.feature_extract.model.encoder.layers[23].self_attn, "fp32_attention"
         ):
-            self.feature_extract.model.encoder.layers[23].self_attn.fp32_attention =
+            self.feature_extract.model.encoder.layers[23].self_attn.fp32_attention = (
+                False
+            )
         if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
             self.feature_extract.model.encoder.layers[11].self_attn, "fp32_attention"
         ):
-            self.feature_extract.model.encoder.layers[11].self_attn.fp32_attention =
+            self.feature_extract.model.encoder.layers[11].self_attn.fp32_attention = (
+                False
+            )

         self.feat_num = self.get_feat_num()
         self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))

         if feat_type != "fbank" and feat_type != "mfcc":
-            freeze_list = [
+            freeze_list = [
+                "final_proj",
+                "label_embs_concat",
+                "mask_emb",
+                "project_q",
+                "quantizer",
+            ]
             for name, param in self.feature_extract.named_parameters():
                 for freeze_val in freeze_list:
                     if freeze_val in name:
@@ -252,7 +317,9 @@ class ECAPA_TDNN(nn.Module):
         cat_channels = channels * 3
         self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
         self.pooling = AttentiveStatsPool(
-            self.channels[-1],
+            self.channels[-1],
+            attention_channels=128,
+            global_context_att=global_context_att,
         )
         self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
         self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)
@@ -287,7 +354,12 @@ class ECAPA_TDNN(nn.Module):
             x = torch.stack(x, dim=0)
         else:
             x = x.unsqueeze(0)
-        norm_weights =
+        norm_weights = (
+            F.softmax(self.feature_weight, dim=-1)
+            .unsqueeze(-1)
+            .unsqueeze(-1)
+            .unsqueeze(-1)
+        )
         x = (norm_weights * x).sum(dim=0)
         x = torch.transpose(x, 1, 2) + 1e-6
f5_tts/eval/eval_infer_batch.py
CHANGED
@@ -1,7 +1,6 @@
 import os
 import sys

-
 sys.path.append(os.getcwd())

 import argparse
@@ -15,16 +14,13 @@ from hydra.utils import get_class
 from omegaconf import OmegaConf
 from tqdm import tqdm

-from f5_tts.eval.utils_eval import (
-    get_seedtts_testset_metainfo,
-)
+from f5_tts.eval.utils_eval import (get_inference_prompt,
+                                    get_librispeech_test_clean_metainfo,
+                                    get_seedtts_testset_metainfo)
 from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder
 from f5_tts.model import CFM
 from f5_tts.model.utils import get_tokenizer

-
 accelerator = Accelerator()
 device = f"cuda:{accelerator.process_index}"

@@ -67,7 +63,9 @@ def main():
     use_truth_duration = False
     no_ref_audio = False

-    model_cfg = OmegaConf.load(
+    model_cfg = OmegaConf.load(
+        str(files("f5_tts").joinpath(f"configs/{exp_name}.yaml"))
+    )
     model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
     model_arc = model_cfg.model.arch

@@ -83,8 +81,12 @@ def main():

     if testset == "ls_pc_test_clean":
         metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
-        librispeech_test_clean_path =
+        librispeech_test_clean_path = (
+            "<SOME_PATH>/LibriSpeech/test-clean"  # test-clean path
+        )
+        metainfo = get_librispeech_test_clean_metainfo(
+            metalst, librispeech_test_clean_path
+        )

     elif testset == "seedtts_test_zh":
         metalst = rel_path + "/data/seedtts_testset/zh/meta.lst"
@@ -126,14 +128,18 @@ def main():
         vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
     elif mel_spec_type == "bigvgan":
         vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
-    vocoder = load_vocoder(
+    vocoder = load_vocoder(
+        vocoder_name=mel_spec_type, is_local=local, local_path=vocoder_local_path
+    )

     # Tokenizer
     vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)

     # Model
     model = CFM(
-        transformer=model_cls(
+        transformer=model_cls(
+            **model_arc, text_num_embeds=vocab_size, mel_dim=n_mel_channels
+        ),
         mel_spec_kwargs=dict(
             n_fft=n_fft,
             hop_length=hop_length,
@@ -154,7 +160,9 @@ def main():
     elif os.path.exists(ckpt_prefix + ".safetensors"):
         ckpt_path = ckpt_prefix + ".safetensors"
     else:
-        print(
+        print(
+            "Loading from self-organized training checkpoints rather than released pretrained."
+        )
         ckpt_path = rel_path + f"/{model_cfg.ckpts.save_dir}/model_{ckpt_step}.pt"

     dtype = torch.float32 if mel_spec_type == "bigvgan" else None
@@ -169,7 +177,14 @@ def main():

     with accelerator.split_between_processes(prompts_all) as prompts:
         for prompt in tqdm(prompts, disable=not accelerator.is_local_main_process):
-
+            (
+                utts,
+                ref_rms_list,
+                ref_mels,
+                ref_mel_lens,
+                total_mel_lens,
+                final_text_list,
+            ) = prompt
             ref_mels = ref_mels.to(device)
             ref_mel_lens = torch.tensor(ref_mel_lens, dtype=torch.long).to(device)
             total_mel_lens = torch.tensor(total_mel_lens, dtype=torch.long).to(device)
@@ -198,7 +213,11 @@ def main():

             if ref_rms_list[i] < target_rms:
                 generated_wave = generated_wave * ref_rms_list[i] / target_rms
-            torchaudio.save(
+            torchaudio.save(
+                f"{output_dir}/{utts[i]}.wav",
+                generated_wave,
+                target_sample_rate,
+            )

     accelerator.wait_for_everyone()
     if accelerator.is_main_process:
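The block above rescales each generated waveform back to the loudness of its reference prompt before saving. A small sketch of that step on its own (the 0.1 default for `target_rms` is an assumption, not taken from this diff):

# Sketch of the RMS matching applied before torchaudio.save above.
import torch

def match_reference_rms(generated_wave: torch.Tensor, ref_rms: float,
                        target_rms: float = 0.1) -> torch.Tensor:
    # If the reference prompt was quieter than the normalization target,
    # scale the generated audio back down to the reference level.
    if ref_rms < target_rms:
        generated_wave = generated_wave * ref_rms / target_rms
    return generated_wave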
f5_tts/eval/eval_librispeech_test_clean.py
CHANGED
@@ -5,7 +5,6 @@ import json
 import os
 import sys

-
 sys.path.append(os.getcwd())

 import multiprocessing as mp
@@ -15,18 +14,23 @@ import numpy as np

 from f5_tts.eval.utils_eval import get_librispeech_test, run_asr_wer, run_sim

-
 rel_path = str(files("f5_tts").joinpath("../../"))


 def get_args():
     parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"]
+    )
     parser.add_argument("-l", "--lang", type=str, default="en")
     parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
     parser.add_argument("-p", "--librispeech_test_clean_path", type=str, required=True)
+    parser.add_argument(
+        "-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use"
+    )
+    parser.add_argument(
+        "--local", action="store_true", help="Use local custom checkpoint directory"
+    )
     return parser.parse_args()


@@ -39,7 +43,9 @@ def main():
     metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"

     gpus = list(range(args.gpu_nums))
+    test_set = get_librispeech_test(
+        metalst, gen_wav_dir, gpus, librispeech_test_clean_path
+    )

     ## In LibriSpeech, some speakers utilized varying voice characteristics for different characters in the book,
     ## leading to a low similarity for the ground truth in some cases.
@@ -59,13 +65,19 @@ def main():

     if eval_task == "wer":
         with mp.Pool(processes=len(gpus)) as pool:
+            args = [
+                (rank, lang, sub_test_set, asr_ckpt_dir)
+                for (rank, sub_test_set) in test_set
+            ]
             results = pool.map(run_asr_wer, args)
             for r in results:
                 full_results.extend(r)
     elif eval_task == "sim":
         with mp.Pool(processes=len(gpus)) as pool:
+            args = [
+                (rank, sub_test_set, wavlm_ckpt_dir)
+                for (rank, sub_test_set) in test_set
+            ]
             results = pool.map(run_sim, args)
             for r in results:
                 full_results.extend(r)
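
Note: the WER/SIM evaluation above shards the test set by GPU rank and hands one (rank, shard, checkpoint) tuple per worker to mp.Pool.map. A minimal standalone sketch of that dispatch pattern, assuming a toy score_shard worker that is not part of this repo:

import multiprocessing as mp

def score_shard(args):
    # one worker per GPU rank; `shard` is a list of (gen_wav, ref_wav, text) items
    rank, shard = args
    # a real worker would pin itself to f"cuda:{rank}" and run ASR or speaker-sim there
    return [len(text) for _, _, text in shard]  # placeholder per-item scores

if __name__ == "__main__":
    gpus = [0, 1]
    test_set = [(rank, [("gen.wav", "ref.wav", "hello world")]) for rank in gpus]
    with mp.Pool(processes=len(gpus)) as pool:
        results = pool.map(score_shard, test_set)
    full_results = [r for shard_scores in results for r in shard_scores]
    print(full_results)

The scripts above follow the same shape: build the argument tuples, map them over the pool, then flatten each worker's per-utterance results into full_results.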
f5_tts/eval/eval_seedtts_testset.py
CHANGED
@@ -5,7 +5,6 @@ import json
 import os
 import sys

-
 sys.path.append(os.getcwd())

 import multiprocessing as mp
@@ -15,17 +14,22 @@ import numpy as np

 from f5_tts.eval.utils_eval import get_seed_tts_test, run_asr_wer, run_sim

-
 rel_path = str(files("f5_tts").joinpath("../../"))


 def get_args():
     parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"]
+    )
     parser.add_argument("-l", "--lang", type=str, default="en", choices=["zh", "en"])
     parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
+    parser.add_argument(
+        "-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use"
+    )
+    parser.add_argument(
+        "--local", action="store_true", help="Use local custom checkpoint directory"
+    )
     return parser.parse_args()


@@ -58,13 +62,19 @@ def main():

     if eval_task == "wer":
         with mp.Pool(processes=len(gpus)) as pool:
+            args = [
+                (rank, lang, sub_test_set, asr_ckpt_dir)
+                for (rank, sub_test_set) in test_set
+            ]
             results = pool.map(run_asr_wer, args)
             for r in results:
                 full_results.extend(r)
     elif eval_task == "sim":
         with mp.Pool(processes=len(gpus)) as pool:
+            args = [
+                (rank, sub_test_set, wavlm_ckpt_dir)
+                for (rank, sub_test_set) in test_set
+            ]
             results = pool.map(run_sim, args)
             for r in results:
                 full_results.extend(r)
f5_tts/eval/eval_utmos.py
CHANGED
@@ -13,9 +13,15 @@ def main():
     parser.add_argument("--ext", type=str, default="wav", help="Audio extension.")
     args = parser.parse_args()

+    device = (
+        "cuda"
+        if torch.cuda.is_available()
+        else "xpu" if torch.xpu.is_available() else "cpu"
+    )
+
+    predictor = torch.hub.load(
+        "tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True
+    )
     predictor = predictor.to(device)

     audio_paths = list(Path(args.audio_dir).rglob(f"*.{args.ext}"))
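
Note: the predictor loaded above is the tarepan/SpeechMOS torch.hub model. A minimal usage sketch, assuming a placeholder audio path and network access for the first hub download; the call signature predictor(waveform, sample_rate) mirrors how this script scores each file:

import torch
import librosa

device = "cuda" if torch.cuda.is_available() else "cpu"
predictor = torch.hub.load(
    "tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True
).to(device)

wave, sr = librosa.load("sample.wav", sr=None, mono=True)  # placeholder path
with torch.no_grad():
    # batch dimension first: shape (1, num_samples)
    score = predictor(torch.from_numpy(wave).unsqueeze(0).to(device), sr)
print(f"UTMOS: {score.item():.3f}")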
f5_tts/eval/utils_eval.py
CHANGED
@@ -43,11 +43,15 @@ def get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path):

         # ref_txt = ref_txt[0] + ref_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
         ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
+        ref_wav = os.path.join(
+            librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac"
+        )

         # gen_txt = gen_txt[0] + gen_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
         gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
+        gen_wav = os.path.join(
+            librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac"
+        )

         metainfo.append((gen_utt, ref_txt, ref_wav, " " + gen_txt, gen_wav))

@@ -106,13 +110,17 @@ def get_inference_prompt(
         mel_spec_type=mel_spec_type,
     )

+    for utt, prompt_text, prompt_wav, gt_text, gt_wav in tqdm(
+        metainfo, desc="Processing prompts..."
+    ):
         # Audio
         ref_audio, ref_sr = torchaudio.load(prompt_wav)
         ref_rms = torch.sqrt(torch.mean(torch.square(ref_audio)))
         if ref_rms < target_rms:
             ref_audio = ref_audio * target_rms / ref_rms
+        assert (
+            ref_audio.shape[-1] > 5000
+        ), f"Empty prompt wav: {prompt_wav}, or torchaudio backend issue."
         if ref_sr != target_sample_rate:
             resampler = torchaudio.transforms.Resample(ref_sr, target_sample_rate)
             ref_audio = resampler(ref_audio)
@@ -145,14 +153,18 @@ def get_inference_prompt(
         else:
             ref_text_len = len(prompt_text.encode("utf-8"))
             gen_text_len = len(gt_text.encode("utf-8"))
+            total_mel_len = ref_mel_len + int(
+                ref_mel_len / ref_text_len * gen_text_len / speed
+            )

         # deal with batch
         assert infer_batch_size > 0, "infer_batch_size should be greater than 0."
+        assert (
+            min_tokens <= total_mel_len <= max_tokens
+        ), f"Audio {utt} has duration {total_mel_len * hop_length // target_sample_rate}s out of range [{min_secs}, {max_secs}]."
+        bucket_i = math.floor(
+            (total_mel_len - min_tokens) / (max_tokens - min_tokens + 1) * num_buckets
+        )

         utts[bucket_i].append(utt)
         ref_rms_list[bucket_i].append(ref_rms)
@@ -183,7 +195,14 @@ def get_inference_prompt(
                 ref_mel_lens[bucket_i],
                 total_mel_lens[bucket_i],
                 final_text_list[bucket_i],
+            ) = (
+                [],
+                [],
+                [],
+                [],
+                [],
+                [],
+            )

     # add residual
     for bucket_i, bucket_frames in enumerate(batch_accum):
@@ -244,7 +263,9 @@ def get_seed_tts_test(metalst, gen_wav_dir, gpus):
 # get librispeech test-clean cross sentence test


+def get_librispeech_test(
+    metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth=False
+):
     f = open(metalst)
     lines = f.readlines()
     f.close()
@@ -255,14 +276,21 @@ def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path

         if eval_ground_truth:
             gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
+            gen_wav = os.path.join(
+                librispeech_test_clean_path,
+                gen_spk_id,
+                gen_chaptr_id,
+                gen_utt + ".flac",
+            )
         else:
             if not os.path.exists(os.path.join(gen_wav_dir, gen_utt + ".wav")):
                 raise FileNotFoundError(f"Generated wav not found: {gen_utt}")
             gen_wav = os.path.join(gen_wav_dir, gen_utt + ".wav")

         ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
+        ref_wav = os.path.join(
+            librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac"
+        )

         test_set_.append((gen_wav, ref_wav, gen_txt))

@@ -382,7 +410,9 @@ def run_sim(args):
     device = f"cuda:{rank}"

     model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type="wavlm_large", config_path=None)
+    state_dict = torch.load(
+        ckpt_dir, weights_only=True, map_location=lambda storage, loc: storage
+    )
     model.load_state_dict(state_dict["model"], strict=False)

     use_gpu = True if torch.cuda.is_available() else False
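
Note on get_inference_prompt above: the target mel length is extrapolated from the prompt's frames-per-UTF-8-byte rate, then mapped to one of num_buckets equal-width bins between min_tokens and max_tokens. A worked sketch of just that arithmetic, with illustrative values rather than the repo's defaults:

import math

# illustrative: a ~3 s prompt, text lengths counted in UTF-8 bytes
ref_mel_len, ref_text_len, gen_text_len, speed = 282, 40, 80, 1.0
total_mel_len = ref_mel_len + int(ref_mel_len / ref_text_len * gen_text_len / speed)  # 282 + 564 = 846

min_secs, max_secs, num_buckets = 0, 30, 200        # assumed bucketing bounds
target_sample_rate, hop_length = 24000, 256          # assumed mel settings
min_tokens = min_secs * target_sample_rate // hop_length   # 0
max_tokens = max_secs * target_sample_rate // hop_length   # 2812
bucket_i = math.floor((total_mel_len - min_tokens) / (max_tokens - min_tokens + 1) * num_buckets)
print(total_mel_len, bucket_i)  # 846 60

Prompts in the same bucket have similar total lengths, so they can be batched with little padding before being split across processes.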
f5_tts/infer/infer_cli.py
CHANGED
@@ -14,23 +14,12 @@ from hydra.utils import get_class
 from omegaconf import OmegaConf
 from unidecode import unidecode

+from f5_tts.infer.utils_infer import (cfg_strength, cross_fade_duration,
+                                      device, fix_duration, infer_process,
+                                      load_model, load_vocoder, mel_spec_type,
+                                      nfe_step, preprocess_ref_audio_text,
+                                      remove_silence_for_generated_wav, speed,
+                                      sway_sampling_coef, target_rms)

 parser = argparse.ArgumentParser(
     prog="python3 infer-cli.py",
@@ -41,7 +30,9 @@ parser.add_argument(
     "-c",
     "--config",
     type=str,
+    default=os.path.join(
+        files("f5_tts").joinpath("infer/examples/basic"), "basic.toml"
+    ),
     help="The configuration file, default see infer/examples/basic/basic.toml",
 )

@@ -188,13 +179,17 @@ model = args.model or config.get("model", "F5TTS_v1_Base")
 ckpt_file = args.ckpt_file or config.get("ckpt_file", "")
 vocab_file = args.vocab_file or config.get("vocab_file", "")

+ref_audio = args.ref_audio or config.get(
+    "ref_audio", "infer/examples/basic/basic_ref_en.wav"
+)
 ref_text = (
     args.ref_text
     if args.ref_text is not None
     else config.get("ref_text", "Some call me nature, others call me mother nature.")
 )
+gen_text = args.gen_text or config.get(
+    "gen_text", "Here we generate something just for test."
+)
 gen_file = args.gen_file or config.get("gen_file", "")

 output_dir = args.output_dir or config.get("output_dir", "tests")
@@ -203,21 +198,29 @@ output_file = args.output_file or config.get(
 )

 save_chunk = args.save_chunk or config.get("save_chunk", False)
+use_legacy_text = args.no_legacy_text or config.get(
+    "no_legacy_text", False
+)  # no_legacy_text is a store_false arg
 if save_chunk and use_legacy_text:
     print(
         "\nWarning to --save_chunk: lossy ASCII transliterations of unicode text for legacy (.wav) file names, --no_legacy_text to disable.\n"
     )

 remove_silence = args.remove_silence or config.get("remove_silence", False)
+load_vocoder_from_local = args.load_vocoder_from_local or config.get(
+    "load_vocoder_from_local", False
+)

 vocoder_name = args.vocoder_name or config.get("vocoder_name", mel_spec_type)
 target_rms = args.target_rms or config.get("target_rms", target_rms)
+cross_fade_duration = args.cross_fade_duration or config.get(
+    "cross_fade_duration", cross_fade_duration
+)
 nfe_step = args.nfe_step or config.get("nfe_step", nfe_step)
 cfg_strength = args.cfg_strength or config.get("cfg_strength", cfg_strength)
+sway_sampling_coef = args.sway_sampling_coef or config.get(
+    "sway_sampling_coef", sway_sampling_coef
+)
 speed = args.speed or config.get("speed", speed)
 fix_duration = args.fix_duration or config.get("fix_duration", fix_duration)
 device = args.device or config.get("device", device)
@@ -232,7 +235,9 @@ if "voices" in config:
     for voice in config["voices"]:
         voice_ref_audio = config["voices"][voice]["ref_audio"]
         if "infer/examples/" in voice_ref_audio:
+            config["voices"][voice]["ref_audio"] = str(
+                files("f5_tts").joinpath(f"{voice_ref_audio}")
+            )


 # ignore gen_text if gen_file provided
@@ -259,14 +264,18 @@ elif vocoder_name == "bigvgan":
     vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"

 vocoder = load_vocoder(
+    vocoder_name=vocoder_name,
+    is_local=load_vocoder_from_local,
+    local_path=vocoder_local_path,
+    device=device,
 )


 # load TTS model

 model_cfg = OmegaConf.load(
+    args.model_cfg
+    or config.get("model_cfg", str(files("f5_tts").joinpath(f"configs/{model}.yaml")))
 )
 model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
 model_arc = model_cfg.model.arch
@@ -288,11 +297,18 @@ elif model == "E2TTS_Base":
     ckpt_step = 1200000

 if not ckpt_file:
+    ckpt_file = str(
+        cached_path(f"hf://SWivid/{repo_name}/{model}/model_{ckpt_step}.{ckpt_type}")
+    )

 print(f"Using {model}...")
 ema_model = load_model(
+    model_cls,
+    model_arc,
+    ckpt_file,
+    mel_spec_type=vocoder_name,
+    vocab_file=vocab_file,
+    device=device,
 )

@@ -309,8 +325,10 @@ def main():
     for voice in voices:
         print("Voice:", voice)
         print("ref_audio ", voices[voice]["ref_audio"])
+        voices[voice]["ref_audio"], voices[voice]["ref_text"] = (
+            preprocess_ref_audio_text(
+                voices[voice]["ref_audio"], voices[voice]["ref_text"]
+            )
         )
         print("ref_audio_", voices[voice]["ref_audio"], "\n\n")

@@ -360,7 +378,10 @@ def main():
                 if use_legacy_text:
                     gen_text_ = unidecode(gen_text_)
                 sf.write(
+                    os.path.join(
+                        output_chunk_dir,
+                        f"{len(generated_audio_segments) - 1}_{gen_text_}.wav",
+                    ),
                     audio_segment,
                     final_sample_rate,
                 )
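
Note on the settings block above: every option is resolved as `args.x or config.get("x", default)`, so a command-line flag wins over the TOML config, and the config wins over the library default. A minimal standalone sketch of that precedence, with a dict standing in for the parsed TOML and illustrative values:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--nfe_step", type=int, default=None)  # None means "not given on the CLI"
args = parser.parse_args([])                               # simulate: no CLI override

config = {"nfe_step": 16}   # stands in for the parsed config file
nfe_step_default = 32       # stands in for the library default (illustrative)

# CLI flag wins over config, config wins over default
nfe_step = args.nfe_step or config.get("nfe_step", nfe_step_default)
print(nfe_step)  # 16

One caveat of the `or` idiom, visible in the script as well: a falsy CLI value such as 0 or 0.0 is treated as "not given" and falls back to the config value.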
f5_tts/infer/infer_gradio.py
CHANGED
@@ -19,7 +19,6 @@ import torchaudio
 from cached_path import cached_path
 from transformers import AutoModelForCausalLM, AutoTokenizer

-
 try:
     import spaces

@@ -35,25 +34,21 @@ def gpu_decorator(func):
     return func


+from f5_tts.infer.utils_infer import (infer_process, load_model, load_vocoder,
+                                      preprocess_ref_audio_text,
+                                      remove_silence_for_generated_wav,
+                                      save_spectrogram, tempfile_kwargs)
 from f5_tts.model import DiT, UNetT

 DEFAULT_TTS_MODEL = "F5-TTS_v1"
 tts_model_choice = DEFAULT_TTS_MODEL

 DEFAULT_TTS_MODEL_CFG = [
     "hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors",
     "hf://SWivid/F5-TTS/F5TTS_v1_Base/vocab.txt",
+    json.dumps(
+        dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
+    ),
 ]

@@ -69,8 +64,12 @@ def load_f5tts():


 def load_e2tts():
+    ckpt_path = str(
+        cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors")
+    )
+    E2TTS_model_cfg = dict(
+        dim=1024, depth=24, heads=16, ff_mult=4, text_mask_padding=False, pe_attn_head=1
+    )
     return load_model(UNetT, E2TTS_model_cfg, ckpt_path)

@@ -113,7 +112,8 @@ def chat_model_inference(messages, model, tokenizer):
     )

     generated_ids = [
+        output_ids[len(input_ids) :]
+        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
     ]
     return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

@@ -157,7 +157,9 @@ def infer(
         gr.Warning("Please enter text to generate or upload a text file.")
         return gr.update(), gr.update(), ref_text

+    ref_audio, ref_text = preprocess_ref_audio_text(
+        ref_audio_orig, ref_text, show_info=show_info
+    )

     if model == DEFAULT_TTS_MODEL:
         ema_model = F5TTS_ema_model
@@ -172,7 +174,9 @@ def infer(
         global custom_ema_model, pre_custom_path
         if pre_custom_path != model[1]:
             show_info("Loading Custom TTS model...")
+            custom_ema_model = load_custom(
+                model[1], vocab_path=model[2], model_cfg=model[3]
+            )
             pre_custom_path = model[1]
         ema_model = custom_ema_model

@@ -202,7 +206,9 @@ def infer(
     final_wave = final_wave.squeeze().cpu().numpy()

     # Save the spectrogram
+    with tempfile.NamedTemporaryFile(
+        suffix=".png", **tempfile_kwargs
+    ) as tmp_spectrogram:
         spectrogram_path = tmp_spectrogram.name
     save_spectrogram(combined_spectrogram, spectrogram_path)

@@ -219,7 +225,9 @@ with gr.Blocks() as app_tts:
             max_lines=40,
             scale=4,
         )
+        gen_text_file = gr.File(
+            label="Load Text to Generate from File (.txt)", file_types=[".txt"], scale=1
+        )
     generate_btn = gr.Button("Synthesize", variant="primary")
     with gr.Accordion("Advanced Settings", open=False):
         with gr.Row():
@@ -229,7 +237,11 @@ with gr.Blocks() as app_tts:
                 lines=2,
                 scale=4,
             )
+            ref_text_file = gr.File(
+                label="Load Reference Text from File (.txt)",
+                file_types=[".txt"],
+                scale=1,
+            )
         with gr.Row():
             randomize_seed = gr.Checkbox(
                 label="Randomize Seed",
@@ -417,13 +429,25 @@ with gr.Blocks() as app_multistyle:
             regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=4)
             with gr.Row():
                 regular_seed_slider = gr.Slider(
+                    show_label=False,
+                    minimum=-1,
+                    maximum=999,
+                    value=-1,
+                    step=1,
+                    info="Seed, -1 for random",
                 )
                 regular_speed_slider = gr.Slider(
+                    show_label=False,
+                    minimum=0.3,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.1,
+                    info="Adjust the speed",
                 )
         with gr.Column(scale=1, min_width=160):
+            regular_ref_text_file = gr.File(
+                label="Load Reference Text from File (.txt)", file_types=[".txt"]
+            )

     # Regular speech type (max 100)
     max_speech_types = 100
@@ -450,13 +474,25 @@ with gr.Blocks() as app_multistyle:
                 ref_text_input = gr.Textbox(label="Reference Text", lines=4)
                 with gr.Row():
                     seed_input = gr.Slider(
+                        show_label=False,
+                        minimum=-1,
+                        maximum=999,
+                        value=-1,
+                        step=1,
+                        info="Seed. -1 for random",
                     )
                     speed_input = gr.Slider(
+                        show_label=False,
+                        minimum=0.3,
+                        maximum=2.0,
+                        value=1.0,
+                        step=0.1,
+                        info="Adjust the speed",
                     )
             with gr.Column(scale=1, min_width=160):
+                ref_text_file_input = gr.File(
+                    label="Load Reference Text from File (.txt)", file_types=[".txt"]
+                )
         speech_type_rows.append(row)
         speech_type_names.append(name_input)
         speech_type_audios.append(audio_input)
@@ -494,7 +530,9 @@ with gr.Blocks() as app_multistyle:
                 row_updates[speech_type_count] = gr.update(visible=True)
                 speech_type_count += 1
             else:
+                gr.Warning(
+                    "Exhausted maximum number of speech types. Consider restart the app."
+                )
         return row_updates

     add_speech_type_btn.click(add_speech_type_fn, outputs=speech_type_rows)
@@ -525,10 +563,14 @@ with gr.Blocks() as app_multistyle:
            scale=4,
            placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
        )
+        gen_text_file_multistyle = gr.File(
+            label="Load Text to Generate from File (.txt)", file_types=[".txt"], scale=1
+        )

     def make_insert_speech_type_fn(index):
+        def insert_speech_type_fn(
+            current_text, speech_type_name, speech_type_seed, speech_type_speed
+        ):
             current_text = current_text or ""
             if not speech_type_name:
                 gr.Warning("Please enter speech type name before insert.")
@@ -547,7 +589,12 @@ with gr.Blocks() as app_multistyle:
         insert_fn = make_insert_speech_type_fn(i)
         insert_btn.click(
             insert_fn,
+            inputs=[
+                gen_text_input_multistyle,
+                speech_type_names[i],
+                speech_type_seeds[i],
+                speech_type_speeds[i],
+            ],
             outputs=gen_text_input_multistyle,
         )

@@ -567,7 +614,9 @@ with gr.Blocks() as app_multistyle:
     )

     # Generate button
+    generate_multistyle_btn = gr.Button(
+        "Generate Multi-Style Speech", variant="primary"
+    )

     # Output audio
     audio_output_multistyle = gr.Audio(label="Synthesized Audio")
@@ -613,7 +662,10 @@ with gr.Blocks() as app_multistyle:
             speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list
         ):
             if name_input and audio_input:
+                speech_types[name_input] = {
+                    "audio": audio_input,
+                    "ref_text": ref_text_input,
+                }
             else:
                 speech_types[f"@{ref_text_idx}@"] = {"audio": "", "ref_text": ""}
             ref_text_idx += 1
@@ -635,14 +687,22 @@ with gr.Blocks() as app_multistyle:
             if name in speech_types:
                 current_type_name = name
             else:
+                gr.Warning(
+                    f"Type {name} is not available, will use Regular as default."
+                )
                 current_type_name = "Regular"

             try:
                 ref_audio = speech_types[current_type_name]["audio"]
             except KeyError:
+                gr.Warning(
+                    f"Please provide reference audio for type {current_type_name}."
+                )
+                return (
+                    [None]
+                    + [speech_types[name]["ref_text"] for name in speech_types]
+                    + [None]
+                )
             ref_text = speech_types[current_type_name].get("ref_text", "")

             if seed_input == -1:
@@ -664,7 +724,9 @@ with gr.Blocks() as app_multistyle:

             generated_audio_segments.append(audio_data)
             speech_types[current_type_name]["ref_text"] = ref_text_out
+            inference_meta_data += (
+                json.dumps(dict(name=name, seed=used_seed, speed=speed)) + f" {text}\n"
+            )

         # Concatenate all audio segments
         if generated_audio_segments:
@@ -676,7 +738,11 @@ with gr.Blocks() as app_multistyle:
             )
         else:
             gr.Warning("No audio generated.")
+            return (
+                [None]
+                + [speech_types[name]["ref_text"] for name in speech_types]
+                + [None]
+            )

     generate_multistyle_btn.click(
         generate_multistyle_speech,
@@ -689,7 +755,9 @@ with gr.Blocks() as app_multistyle:
         + [
             remove_silence_multistyle,
         ],
+        outputs=[audio_output_multistyle]
+        + speech_type_ref_texts
+        + [cherrypick_interface_multistyle],
     )

     # Validation function to disable Generate button if speech types are missing
@@ -753,7 +821,9 @@ Have a conversation with an AI using your reference voice!
             torch.cuda.empty_cache()

         show_info(f"Loading chat model: {chat_model_name}")
+        chat_model_state = AutoModelForCausalLM.from_pretrained(
+            chat_model_name, torch_dtype="auto", device_map="auto"
+        )
         chat_tokenizer_state = AutoTokenizer.from_pretrained(chat_model_name)
         show_info(f"Chat model {chat_model_name} loaded successfully!")

@@ -769,7 +839,9 @@ Have a conversation with an AI using your reference voice!
             info="Enter the name of a HuggingFace chat model",
             allow_custom_value=not USING_SPACES,
         )
+        load_chat_model_btn = gr.Button(
+            "Load Chat Model", variant="primary", visible=not USING_SPACES
+        )
     chat_interface_container = gr.Column(visible=USING_SPACES)

     chat_model_name_input.change(
@@ -779,7 +851,9 @@ Have a conversation with an AI using your reference voice!
         show_progress="hidden",
     )
     load_chat_model_btn.click(
+        load_chat_model,
+        inputs=[chat_model_name_input],
+        outputs=[load_chat_model_btn, chat_interface_container],
     )

     with chat_interface_container:
@@ -796,7 +870,9 @@ Have a conversation with an AI using your reference voice!
                     scale=3,
                 )
                 ref_text_file_chat = gr.File(
+                    label="Load Reference Text from File (.txt)",
+                    file_types=[".txt"],
+                    scale=1,
                 )
             with gr.Row():
                 randomize_seed_chat = gr.Checkbox(
@@ -805,7 +881,9 @@ Have a conversation with an AI using your reference voice!
                     info="Uncheck to use the seed specified.",
                     scale=3,
                 )
+                seed_input_chat = gr.Number(
+                    show_label=False, value=0, precision=0, scale=1
+                )
                 remove_silence_chat = gr.Checkbox(
                     label="Remove Silences",
                     value=True,
@@ -855,13 +933,17 @@ Have a conversation with an AI using your reference voice!
             """Generate text response from AI"""

             system_prompt_state = [{"role": "system", "content": system_prompt}]
+            response = chat_model_inference(
+                system_prompt_state + conv_state, chat_model_state, chat_tokenizer_state
+            )

             conv_state.append({"role": "assistant", "content": response})
             return conv_state

         @gpu_decorator
+        def generate_audio_response(
+            conv_state, ref_audio, ref_text, remove_silence, randomize_seed, seed_input
+        ):
             """Generate TTS audio for AI response"""
             if not conv_state or not ref_audio:
                 return None, ref_text, seed_input
@@ -896,7 +978,11 @@ Have a conversation with an AI using your reference voice!
             outputs=[ref_text_chat],
         )

+        for user_operation in [
+            audio_input_chat.stop_recording,
+            text_input_chat.submit,
+            send_btn_chat.click,
+        ]:
             user_operation(
                 process_audio_input,
                 inputs=[chatbot_interface, audio_input_chat, text_input_chat],
@@ -923,7 +1009,11 @@ Have a conversation with an AI using your reference voice!
         )

         # Handle clear button or system prompt change and reset conversation
+        for user_operation in [
+            clear_btn_chat.click,
+            system_prompt_chat.change,
+            chatbot_interface.clear,
+        ]:
             user_operation(
                 clear_conversation,
                 outputs=[chatbot_interface, audio_output_chat],
@@ -931,13 +1021,15 @@ Have a conversation with an AI using your reference voice!


 with gr.Blocks() as app_credits:
+    gr.Markdown(
+        """
 # Credits

 * [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
 * [RootingInLoad](https://github.com/RootingInLoad) for initial chunk generation and podcast app exploration
 * [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation & voice chat
+    """
+    )


 with gr.Blocks() as app:
@@ -958,7 +1050,9 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
     """
     )

+    last_used_custom = files("f5_tts").joinpath(
+        "infer/.cache/last_used_custom_model_info_v1.txt"
+    )

     def load_last_used_custom():
         try:
@@ -974,8 +1068,15 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
     def switch_tts_model(new_choice):
         global tts_model_choice
         if new_choice == "Custom":  # override in case webpage is refreshed
+            custom_ckpt_path, custom_vocab_path, custom_model_cfg = (
+                load_last_used_custom()
+            )
+            tts_model_choice = (
+                "Custom",
+                custom_ckpt_path,
+                custom_vocab_path,
+                custom_model_cfg,
+            )
             return (
                 gr.update(visible=True, value=custom_ckpt_path),
                 gr.update(visible=True, value=custom_vocab_path),
@@ -983,22 +1084,42 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
             )
         else:
             tts_model_choice = new_choice
+            return (
+                gr.update(visible=False),
+                gr.update(visible=False),
+                gr.update(visible=False),
+            )

     def set_custom_model(custom_ckpt_path, custom_vocab_path, custom_model_cfg):
         global tts_model_choice
+        tts_model_choice = (
+            "Custom",
+            custom_ckpt_path,
+            custom_vocab_path,
+            custom_model_cfg,
+        )
         with open(last_used_custom, "w", encoding="utf-8") as f:
+            f.write(
+                custom_ckpt_path
+                + "\n"
+                + custom_vocab_path
+                + "\n"
+                + custom_model_cfg
+                + "\n"
+            )

     with gr.Row():
         if not USING_SPACES:
             choose_tts_model = gr.Radio(
+                choices=[DEFAULT_TTS_MODEL, "E2-TTS", "Custom"],
+                label="Choose TTS Model",
+                value=DEFAULT_TTS_MODEL,
             )
         else:
             choose_tts_model = gr.Radio(
+                choices=[DEFAULT_TTS_MODEL, "E2-TTS"],
+                label="Choose TTS Model",
+                value=DEFAULT_TTS_MODEL,
             )
         custom_ckpt_path = gr.Dropdown(
             choices=[DEFAULT_TTS_MODEL_CFG[0]],
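
Note on the chat tab above: the same callback is bound to several Gradio triggers by iterating over the event registrars (stop_recording, submit, click). A minimal standalone sketch of that pattern, using a toy echo handler that is not part of this repo:

import gradio as gr

def echo(message):
    return message

with gr.Blocks() as demo:
    text_in = gr.Textbox(label="Input")
    send_btn = gr.Button("Send")
    text_out = gr.Textbox(label="Output")

    # bind one handler to several trigger events, as the app does for the chat flow
    for trigger in [text_in.submit, send_btn.click]:
        trigger(echo, inputs=[text_in], outputs=[text_out])

# demo.launch()  # uncomment to run locally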
f5_tts/infer/speech_edit.py
CHANGED
@@ -1,6 +1,5 @@
 import os

-
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # for MPS device compatibility

 from importlib.resources import files
@@ -12,19 +11,19 @@ from cached_path import cached_path
 from hydra.utils import get_class
 from omegaconf import OmegaConf

+from f5_tts.infer.utils_infer import (load_checkpoint, load_vocoder,
+                                      save_spectrogram)
 from f5_tts.model import CFM
 from f5_tts.model.utils import convert_char_to_pinyin, get_tokenizer

 device = (
     "cuda"
     if torch.cuda.is_available()
+    else (
+        "xpu"
+        if torch.xpu.is_available()
+        else "mps" if torch.backends.mps.is_available() else "cpu"
+    )
 )

@@ -59,7 +58,9 @@ n_fft = model_cfg.model.mel_spec.n_fft

 # ckpt_path = str(files("f5_tts").joinpath("../../")) + f"/ckpts/{exp_name}/model_{ckpt_step}.safetensors"
+ckpt_path = str(
+    cached_path(f"hf://SWivid/F5-TTS/{exp_name}/model_{ckpt_step}.safetensors")
+)
 output_dir = "tests"

@@ -103,14 +104,18 @@ if mel_spec_type == "vocos":
     vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
 elif mel_spec_type == "bigvgan":
     vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
+vocoder = load_vocoder(
+    vocoder_name=mel_spec_type, is_local=local, local_path=vocoder_local_path
+)

 # Tokenizer
 vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)

 # Model
 model = CFM(
+    transformer=model_cls(
+        **model_arc, text_num_embeds=vocab_size, mel_dim=n_mel_channels
+    ),
     mel_spec_kwargs=dict(
         n_fft=n_fft,
         hop_length=hop_length,
@@ -146,7 +151,14 @@ for part in parts_to_edit:
     part_dur = end - start if fix_duration is None else fix_duration.pop(0)
     part_dur = part_dur * target_sample_rate
     start = start * target_sample_rate
+    audio_ = torch.cat(
+        (
+            audio_,
+            audio[:, round(offset) : round(start)],
+            torch.zeros(1, round(part_dur)),
+        ),
+        dim=-1,
+    )
     edit_mask = torch.cat(
         (
             edit_mask,
@@ -157,7 +169,9 @@ for part in parts_to_edit:
     )
     offset = end * target_sample_rate
 audio = torch.cat((audio_, audio[:, round(offset) :]), dim=-1)
+edit_mask = F.pad(
+    edit_mask, (0, audio.shape[-1] // hop_length - edit_mask.shape[-1] + 1), value=True
+)
 audio = audio.to(device)
 edit_mask = edit_mask.to(device)

@@ -201,5 +215,7 @@ with torch.inference_mode():
     generated_wave = generated_wave * rms / target_rms

 save_spectrogram(gen_mel_spec[0].cpu().numpy(), f"{output_dir}/speech_edit_out.png")
+torchaudio.save(
+    f"{output_dir}/speech_edit_out.wav", generated_wave, target_sample_rate
+)
 print(f"Generated wav: {generated_wave.shape}")
f5_tts/infer/utils_infer.py
CHANGED
@@ -4,9 +4,10 @@ import os
 import sys
 from concurrent.futures import ThreadPoolExecutor

-
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # for MPS device compatibility
-sys.path.append(

 import hashlib
 import re
@@ -15,7 +16,6 @@ from importlib.resources import files

 import matplotlib

-
 matplotlib.use("Agg")

 import matplotlib.pylab as plt
@@ -31,21 +31,22 @@ from vocos import Vocos
 from f5_tts.model import CFM
 from f5_tts.model.utils import convert_char_to_pinyin, get_tokenizer

-
 _ref_audio_cache = {}
 _ref_text_cache = {}

 device = (
     "cuda"
     if torch.cuda.is_available()
-    else
 )

-tempfile_kwargs =

 # -----------------------------------------

@@ -87,12 +88,23 @@ def chunk_text(text, max_chars=135):
     sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[;:,。!?])", text)

     for sentence in sentences:
-        if
-            current_chunk
         else:
             if current_chunk:
                 chunks.append(current_chunk.strip())
-            current_chunk =

     if current_chunk:
         chunks.append(current_chunk.strip())
@@ -101,7 +113,13 @@ def chunk_text(text, max_chars=135):


 # load vocoder
-def load_vocoder(
     if vocoder_name == "vocos":
         # vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
         if is_local:
@@ -111,8 +129,12 @@ def load_vocoder(vocoder_name="vocos", is_local=False, local_path="", device=dev
         else:
             print("Download Vocos from huggingface charactr/vocos-mel-24khz")
             repo_id = "charactr/vocos-mel-24khz"
-            config_path = hf_hub_download(
             vocoder = Vocos.from_hparams(config_path)
             state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
             from vocos.feature_extractors import EncodecFeatures
@@ -129,13 +151,17 @@ def load_vocoder(vocoder_name="vocos", is_local=False, local_path="", device=dev
         try:
             from third_party.BigVGAN import bigvgan
         except ImportError:
-            print(
         if is_local:
             # download generator from https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x/tree/main
             vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)
         else:
             vocoder = bigvgan.BigVGAN.from_pretrained(
-                "nvidia/bigvgan_v2_24khz_100band_256x",
             )

         vocoder.remove_weight_norm()
@@ -177,7 +203,11 @@ def transcribe(ref_audio, language=None):
         ref_audio,
         chunk_length_s=30,
         batch_size=128,
-        generate_kwargs=
         return_timestamps=False,
     )["text"].strip()

@@ -214,7 +244,10 @@ def load_checkpoint(model, ckpt_path, device: str, dtype=None, use_ema=True):
         }

         # patch for backward compatibility, 305e3ea
-        for key in [
             if key in checkpoint["model_state_dict"]:
                 del checkpoint["model_state_dict"][key]

@@ -253,7 +286,9 @@ def load_model(
     vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer)
     model = CFM(
-        transformer=model_cls(
         mel_spec_kwargs=dict(
             n_fft=n_fft,
             hop_length=hop_length,
@@ -276,7 +311,9 @@ def load_model(

 def remove_silence_edges(audio, silence_threshold=-42):
     # Remove silence from the start
-    non_silent_start_idx = silence.detect_leading_silence(
     audio = audio[non_silent_start_idx:]

     # Remove silence from the end
@@ -315,11 +352,18 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print):

     # 1. try to find long silence for clipping
     non_silent_segs = silence.split_on_silence(
-        aseg,
     )
     non_silent_wave = AudioSegment.silent(duration=0)
     for non_silent_seg in non_silent_segs:
-        if
             show_info("Audio is over 12s, clipping short. (1)")
             break
         non_silent_wave += non_silent_seg
|
@@ -327,11 +371,18 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print):
|
|
327 |
# 2. try to find short silence for clipping if 1. failed
|
328 |
if len(non_silent_wave) > 12000:
|
329 |
non_silent_segs = silence.split_on_silence(
|
330 |
-
aseg,
|
|
|
|
|
|
|
|
|
331 |
)
|
332 |
non_silent_wave = AudioSegment.silent(duration=0)
|
333 |
for non_silent_seg in non_silent_segs:
|
334 |
-
if
|
|
|
|
|
|
|
335 |
show_info("Audio is over 12s, clipping short. (2)")
|
336 |
break
|
337 |
non_silent_wave += non_silent_seg
|
@@ -399,7 +450,12 @@ def infer_process(
|
|
399 |
):
|
400 |
# Split the input text into batches
|
401 |
audio, sr = torchaudio.load(ref_audio)
|
402 |
-
max_chars = int(
|
|
|
|
|
|
|
|
|
|
|
403 |
gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
|
404 |
for i, gen_text in enumerate(gen_text_batches):
|
405 |
print(f"gen_text {i}", gen_text)
|
@@ -483,7 +539,9 @@ def infer_batch_process(
|
|
483 |
# Calculate duration
|
484 |
ref_text_len = len(ref_text.encode("utf-8"))
|
485 |
gen_text_len = len(gen_text.encode("utf-8"))
|
486 |
-
duration = ref_audio_len + int(
|
|
|
|
|
487 |
|
488 |
# inference
|
489 |
with torch.inference_mode():
|
@@ -519,12 +577,19 @@ def infer_batch_process(
|
|
519 |
yield generated_wave, generated_cpu
|
520 |
|
521 |
if streaming:
|
522 |
-
for gen_text in
|
|
|
|
|
|
|
|
|
523 |
for chunk in process_batch(gen_text):
|
524 |
yield chunk
|
525 |
else:
|
526 |
with ThreadPoolExecutor() as executor:
|
527 |
-
futures = [
|
|
|
|
|
|
|
528 |
for future in progress.tqdm(futures) if progress is not None else futures:
|
529 |
result = future.result()
|
530 |
if result:
|
@@ -545,7 +610,9 @@ def infer_batch_process(
|
|
545 |
|
546 |
# Calculate cross-fade samples, ensuring it does not exceed wave lengths
|
547 |
cross_fade_samples = int(cross_fade_duration * target_sample_rate)
|
548 |
-
cross_fade_samples = min(
|
|
|
|
|
549 |
|
550 |
if cross_fade_samples <= 0:
|
551 |
# No overlap possible, concatenate
|
@@ -561,11 +628,17 @@ def infer_batch_process(
|
|
561 |
fade_in = np.linspace(0, 1, cross_fade_samples)
|
562 |
|
563 |
# Cross-faded overlap
|
564 |
-
cross_faded_overlap =
|
|
|
|
|
565 |
|
566 |
# Combine
|
567 |
new_wave = np.concatenate(
|
568 |
-
[
|
|
|
|
|
|
|
|
|
569 |
)
|
570 |
|
571 |
final_wave = new_wave
|
|
|
4 |
import sys
|
5 |
from concurrent.futures import ThreadPoolExecutor
|
6 |
|
|
|
7 |
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # for MPS device compatibility
|
8 |
+
sys.path.append(
|
9 |
+
f"{os.path.dirname(os.path.abspath(__file__))}/../../third_party/BigVGAN/"
|
10 |
+
)
|
11 |
|
12 |
import hashlib
|
13 |
import re
|
|
|
16 |
|
17 |
import matplotlib
|
18 |
|
|
|
19 |
matplotlib.use("Agg")
|
20 |
|
21 |
import matplotlib.pylab as plt
|
|
|
31 |
from f5_tts.model import CFM
|
32 |
from f5_tts.model.utils import convert_char_to_pinyin, get_tokenizer
|
33 |
|
|
|
34 |
_ref_audio_cache = {}
|
35 |
_ref_text_cache = {}
|
36 |
|
37 |
device = (
|
38 |
"cuda"
|
39 |
if torch.cuda.is_available()
|
40 |
+
else (
|
41 |
+
"xpu"
|
42 |
+
if torch.xpu.is_available()
|
43 |
+
else "mps" if torch.backends.mps.is_available() else "cpu"
|
44 |
+
)
|
45 |
)
|
46 |
|
47 |
+
tempfile_kwargs = (
|
48 |
+
{"delete_on_close": False} if sys.version_info >= (3, 12) else {"delete": False}
|
49 |
+
)
|
50 |
|
51 |
# -----------------------------------------
|
52 |
|
|
|
88 |
sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[;:,。!?])", text)
|
89 |
|
90 |
for sentence in sentences:
|
91 |
+
if (
|
92 |
+
len(current_chunk.encode("utf-8")) + len(sentence.encode("utf-8"))
|
93 |
+
<= max_chars
|
94 |
+
):
|
95 |
+
current_chunk += (
|
96 |
+
sentence + " "
|
97 |
+
if sentence and len(sentence[-1].encode("utf-8")) == 1
|
98 |
+
else sentence
|
99 |
+
)
|
100 |
else:
|
101 |
if current_chunk:
|
102 |
chunks.append(current_chunk.strip())
|
103 |
+
current_chunk = (
|
104 |
+
sentence + " "
|
105 |
+
if sentence and len(sentence[-1].encode("utf-8")) == 1
|
106 |
+
else sentence
|
107 |
+
)
|
108 |
|
109 |
if current_chunk:
|
110 |
chunks.append(current_chunk.strip())
|
|
|
113 |
|
114 |
|
115 |
# load vocoder
|
116 |
+
def load_vocoder(
|
117 |
+
vocoder_name="vocos",
|
118 |
+
is_local=False,
|
119 |
+
local_path="",
|
120 |
+
device=device,
|
121 |
+
hf_cache_dir=None,
|
122 |
+
):
|
123 |
if vocoder_name == "vocos":
|
124 |
# vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
|
125 |
if is_local:
|
|
|
129 |
else:
|
130 |
print("Download Vocos from huggingface charactr/vocos-mel-24khz")
|
131 |
repo_id = "charactr/vocos-mel-24khz"
|
132 |
+
config_path = hf_hub_download(
|
133 |
+
repo_id=repo_id, cache_dir=hf_cache_dir, filename="config.yaml"
|
134 |
+
)
|
135 |
+
model_path = hf_hub_download(
|
136 |
+
repo_id=repo_id, cache_dir=hf_cache_dir, filename="pytorch_model.bin"
|
137 |
+
)
|
138 |
vocoder = Vocos.from_hparams(config_path)
|
139 |
state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
|
140 |
from vocos.feature_extractors import EncodecFeatures
|
|
|
151 |
try:
|
152 |
from third_party.BigVGAN import bigvgan
|
153 |
except ImportError:
|
154 |
+
print(
|
155 |
+
"You need to follow the README to init submodule and change the BigVGAN source code."
|
156 |
+
)
|
157 |
if is_local:
|
158 |
# download generator from https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x/tree/main
|
159 |
vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)
|
160 |
else:
|
161 |
vocoder = bigvgan.BigVGAN.from_pretrained(
|
162 |
+
"nvidia/bigvgan_v2_24khz_100band_256x",
|
163 |
+
use_cuda_kernel=False,
|
164 |
+
cache_dir=hf_cache_dir,
|
165 |
)
|
166 |
|
167 |
vocoder.remove_weight_norm()
|
|
|
203 |
ref_audio,
|
204 |
chunk_length_s=30,
|
205 |
batch_size=128,
|
206 |
+
generate_kwargs=(
|
207 |
+
{"task": "transcribe", "language": language}
|
208 |
+
if language
|
209 |
+
else {"task": "transcribe"}
|
210 |
+
),
|
211 |
return_timestamps=False,
|
212 |
)["text"].strip()
|
213 |
|
|
|
244 |
}
|
245 |
|
246 |
# patch for backward compatibility, 305e3ea
|
247 |
+
for key in [
|
248 |
+
"mel_spec.mel_stft.mel_scale.fb",
|
249 |
+
"mel_spec.mel_stft.spectrogram.window",
|
250 |
+
]:
|
251 |
if key in checkpoint["model_state_dict"]:
|
252 |
del checkpoint["model_state_dict"][key]
|
253 |
|
|
|
286 |
|
287 |
vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer)
|
288 |
model = CFM(
|
289 |
+
transformer=model_cls(
|
290 |
+
**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels
|
291 |
+
),
|
292 |
mel_spec_kwargs=dict(
|
293 |
n_fft=n_fft,
|
294 |
hop_length=hop_length,
|
|
|
311 |
|
312 |
def remove_silence_edges(audio, silence_threshold=-42):
|
313 |
# Remove silence from the start
|
314 |
+
non_silent_start_idx = silence.detect_leading_silence(
|
315 |
+
audio, silence_threshold=silence_threshold
|
316 |
+
)
|
317 |
audio = audio[non_silent_start_idx:]
|
318 |
|
319 |
# Remove silence from the end
|
|
|
352 |
|
353 |
# 1. try to find long silence for clipping
|
354 |
non_silent_segs = silence.split_on_silence(
|
355 |
+
aseg,
|
356 |
+
min_silence_len=1000,
|
357 |
+
silence_thresh=-50,
|
358 |
+
keep_silence=1000,
|
359 |
+
seek_step=10,
|
360 |
)
|
361 |
non_silent_wave = AudioSegment.silent(duration=0)
|
362 |
for non_silent_seg in non_silent_segs:
|
363 |
+
if (
|
364 |
+
len(non_silent_wave) > 6000
|
365 |
+
and len(non_silent_wave + non_silent_seg) > 12000
|
366 |
+
):
|
367 |
show_info("Audio is over 12s, clipping short. (1)")
|
368 |
break
|
369 |
non_silent_wave += non_silent_seg
|
|
|
371 |
# 2. try to find short silence for clipping if 1. failed
|
372 |
if len(non_silent_wave) > 12000:
|
373 |
non_silent_segs = silence.split_on_silence(
|
374 |
+
aseg,
|
375 |
+
min_silence_len=100,
|
376 |
+
silence_thresh=-40,
|
377 |
+
keep_silence=1000,
|
378 |
+
seek_step=10,
|
379 |
)
|
380 |
non_silent_wave = AudioSegment.silent(duration=0)
|
381 |
for non_silent_seg in non_silent_segs:
|
382 |
+
if (
|
383 |
+
len(non_silent_wave) > 6000
|
384 |
+
and len(non_silent_wave + non_silent_seg) > 12000
|
385 |
+
):
|
386 |
show_info("Audio is over 12s, clipping short. (2)")
|
387 |
break
|
388 |
non_silent_wave += non_silent_seg
|
|
|
450 |
):
|
451 |
# Split the input text into batches
|
452 |
audio, sr = torchaudio.load(ref_audio)
|
453 |
+
max_chars = int(
|
454 |
+
len(ref_text.encode("utf-8"))
|
455 |
+
/ (audio.shape[-1] / sr)
|
456 |
+
* (22 - audio.shape[-1] / sr)
|
457 |
+
* speed
|
458 |
+
)
|
459 |
gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
|
460 |
for i, gen_text in enumerate(gen_text_batches):
|
461 |
print(f"gen_text {i}", gen_text)
|
|
|
539 |
# Calculate duration
|
540 |
ref_text_len = len(ref_text.encode("utf-8"))
|
541 |
gen_text_len = len(gen_text.encode("utf-8"))
|
542 |
+
duration = ref_audio_len + int(
|
543 |
+
ref_audio_len / ref_text_len * gen_text_len / local_speed
|
544 |
+
)
|
545 |
|
546 |
# inference
|
547 |
with torch.inference_mode():
|
|
|
577 |
yield generated_wave, generated_cpu
|
578 |
|
579 |
if streaming:
|
580 |
+
for gen_text in (
|
581 |
+
progress.tqdm(gen_text_batches)
|
582 |
+
if progress is not None
|
583 |
+
else gen_text_batches
|
584 |
+
):
|
585 |
for chunk in process_batch(gen_text):
|
586 |
yield chunk
|
587 |
else:
|
588 |
with ThreadPoolExecutor() as executor:
|
589 |
+
futures = [
|
590 |
+
executor.submit(process_batch, gen_text)
|
591 |
+
for gen_text in gen_text_batches
|
592 |
+
]
|
593 |
for future in progress.tqdm(futures) if progress is not None else futures:
|
594 |
result = future.result()
|
595 |
if result:
|
|
|
610 |
|
611 |
# Calculate cross-fade samples, ensuring it does not exceed wave lengths
|
612 |
cross_fade_samples = int(cross_fade_duration * target_sample_rate)
|
613 |
+
cross_fade_samples = min(
|
614 |
+
cross_fade_samples, len(prev_wave), len(next_wave)
|
615 |
+
)
|
616 |
|
617 |
if cross_fade_samples <= 0:
|
618 |
# No overlap possible, concatenate
|
|
|
628 |
fade_in = np.linspace(0, 1, cross_fade_samples)
|
629 |
|
630 |
# Cross-faded overlap
|
631 |
+
cross_faded_overlap = (
|
632 |
+
prev_overlap * fade_out + next_overlap * fade_in
|
633 |
+
)
|
634 |
|
635 |
# Combine
|
636 |
new_wave = np.concatenate(
|
637 |
+
[
|
638 |
+
prev_wave[:-cross_fade_samples],
|
639 |
+
cross_faded_overlap,
|
640 |
+
next_wave[cross_fade_samples:],
|
641 |
+
]
|
642 |
)
|
643 |
|
644 |
final_wave = new_wave
|
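The batch-joining code above linearly cross-fades the overlap between consecutive generated waves. A self-contained sketch of that blend with synthetic stand-in arrays; the symmetric fade_out ramp is assumed from context, since only fade_in appears in the hunk shown here.

import numpy as np

target_sample_rate, cross_fade_duration = 24000, 0.15
prev_wave = np.random.randn(24000).astype(np.float32)
next_wave = np.random.randn(24000).astype(np.float32)

cross_fade_samples = min(int(cross_fade_duration * target_sample_rate), len(prev_wave), len(next_wave))
if cross_fade_samples <= 0:
    new_wave = np.concatenate([prev_wave, next_wave])  # no overlap possible, just concatenate
else:
    prev_overlap = prev_wave[-cross_fade_samples:]
    next_overlap = next_wave[:cross_fade_samples]
    fade_out = np.linspace(1, 0, cross_fade_samples)   # assumed mirror of fade_in
    fade_in = np.linspace(0, 1, cross_fade_samples)
    cross_faded = prev_overlap * fade_out + next_overlap * fade_in
    new_wave = np.concatenate([prev_wave[:-cross_fade_samples], cross_faded, next_wave[cross_fade_samples:]])
print(new_wave.shape)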
f5_tts/model/__init__.py
CHANGED
from f5_tts.model.backbones.dit import DiT
from f5_tts.model.backbones.mmdit import MMDiT
from f5_tts.model.backbones.unett import UNetT
from f5_tts.model.cfm import CFM
from f5_tts.model.trainer import Trainer

__all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"]
f5_tts/model/backbones/dit.py
CHANGED
from __future__ import annotations

import torch
import torch.nn.functional as F
from torch import nn
from x_transformers.x_transformers import RotaryEmbedding

from f5_tts.model.modules import (AdaLayerNormZero_Final, ConvNeXtV2Block,
                                  ConvPositionEmbedding, DiTBlock,
                                  TimestepEmbedding, get_pos_embed_indices,
                                  precompute_freqs_cis)

# Text embedding


class TextEmbedding(nn.Module):
    def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
        super().__init__()
        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)  # use 0 as filler token

        if conv_layers > 0:
            self.extra_modeling = True
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            self.register_buffer(
                "freqs_cis",
                precompute_freqs_cis(text_dim, self.precompute_max_pos),
                persistent=False,
            )
            self.text_blocks = nn.Sequential(
                *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
            )
        else:
            self.extra_modeling = False

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
        text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
        batch, text_len = text.shape[0], text.shape[1]
        text = F.pad(text, (0, seq_len - text_len), value=0)

        if drop_text:  # cfg for text
            text = torch.zeros_like(text)

        text = self.text_embed(text)  # b n -> b n d

        # possible extra modeling
        if self.extra_modeling:
            # sinus pos emb
            batch_start = torch.zeros((batch,), dtype=torch.long)
            pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
            text_pos_embed = self.freqs_cis[pos_idx]
            text = text + text_pos_embed

...

        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(
        self,
        x: float["b n d"],
        cond: float["b n d"],
        text_embed: float["b n d"],
        drop_audio_cond=False,
    ):  # noqa: F722
        if drop_audio_cond:  # cfg for cond audio
            cond = torch.zeros_like(cond)

...

        if second_time:
            self.time_embed2 = TimestepEmbedding(dim)
            # Zero-init the weights and biases of the first and last Linear layers in time_mlp
            nn.init.zeros_(self.time_embed2.time_mlp[0].weight)   # First Linear layer weights
            nn.init.zeros_(self.time_embed2.time_mlp[0].bias)     # First Linear layer bias
            nn.init.zeros_(self.time_embed2.time_mlp[-1].weight)  # Last Linear layer weights
            nn.init.zeros_(self.time_embed2.time_mlp[-1].bias)    # Last Linear layer bias
        else:
            self.time_embed2 = None

        if text_dim is None:
            text_dim = mel_dim
        self.vocab_size = text_num_embeds
        self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

...

        self.depth = depth

        self.transformer_blocks = nn.ModuleList(
            [
                DiTBlock(dim=dim, heads=heads, dim_head=dim_head, ff_mult=ff_mult, dropout=dropout)
                for _ in range(depth)
            ]
        )
        self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None

        self.norm_out = AdaLayerNormZero_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

...

        if second_time is not None and self.time_embed2 is not None:
            t2 = self.time_embed2(second_time)
            t = t + t2

        text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
        x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)

...

        for block in self.transformer_blocks:
            if self.checkpoint_activations:
                x = torch.utils.checkpoint.checkpoint(self.ckpt_wrapper(block), x, t, mask, rope)
            else:
                x = block(x, t, mask=mask, rope=rope)
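TextEmbedding above relies on a collation convention: character indices are batch-padded with -1, and the `text + 1` shift turns that padding into filler token 0 while real tokens start at 1, so the embedding table needs one extra row. A minimal sketch of that convention; the toy vocabulary and the assumed behaviour of list_str_to_idx (padding with -1) are illustrative, not taken from the source.

import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

vocab = {"h": 0, "e": 1, "l": 2, "o": 3, " ": 4, "w": 5, "r": 6, "d": 7}  # toy vocab
texts = ["hello", "world"]

# pad variable-length index lists with -1, mirroring the described list_str_to_idx() preprocessing
idx = [torch.tensor([vocab[c] for c in t], dtype=torch.long) for t in texts]
text = pad_sequence(idx, padding_value=-1, batch_first=True)

seq_len = 10                      # mel-frame length the text sequence is aligned to
text = text + 1                   # -1 padding -> filler token 0, real ids start at 1
text = text[:, :seq_len]          # curtail if there are more characters than mel frames
text = F.pad(text, (0, seq_len - text.shape[1]), value=0)

embed = torch.nn.Embedding(len(vocab) + 1, 16)  # +1 row for the filler token 0
print(embed(text).shape)          # (batch, seq_len, 16)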
f5_tts/model/backbones/mmdit.py
CHANGED
from __future__ import annotations

import torch
import torch.nn.functional as F
from torch import nn
from x_transformers.x_transformers import RotaryEmbedding

from f5_tts.model.modules import (AdaLayerNormZero_Final,
                                  ConvPositionEmbedding, DiTBlock, MMDiTBlock,
                                  TimestepEmbedding, get_pos_embed_indices,
                                  precompute_freqs_cis)
from f5_tts.model.utils import (default, exists, lens_to_mask, list_str_to_idx,
                                list_str_to_tensor, mask_from_frac_lengths)

# text embedding


class TextEmbedding(nn.Module):
    def __init__(self, out_dim, text_num_embeds):
        super().__init__()
        self.text_embed = nn.Embedding(text_num_embeds + 1, out_dim)  # will use 0 as filler token

        self.precompute_max_pos = 1024
        self.register_buffer(
            "freqs_cis",
            precompute_freqs_cis(out_dim, self.precompute_max_pos),
            persistent=False,
        )

    def forward(self, text: int["b nt"], drop_text=False) -> int["b nt d"]:  # noqa: F722
        text = text + 1
        if drop_text:
            text = torch.zeros_like(text)

...

        # sinus pos emb
        batch_start = torch.zeros((text.shape[0],), dtype=torch.long)
        batch_text_len = text.shape[1]
        pos_idx = get_pos_embed_indices(batch_start, batch_text_len, max_pos=self.precompute_max_pos)
        text_pos_embed = self.freqs_cis[pos_idx]

        text = text + text_pos_embed

...

        self.linear = nn.Linear(2 * in_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], drop_audio_cond=False):  # noqa: F722
        if drop_audio_cond:
            cond = torch.zeros_like(cond)
        x = torch.cat((x, cond), dim=-1)

...

        mel_dim=100,
        checkpoint_activations=False,
        text_encoder=True,
    ):
        super().__init__()

        self.time_embed = TimestepEmbedding(dim)
        if text_encoder:
            self.text_encoder = TextEncoder(
                text_num_embeds=text_num_embeds,
                text_dim=dim,
                depth=text_depth,
                heads=heads,
                dim_head=dim_head,
                ff_mult=ff_mult,
                dropout=dropout,
            )
        else:
            self.text_encoder = None
            self.text_embed = TextEmbedding(dim, text_num_embeds)

        self.audio_embed = AudioEmbedding(mel_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

...

        self.norm_out = AdaLayerNormZero_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

        self.checkpoint_activations = checkpoint_activations

    def forward(
        self,

...

            c = self.text_encoder(text, t, mask=text_mask, drop_text=drop_text)
        else:
            c = self.text_embed(text, drop_text=drop_text)

        x = self.audio_embed(x, cond, drop_audio_cond=drop_audio_cond)

        seq_len = x.shape[1]
        text_len = text.shape[1]
        rope_audio = self.rotary_embed.forward_from_seq_len(seq_len)
        rope_text = self.rotary_embed.forward_from_seq_len(text_len)

        # if mask is not None:
        #     rope_audio = self.rotary_embed.forward_from_seq_len(seq_len + 1)
        #     (commented-out experiment: append a dummy, always-masked token to x and c,
        #      pad mask / text_mask with a False column, and extend rope by one position)
        # if text_mask is not None:
        #     rope_text = self.rotary_embed.forward_from_seq_len(text_len + 1)

        for block in self.transformer_blocks:
            c, x = block(x, c, t, mask=mask, src_mask=text_mask, rope=rope_audio, c_rope=rope_text)

        x = self.norm_out(x, t)
        output = self.proj_out(x)

        return output


class TextEncoder(nn.Module):
    def __init__(
        self,

...

        # Embeddings
        self.text_embed = TextEmbedding(text_dim, text_num_embeds)
        self.rotary_embed = RotaryEmbedding(dim_head)

        # Example stack of DiTBlocks or any custom blocks
        self.transformer_blocks = nn.ModuleList(
            [

...

        text: int["b nt"],  # noqa: F821
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        mask: bool["b nt"] | None = None,  # noqa: F821 F722
        drop_text: bool = False,
    ):
        """
        Encode text into hidden states of shape [b, nt, d].

...

        # Basic embedding
        hidden_states = self.text_embed(text, seq_len)  # [b, nt, d]

        # lens and mask
        rope = self.rotary_embed.forward_from_seq_len(seq_len)

...

            # Here, you likely want standard self-attn, so no cross-attn
            hidden_states = block(
                x=hidden_states,
                t=time,     # no time embedding for the text encoder by default
                mask=mask,  # or pass a text mask if needed
                rope=rope,  # pass a rope if you want rotary embeddings for text
            )
        return hidden_states


if __name__ == "__main__":
    from f5_tts.model.utils import get_tokenizer

    bsz = 16

    tokenizer = "pinyin"  # 'pinyin', 'char', or 'custom'
    tokenizer_path = None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
    dataset_name = "Emilia_ZH_EN"

...

    else:
        tokenizer_path = dataset_name
    vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)

    text = ["hello world"] * bsz
    text_lens = torch.ones((bsz,), dtype=torch.long) * len("hello world")
    text_lens[-1] = 5
    device = "cuda"
    batch = bsz
    time_embed = TimestepEmbedding(512).to(device)

    # handle text as string
    if isinstance(text, list):
        if exists(vocab_char_map):
            text = list_str_to_idx(text, vocab_char_map).to(device)
        else:
            text = list_str_to_tensor(text).to(device)
        assert text.shape[0] == batch

    time = torch.rand((batch,), device=device)
    text_mask = lens_to_mask(text_lens).to(device)

...

    # ).to('cuda')
    # hidden_states = text_encoder(text, time_embed(time), mask)
    # print(hidden_states.shape)  # [bsz, seq_len, text_dim]

    # test MMDiT
    mel_dim = 80
    model = MMDiT(

...

        dropout=0.1,
        ff_mult=4,
        text_num_embeds=vocab_size,
        mel_dim=mel_dim,
    ).to(device)

    x = torch.rand((batch, 100, mel_dim), device=device)
    cond = torch.rand((batch, 100, mel_dim), device=device)
    lens = torch.ones((batch,), dtype=torch.long) * 100
    mask = lens_to_mask(lens).to(device)

    output = model(
        x,
        cond,
        text,
        time,
        drop_audio_cond=False,
        drop_text=False,
        mask=mask,
        text_mask=text_mask,
    )

    print(output.shape)  # [bsz, seq_len, mel_dim]
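The smoke test above leans on lens_to_mask to turn per-sample lengths into boolean padding masks. A minimal stand-in is shown below under the assumption that the helper follows the usual mask[b, n] = n < lens[b] convention; this is an assumption about its semantics, not the verbatim source of f5_tts.model.utils.

import torch

def lens_to_mask_sketch(lens, length=None):
    # Assumed semantics: True for the first lens[b] positions of row b, False for padding.
    length = int(lens.max()) if length is None else length
    return torch.arange(length, device=lens.device)[None, :] < lens[:, None]

print(lens_to_mask_sketch(torch.tensor([3, 5, 2]), length=6))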
f5_tts/model/backbones/unett.py
CHANGED
from __future__ import annotations

from typing import Literal

import torch
import torch.nn.functional as F
from torch import nn
from x_transformers import RMSNorm
from x_transformers.x_transformers import RotaryEmbedding

from f5_tts.model.modules import (Attention, AttnProcessor, ConvNeXtV2Block,
                                  ConvPositionEmbedding, FeedForward,
                                  TimestepEmbedding, get_pos_embed_indices,
                                  precompute_freqs_cis)

# Text embedding


class TextEmbedding(nn.Module):
    def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
        super().__init__()
        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)  # use 0 as filler token

        if conv_layers > 0:
            self.extra_modeling = True
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            self.register_buffer(
                "freqs_cis",
                precompute_freqs_cis(text_dim, self.precompute_max_pos),
                persistent=False,
            )
            self.text_blocks = nn.Sequential(
                *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
            )
        else:
            self.extra_modeling = False

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
        text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
        batch, text_len = text.shape[0], text.shape[1]
        text = F.pad(text, (0, seq_len - text_len), value=0)

...

        if self.extra_modeling:
            # sinus pos emb
            batch_start = torch.zeros((batch,), dtype=torch.long)
            pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
            text_pos_embed = self.freqs_cis[pos_idx]
            text = text + text_pos_embed

...

        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(
        self,
        x: float["b n d"],
        cond: float["b n d"],
        text_embed: float["b n d"],
        drop_audio_cond=False,
    ):  # noqa: F722
        if drop_audio_cond:  # cfg for cond audio
            cond = torch.zeros_like(cond)

...

        self.time_embed = TimestepEmbedding(dim)
        if text_dim is None:
            text_dim = mel_dim
        self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

...

            ff_norm = RMSNorm(dim)
            ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

            skip_proj = nn.Linear(dim * 2, dim, bias=False) if needs_skip_proj and is_later_half else None

            self.layers.append(
                nn.ModuleList(

...

        # flat unet transformer
        skip_connect_type = self.skip_connect_type
        skips = []
        for idx, (maybe_skip_proj, attn_norm, attn, ff_norm, ff) in enumerate(self.layers):
            layer = idx + 1

            # skip connection logic
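In the flat UNet transformer above, later-half layers may fuse the current states with states popped from the skip stack and project the concatenation back to the model width ("concat" skip type, via skip_proj). A small sketch of that pattern with toy shapes; this is an illustration of the fusion step only, not the full layer loop.

import torch
from torch import nn

dim, batch, seq = 64, 2, 10
skip_proj = nn.Linear(dim * 2, dim, bias=False)  # only built for later-half layers needing a 'concat' skip

x = torch.randn(batch, seq, dim)      # current hidden states
skip = torch.randn(batch, seq, dim)   # states saved by the mirrored earlier layer

x = skip_proj(torch.cat((x, skip), dim=-1))  # fuse and project back to model width
print(x.shape)                               # (batch, seq, dim)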
f5_tts/model/cfm.py
CHANGED
from torchdiffeq import odeint

from f5_tts.model.modules import MelSpec
from f5_tts.model.utils import (default, exists, lens_to_mask, list_str_to_idx,
                                list_str_to_tensor, mask_from_frac_lengths)


class CFM(nn.Module):

...

        # vocab map for tokenization
        self.vocab_char_map = vocab_char_map

        self.scale = scale

    @property

...

        assert cond.shape[-1] == self.num_channels

        cond = cond.to(next(self.parameters()).dtype)

        print(self.scale)

        cond = cond / self.scale

        batch, cond_seq_len, device = *cond.shape[:2], cond.device
        if not exists(lens):
            lens = torch.full((batch,), cond_seq_len, device=device, dtype=torch.long)

...

        if exists(text):
            text_lens = (text != -1).sum(dim=-1)
            lens = torch.maximum(text_lens, lens)  # make sure lengths are at least those of the text characters

        # duration

...

        if isinstance(duration, int):
            duration = torch.full((batch,), duration, device=device, dtype=torch.long)

        duration = torch.maximum(lens + 1, duration)  # just add one token so something is generated
        duration = duration.clamp(max=max_duration)
        max_duration = duration.amax()

        # duplicate test corner for inner time step oberservation
        if duplicate_test:
            test_cond = F.pad(cond, (0, 0, cond_seq_len, max_duration - 2 * cond_seq_len), value=0.0)

        cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0)
        if no_ref_audio:
            cond = torch.zeros_like(cond)

        cond_mask = F.pad(cond_mask, (0, max_duration - cond_mask.shape[-1]), value=False)
        cond_mask = cond_mask.unsqueeze(-1)
        step_cond = torch.where(
            cond_mask, cond, torch.zeros_like(cond)

...

            # predict flow
            pred = self.transformer(
                x=x, cond=step_cond, text=text, time=t, mask=mask, drop_audio_cond=False, drop_text=False
            )
            if cfg_strength < 1e-5:
                return pred

            null_pred = self.transformer(
                x=x, cond=step_cond, text=text, time=t, mask=mask, drop_audio_cond=True, drop_text=True
            )
            return pred + (pred - null_pred) * cfg_strength

...

        for dur in duration:
            if exists(seed):
                torch.manual_seed(seed)
            y0.append(torch.randn(dur, self.num_channels, device=self.device, dtype=step_cond.dtype))
        y0 = pad_sequence(y0, padding_value=0, batch_first=True)

        t_start = 0

...

            y0 = (1 - t_start) * y0 + t_start * test_cond
            steps = int(steps * (1 - t_start))

        t = torch.linspace(t_start, 1, steps + 1, device=self.device, dtype=step_cond.dtype)
        if sway_sampling_coef is not None:
            t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)

...

        out = torch.where(cond_mask, cond, out)

        out = out * self.scale

        if exists(vocoder):
            out = out.permute(0, 2, 1)
            out = vocoder(out)

...

        inp = inp.permute(0, 2, 1)
        assert inp.shape[-1] == self.num_channels

        batch, seq_len, dtype, device, _σ1 = *inp.shape[:2], inp.dtype, self.device, self.sigma

        # handle text as string
        if isinstance(text, list):

...

        if not exists(lens):
            lens = torch.full((batch,), seq_len, device=device)

        mask = lens_to_mask(lens, length=seq_len)  # useless here, as collate_fn will pad to max length in batch

        # get a random span to mask out for training conditionally
        frac_lengths = torch.zeros((batch,), device=self.device).float().uniform_(*self.frac_lengths_mask)
        rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)

        if exists(mask):

...

        # if want rigourously mask out padding, record in collate_fn in dataset.py, and pass in here
        # adding mask will use more memory, thus also need to adjust batchsampler with scaled down threshold for long sequences
        pred = self.transformer(
            x=φ, cond=cond, text=text, time=time, drop_audio_cond=drop_audio_cond, drop_text=drop_text
        )

        # flow matching loss
        loss = F.mse_loss(pred, flow, reduction="none")
        loss = loss[rand_span_mask]

        return loss.mean(), cond, pred, t
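The sampling loop above warps the uniform ODE time grid with the sway term t + s * (cos(pi/2 * t) - 1 + t); the endpoints 0 and 1 are preserved, and negative coefficients concentrate steps near t = 0. A standalone look at the effect on the step spacing; the coefficient value here is just an example, not a claim about the project's default.

import torch

steps, sway_sampling_coef = 8, -1.0
t = torch.linspace(0, 1, steps + 1)
t_sway = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
print(t.tolist())
print(t_sway.tolist())  # same endpoints, but a denser grid near t = 0 for negative coefficients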
f5_tts/model/dataset.py
CHANGED
import json
import random
from importlib.resources import files

import torch

...

from f5_tts.model.modules import MelSpec
from f5_tts.model.utils import default


def get_speaker_id(path):
    parts = path.split(
    speaker_id = parts[-3]
    return speaker_id

...

        return_wavform=False,
        remove_starting_space=True,
        need_prompt_speech=False,
        prompt_repository: dict = None,
    ):
        self.data = custom_dataset
        self.durations = durations

...

                mel_spec_type=mel_spec_type,
            ),
        )
        self.validation = validation
        self.validation_num = validation_num

        if (not validation) and data_augmentation:
            print(
            self.augment = Compose([
                ...
                min_amplitude=0.001,
                ...
                p=0
                ...
                ApplyImpulseResponse(ir_path="/data5/Audio", p=1.0),
                Aliasing(min_sample_rate=4000, max_sample_rate=30000, p=0.3),
                BandPassFilter(min_center_freq=100.0, max_center_freq=6000, p=0.2),
                SevenBandParametricEQ(p=0.2),
                TanhDistortion(
                    min_distortion=0.01,
                    max_distortion=0.7,
                    p=0.2
                ),
            ])
        else:
            print(
            self.augment = None

        self.return_wavform = return_wavform

...

                text = row["text"]
                duration = row["duration"]
                spk_id = get_speaker_id(audio_path)
                assert spk_id != None and spk_id !=
                if spk_id not in self.prompt_repository:
                    self.prompt_repository[spk_id] = [row]
                else:

...

            else:
                self.prompt_repository = prompt_repository

            print(
            self.need_prompt_speech = True

        else:
            self.need_prompt_speech = False

    def get_frame_len(self, index):
        if self.validation:
            index += len(self.data) - self.validation_num

...

            index = (index + 1) % len(self.data)

        if self.remove_starting_space:
            while len(text) > 1 and text[0] ==
                text = text[1:]

        if self.preprocessed_mel:
            mel_spec = torch.tensor(row["mel_spec"])
        else:

...

            # resample if necessary
            if source_sample_rate != self.target_sample_rate:
                resampler = torchaudio.transforms.Resample(
                audio = resampler(audio)

            if not self.validation:
                if self.augment != None:
                    audio = self.augment(
                    audio = torch.from_numpy(audio).float().unsqueeze(0)

            # to mel spectrogram
            mel_spec = self.mel_spectrogram(audio)
            mel_spec = mel_spec.squeeze(0)  # '1 d t -> d t'

        out[
        out[
        out[
        out[

        if self.return_wavform:
            out[

        if return_path:
            out[

        if return_row:
            out[

        # Sample a prompt speech of the same speaker
        # From prompt_repository

...

            _count = 100
            while True:
                pmt_row = random.choice(spk_repository)
                pmt_audio_path = pmt_row[
                pmt_text = pmt_row[
                pmt_duration = pmt_row[

                if not isinstance(pmt_text, list):
                    pmt_text = list(pmt_text)

...

                if 0.3 <= pmt_duration <= 30 and (0 < len(pmt_text) < 2048):
                    if pmt_text != text:
                        break
                _count =
                if _count <= 0:
                    break

            if self.remove_starting_space:
                while len(pmt_text) > 1 and pmt_text[0] ==
                    pmt_text = pmt_text[1:]

            if self.preprocessed_mel:
                pmt_mel_spec = torch.tensor(pmt_row["mel_spec"])
            else:

...

                # resample if necessary
                if source_sample_rate != self.target_sample_rate:
                    resampler = torchaudio.transforms.Resample(
                    pmt_audio = resampler(pmt_audio)

                if not self.validation:
                    if self.augment != None:
                        pmt_audio = self.augment(
                        pmt_audio = torch.from_numpy(pmt_audio).float().unsqueeze(0)

                # to mel spectrogram
                pmt_mel_spec = self.mel_spectrogram(pmt_audio)
                pmt_mel_spec = pmt_mel_spec.squeeze(0)  # '1 d t -> d t'

            out[
            out[
            out[

            if self.return_wavform:
                out[

            if return_path:
                out[

            if return_row:
                out[

        return out

...

class DynamicBatchSampler(Sampler[list[int]]):
    """

    def __init__(
        self,
    ):
        self.sampler = sampler
        self.frames_threshold = frames_threshold

...

        # indices, desc=f"Creating dynamic batches with {frames_threshold} audio frames per gpu"
|
303 |
# ):
|
304 |
for idx, frame_len in indices:
|
305 |
-
if batch_frames + frame_len <= self.frames_threshold and (
|
|
|
|
|
306 |
batch.append(idx)
|
307 |
batch_frames += frame_len
|
308 |
else:
|
@@ -337,6 +347,7 @@ class DynamicBatchSampler(Sampler[list[int]]):
|
|
337 |
|
338 |
# Load dataset
|
339 |
|
|
|
340 |
def load_dataset(
|
341 |
dataset_name: str,
|
342 |
tokenizer: str = "pinyin",
|
@@ -349,7 +360,7 @@ def load_dataset(
|
|
349 |
return_wavform: bool = False,
|
350 |
remove_starting_space: bool = True,
|
351 |
need_prompt_speech: bool = False,
|
352 |
-
prompt_repository: dict = None
|
353 |
) -> CustomDataset:
|
354 |
"""
|
355 |
dataset_type - "CustomDataset" if you want to use tokenizer name and default data path to load for train_dataset
|
@@ -359,9 +370,13 @@ def load_dataset(
|
|
359 |
print("Loading dataset ...")
|
360 |
|
361 |
if dataset_type == "CustomDataset":
|
362 |
-
rel_data_path = str(
|
363 |
-
|
364 |
-
|
|
|
|
|
|
|
|
|
365 |
if audio_type == "raw":
|
366 |
try:
|
367 |
train_dataset = load_from_disk(f"{rel_data_path}/raw")
|
@@ -385,7 +400,7 @@ def load_dataset(
|
|
385 |
return_wavform=return_wavform,
|
386 |
remove_starting_space=remove_starting_space,
|
387 |
need_prompt_speech=need_prompt_speech,
|
388 |
-
prompt_repository=prompt_repository
|
389 |
)
|
390 |
|
391 |
elif dataset_type == "CustomDatasetPath":
|
@@ -398,7 +413,10 @@ def load_dataset(
|
|
398 |
data_dict = json.load(f)
|
399 |
durations = data_dict["duration"]
|
400 |
train_dataset = CustomDataset(
|
401 |
-
train_dataset,
|
|
|
|
|
|
|
402 |
)
|
403 |
|
404 |
return train_dataset
|
@@ -410,7 +428,7 @@ def collate_fn(batch):
|
|
410 |
mel_specs = [item["mel_spec"].squeeze(0) for item in batch]
|
411 |
mel_lengths = torch.LongTensor([spec.shape[-1] for spec in mel_specs])
|
412 |
max_mel_length = mel_lengths.amax()
|
413 |
-
|
414 |
# Pad mel_specs
|
415 |
padded_mel_specs = []
|
416 |
for spec in mel_specs: # TODO. maybe records mask for attention here
|
@@ -419,8 +437,8 @@ def collate_fn(batch):
|
|
419 |
padded_mel_specs.append(padded_spec)
|
420 |
mel_specs = torch.stack(padded_mel_specs)
|
421 |
|
422 |
-
text = [item[
|
423 |
-
target_text = [item[
|
424 |
|
425 |
text_lengths = torch.LongTensor([len(item) for item in text])
|
426 |
|
@@ -432,26 +450,26 @@ def collate_fn(batch):
|
|
432 |
target_text=target_text,
|
433 |
)
|
434 |
|
435 |
-
if
|
436 |
pmt_mel_specs = [item["pmt_mel_spec"].squeeze(0) for item in batch]
|
437 |
pmt_mel_lengths = torch.LongTensor([spec.shape[-1] for spec in pmt_mel_specs])
|
438 |
max_pmt_mel_length = pmt_mel_lengths.amax()
|
439 |
-
|
440 |
# Pad mel_specs
|
441 |
padded_pmt_mel_specs = []
|
442 |
-
for spec in pmt_mel_specs:
|
443 |
padding = (0, max_pmt_mel_length - spec.size(-1))
|
444 |
padded_spec = F.pad(spec, padding, value=0)
|
445 |
padded_pmt_mel_specs.append(padded_spec)
|
446 |
pmt_mel_specs = torch.stack(padded_pmt_mel_specs)
|
447 |
|
448 |
-
out[
|
449 |
|
450 |
-
if
|
451 |
-
pmt_text = [item[
|
452 |
pmt_text_lengths = torch.LongTensor([len(item) for item in pmt_text])
|
453 |
|
454 |
-
out[
|
455 |
-
out[
|
456 |
|
457 |
-
return out
|
|
|
|
|
1 |
import json
|
2 |
import random
|
3 |
+
import re
|
4 |
from importlib.resources import files
|
5 |
|
6 |
import torch
|
|
|
15 |
from f5_tts.model.modules import MelSpec
|
16 |
from f5_tts.model.utils import default
|
17 |
|
18 |
+
|
19 |
def get_speaker_id(path):
|
20 |
+
parts = path.split("/")
|
21 |
speaker_id = parts[-3]
|
22 |
return speaker_id
|
23 |
|
|
|
41 |
return_wavform=False,
|
42 |
remove_starting_space=True,
|
43 |
need_prompt_speech=False,
|
44 |
+
prompt_repository: dict = None,
|
45 |
):
|
46 |
self.data = custom_dataset
|
47 |
self.durations = durations
|
|
|
64 |
mel_spec_type=mel_spec_type,
|
65 |
),
|
66 |
)
|
67 |
+
|
68 |
self.validation = validation
|
69 |
self.validation_num = validation_num
|
70 |
|
71 |
if (not validation) and data_augmentation:
|
72 |
+
print("Using data augmentation.")
|
73 |
+
self.augment = Compose(
|
74 |
+
[
|
75 |
+
AddBackgroundNoise(
|
76 |
+
sounds_path="/data5/ESC-50-master",
|
77 |
+
min_snr_db=3.0,
|
78 |
+
max_snr_db=30.0,
|
79 |
+
noise_transform=PolarityInversion(),
|
80 |
+
p=0.5,
|
81 |
+
),
|
82 |
+
AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
|
83 |
+
PitchShift(min_semitones=-12.0, max_semitones=12.0, p=0.8),
|
84 |
+
ApplyImpulseResponse(ir_path="/data5/Audio", p=1.0),
|
85 |
+
Aliasing(min_sample_rate=4000, max_sample_rate=30000, p=0.3),
|
86 |
+
BandPassFilter(min_center_freq=100.0, max_center_freq=6000, p=0.2),
|
87 |
+
SevenBandParametricEQ(p=0.2),
|
88 |
+
TanhDistortion(min_distortion=0.01, max_distortion=0.7, p=0.2),
|
89 |
+
]
|
90 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
else:
|
92 |
+
print("No data augmentation.")
|
93 |
self.augment = None
|
94 |
|
95 |
self.return_wavform = return_wavform
|
|
|
103 |
text = row["text"]
|
104 |
duration = row["duration"]
|
105 |
spk_id = get_speaker_id(audio_path)
|
106 |
+
assert spk_id != None and spk_id != "mp3"
|
107 |
if spk_id not in self.prompt_repository:
|
108 |
self.prompt_repository[spk_id] = [row]
|
109 |
else:
|
|
|
111 |
else:
|
112 |
self.prompt_repository = prompt_repository
|
113 |
|
114 |
+
print(
|
115 |
+
f"Grouped samples into {len(self.prompt_repository.keys())} speakers."
|
116 |
+
)
|
117 |
self.need_prompt_speech = True
|
118 |
|
119 |
else:
|
120 |
self.need_prompt_speech = False
|
121 |
|
|
|
122 |
def get_frame_len(self, index):
|
123 |
if self.validation:
|
124 |
index += len(self.data) - self.validation_num
|
|
|
156 |
index = (index + 1) % len(self.data)
|
157 |
|
158 |
if self.remove_starting_space:
|
159 |
+
while len(text) > 1 and text[0] == " ":
|
160 |
text = text[1:]
|
161 |
+
|
162 |
if self.preprocessed_mel:
|
163 |
mel_spec = torch.tensor(row["mel_spec"])
|
164 |
else:
|
|
|
170 |
|
171 |
# resample if necessary
|
172 |
if source_sample_rate != self.target_sample_rate:
|
173 |
+
resampler = torchaudio.transforms.Resample(
|
174 |
+
source_sample_rate, self.target_sample_rate
|
175 |
+
)
|
176 |
audio = resampler(audio)
|
177 |
|
178 |
if not self.validation:
|
179 |
if self.augment != None:
|
180 |
+
audio = self.augment(
|
181 |
+
audio.squeeze().numpy(), sample_rate=self.target_sample_rate
|
182 |
+
)
|
183 |
audio = torch.from_numpy(audio).float().unsqueeze(0)
|
184 |
|
185 |
# to mel spectrogram
|
186 |
mel_spec = self.mel_spectrogram(audio)
|
187 |
mel_spec = mel_spec.squeeze(0) # '1 d t -> d t'
|
188 |
|
189 |
+
out["mel_spec"] = mel_spec
|
190 |
+
out["text"] = text
|
191 |
+
out["duration"] = duration
|
192 |
+
out["target_text"] = self.data[(index + len(self.data) // 2) % len(self.data)][
|
193 |
+
"text"
|
194 |
+
]
|
195 |
|
196 |
if self.return_wavform:
|
197 |
+
out["wav"] = audio
|
198 |
|
199 |
if return_path:
|
200 |
+
out["path"] = audio_path
|
201 |
|
202 |
if return_row:
|
203 |
+
out["row"] = row
|
204 |
|
205 |
# Sample a prompt speech of the same speaker
|
206 |
# From prompt_repository
|
|
|
210 |
_count = 100
|
211 |
while True:
|
212 |
pmt_row = random.choice(spk_repository)
|
213 |
+
pmt_audio_path = pmt_row["audio_path"]
|
214 |
+
pmt_text = pmt_row["text"]
|
215 |
+
pmt_duration = pmt_row["duration"]
|
216 |
|
217 |
if not isinstance(pmt_text, list):
|
218 |
pmt_text = list(pmt_text)
|
|
|
221 |
if 0.3 <= pmt_duration <= 30 and (0 < len(pmt_text) < 2048):
|
222 |
if pmt_text != text:
|
223 |
break
|
224 |
+
_count = _count - 1
|
225 |
if _count <= 0:
|
226 |
break
|
227 |
|
228 |
if self.remove_starting_space:
|
229 |
+
while len(pmt_text) > 1 and pmt_text[0] == " ":
|
230 |
pmt_text = pmt_text[1:]
|
231 |
+
|
232 |
if self.preprocessed_mel:
|
233 |
pmt_mel_spec = torch.tensor(pmt_row["mel_spec"])
|
234 |
else:
|
|
|
240 |
|
241 |
# resample if necessary
|
242 |
if source_sample_rate != self.target_sample_rate:
|
243 |
+
resampler = torchaudio.transforms.Resample(
|
244 |
+
source_sample_rate, self.target_sample_rate
|
245 |
+
)
|
246 |
pmt_audio = resampler(pmt_audio)
|
247 |
|
248 |
if not self.validation:
|
249 |
if self.augment != None:
|
250 |
+
pmt_audio = self.augment(
|
251 |
+
pmt_audio.squeeze().numpy(),
|
252 |
+
sample_rate=self.target_sample_rate,
|
253 |
+
)
|
254 |
pmt_audio = torch.from_numpy(pmt_audio).float().unsqueeze(0)
|
255 |
|
256 |
# to mel spectrogram
|
257 |
pmt_mel_spec = self.mel_spectrogram(pmt_audio)
|
258 |
pmt_mel_spec = pmt_mel_spec.squeeze(0) # '1 d t -> d t'
|
259 |
|
260 |
+
out["pmt_mel_spec"] = pmt_mel_spec
|
261 |
+
out["pmt_text"] = pmt_text
|
262 |
+
out["pmt_duration"] = pmt_duration
|
263 |
|
264 |
if self.return_wavform:
|
265 |
+
out["pmt_wav"] = pmt_audio
|
266 |
|
267 |
if return_path:
|
268 |
+
out["pmt_path"] = pmt_audio_path
|
269 |
|
270 |
if return_row:
|
271 |
+
out["pmt_row"] = pmt_row
|
272 |
|
273 |
return out
|
274 |
|
|
|
283 |
"""
|
284 |
|
285 |
def __init__(
|
286 |
+
self,
|
287 |
+
sampler: Sampler[int],
|
288 |
+
frames_threshold: int,
|
289 |
+
max_samples=0,
|
290 |
+
random_seed=None,
|
291 |
+
drop_last: bool = False,
|
292 |
):
|
293 |
self.sampler = sampler
|
294 |
self.frames_threshold = frames_threshold
|
|
|
310 |
# indices, desc=f"Creating dynamic batches with {frames_threshold} audio frames per gpu"
|
311 |
# ):
|
312 |
for idx, frame_len in indices:
|
313 |
+
if batch_frames + frame_len <= self.frames_threshold and (
|
314 |
+
max_samples == 0 or len(batch) < max_samples
|
315 |
+
):
|
316 |
batch.append(idx)
|
317 |
batch_frames += frame_len
|
318 |
else:
|
|
|
347 |
|
348 |
# Load dataset
|
349 |
|
350 |
+
|
351 |
def load_dataset(
|
352 |
dataset_name: str,
|
353 |
tokenizer: str = "pinyin",
|
|
|
360 |
return_wavform: bool = False,
|
361 |
remove_starting_space: bool = True,
|
362 |
need_prompt_speech: bool = False,
|
363 |
+
prompt_repository: dict = None,
|
364 |
) -> CustomDataset:
|
365 |
"""
|
366 |
dataset_type - "CustomDataset" if you want to use tokenizer name and default data path to load for train_dataset
|
|
|
370 |
print("Loading dataset ...")
|
371 |
|
372 |
if dataset_type == "CustomDataset":
|
373 |
+
rel_data_path = str(
|
374 |
+
f"/home/yl4579/F5-TTS-diff/F5-TTS-DMD-flow-ds/data/{dataset_name}_{tokenizer}"
|
375 |
+
)
|
376 |
+
if "LibriTTS_100_360_500_char_pinyin" in rel_data_path:
|
377 |
+
rel_data_path = rel_data_path.replace(
|
378 |
+
"LibriTTS_100_360_500_char_pinyin", "LibriTTS_100_360_500_char"
|
379 |
+
)
|
380 |
if audio_type == "raw":
|
381 |
try:
|
382 |
train_dataset = load_from_disk(f"{rel_data_path}/raw")
|
|
|
400 |
return_wavform=return_wavform,
|
401 |
remove_starting_space=remove_starting_space,
|
402 |
need_prompt_speech=need_prompt_speech,
|
403 |
+
prompt_repository=prompt_repository,
|
404 |
)
|
405 |
|
406 |
elif dataset_type == "CustomDatasetPath":
|
|
|
413 |
data_dict = json.load(f)
|
414 |
durations = data_dict["duration"]
|
415 |
train_dataset = CustomDataset(
|
416 |
+
train_dataset,
|
417 |
+
durations=durations,
|
418 |
+
preprocessed_mel=preprocessed_mel,
|
419 |
+
**mel_spec_kwargs,
|
420 |
)
|
421 |
|
422 |
return train_dataset
|
|
|
428 |
mel_specs = [item["mel_spec"].squeeze(0) for item in batch]
|
429 |
mel_lengths = torch.LongTensor([spec.shape[-1] for spec in mel_specs])
|
430 |
max_mel_length = mel_lengths.amax()
|
431 |
+
|
432 |
# Pad mel_specs
|
433 |
padded_mel_specs = []
|
434 |
for spec in mel_specs: # TODO. maybe records mask for attention here
|
|
|
437 |
padded_mel_specs.append(padded_spec)
|
438 |
mel_specs = torch.stack(padded_mel_specs)
|
439 |
|
440 |
+
text = [item["text"] for item in batch]
|
441 |
+
target_text = [item["target_text"] for item in batch]
|
442 |
|
443 |
text_lengths = torch.LongTensor([len(item) for item in text])
|
444 |
|
|
|
450 |
target_text=target_text,
|
451 |
)
|
452 |
|
453 |
+
if "pmt_mel_spec" in batch[0]:
|
454 |
pmt_mel_specs = [item["pmt_mel_spec"].squeeze(0) for item in batch]
|
455 |
pmt_mel_lengths = torch.LongTensor([spec.shape[-1] for spec in pmt_mel_specs])
|
456 |
max_pmt_mel_length = pmt_mel_lengths.amax()
|
457 |
+
|
458 |
# Pad mel_specs
|
459 |
padded_pmt_mel_specs = []
|
460 |
+
for spec in pmt_mel_specs:
|
461 |
padding = (0, max_pmt_mel_length - spec.size(-1))
|
462 |
padded_spec = F.pad(spec, padding, value=0)
|
463 |
padded_pmt_mel_specs.append(padded_spec)
|
464 |
pmt_mel_specs = torch.stack(padded_pmt_mel_specs)
|
465 |
|
466 |
+
out["pmt_mel_specs"] = pmt_mel_specs
|
467 |
|
468 |
+
if "pmt_text" in batch[0]:
|
469 |
+
pmt_text = [item["pmt_text"] for item in batch]
|
470 |
pmt_text_lengths = torch.LongTensor([len(item) for item in pmt_text])
|
471 |
|
472 |
+
out["pmt_text"] = pmt_text
|
473 |
+
out["pmt_text_lengths"] = pmt_text_lengths
|
474 |
|
475 |
+
return out
|
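collate_fn above pads every mel spectrogram along its time axis to the longest one in the batch and stacks the result, keeping the original lengths so padding can be masked later. The same pattern in isolation, with made-up sizes:

import torch
import torch.nn.functional as F

# variable-length mel spectrograms, each shaped (n_mels, time)
mel_specs = [torch.randn(100, 180), torch.randn(100, 240), torch.randn(100, 150)]

mel_lengths = torch.LongTensor([spec.shape[-1] for spec in mel_specs])
max_mel_length = int(mel_lengths.amax())

padded_mel_specs = []
for spec in mel_specs:
    padding = (0, max_mel_length - spec.size(-1))  # right-pad the time axis only
    padded_mel_specs.append(F.pad(spec, padding, value=0))

mel = torch.stack(padded_mel_specs)  # (batch, n_mels, max_time)
print(mel.shape, mel_lengths)        # lengths let downstream code ignore the padding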
f5_tts/model/modules.py
CHANGED
@@ -19,7 +19,6 @@ from librosa.filters import mel as librosa_mel_fn
|
|
19 |
from torch import nn
|
20 |
from x_transformers.x_transformers import apply_rotary_pos_emb
|
21 |
|
22 |
-
|
23 |
# raw wav to mel spec
|
24 |
|
25 |
|
@@ -42,15 +41,25 @@ def get_bigvgan_mel_spectrogram(
|
|
42 |
key = f"{n_fft}_{n_mel_channels}_{target_sample_rate}_{hop_length}_{win_length}_{fmin}_{fmax}_{device}"
|
43 |
|
44 |
if key not in mel_basis_cache:
|
45 |
-
mel = librosa_mel_fn(
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
hann_window_cache[key] = torch.hann_window(win_length).to(device)
|
48 |
|
49 |
mel_basis = mel_basis_cache[key]
|
50 |
hann_window = hann_window_cache[key]
|
51 |
|
52 |
padding = (n_fft - hop_length) // 2
|
53 |
-
waveform = torch.nn.functional.pad(
|
|
|
|
|
54 |
|
55 |
spec = torch.stft(
|
56 |
waveform,
|
@@ -112,7 +121,9 @@ class MelSpec(nn.Module):
|
|
112 |
mel_spec_type="vocos",
|
113 |
):
|
114 |
super().__init__()
|
115 |
-
assert mel_spec_type in ["vocos", "bigvgan"], print(
|
|
|
|
|
116 |
|
117 |
self.n_fft = n_fft
|
118 |
self.hop_length = hop_length
|
@@ -193,7 +204,9 @@ class ConvPositionEmbedding(nn.Module):
|
|
193 |
# rotary positional embedding related
|
194 |
|
195 |
|
196 |
-
def precompute_freqs_cis(
|
|
|
|
|
197 |
# proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
|
198 |
# has some connection to NTK literature
|
199 |
# https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
|
@@ -209,10 +222,15 @@ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_resca
|
|
209 |
|
210 |
def get_pos_embed_indices(start, length, max_pos, scale=1.0):
|
211 |
# length = length if isinstance(length, int) else length.max()
|
212 |
-
scale = scale * torch.ones_like(
|
|
|
|
|
213 |
pos = (
|
214 |
start.unsqueeze(1)
|
215 |
-
+ (
|
|
|
|
|
|
|
216 |
)
|
217 |
# avoid extra long error.
|
218 |
pos = torch.where(pos < max_pos, pos, max_pos - 1)
|
@@ -251,7 +269,9 @@ class ConvNeXtV2Block(nn.Module):
|
|
251 |
dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
|
252 |
) # depthwise conv
|
253 |
self.norm = nn.LayerNorm(dim, eps=1e-6)
|
254 |
-
self.pwconv1 = nn.Linear(
|
|
|
|
|
255 |
self.act = nn.GELU()
|
256 |
self.grn = GRN(intermediate_dim)
|
257 |
self.pwconv2 = nn.Linear(intermediate_dim, dim)
|
@@ -284,7 +304,9 @@ class AdaLayerNormZero(nn.Module):
|
|
284 |
|
285 |
def forward(self, x, emb=None):
|
286 |
emb = self.linear(self.silu(emb))
|
287 |
-
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(
|
|
|
|
|
288 |
|
289 |
x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
|
290 |
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
|
@@ -315,14 +337,18 @@ class AdaLayerNormZero_Final(nn.Module):
|
|
315 |
|
316 |
|
317 |
class FeedForward(nn.Module):
|
318 |
-
def __init__(
|
|
|
|
|
319 |
super().__init__()
|
320 |
inner_dim = int(dim * mult)
|
321 |
dim_out = dim_out if dim_out is not None else dim
|
322 |
|
323 |
activation = nn.GELU(approximate=approximate)
|
324 |
project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
|
325 |
-
self.ff = nn.Sequential(
|
|
|
|
|
326 |
|
327 |
def forward(self, x):
|
328 |
return self.ff(x)
|
@@ -346,7 +372,9 @@ class Attention(nn.Module):
|
|
346 |
super().__init__()
|
347 |
|
348 |
if not hasattr(F, "scaled_dot_product_attention"):
|
349 |
-
raise ImportError(
|
|
|
|
|
350 |
|
351 |
self.processor = processor
|
352 |
|
@@ -385,7 +413,9 @@ class Attention(nn.Module):
|
|
385 |
c_rope=None, # rotary position embedding for c
|
386 |
) -> torch.Tensor:
|
387 |
if c is not None:
|
388 |
-
return self.processor(
|
|
|
|
|
389 |
else:
|
390 |
return self.processor(self, x, mask=mask, rope=rope)
|
391 |
|
@@ -414,7 +444,9 @@ class AttnProcessor:
|
|
414 |
# apply rotary position embedding
|
415 |
if rope is not None:
|
416 |
freqs, xpos_scale = rope
|
417 |
-
q_xpos_scale, k_xpos_scale = (
|
|
|
|
|
418 |
|
419 |
query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
|
420 |
key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
|
@@ -430,11 +462,15 @@ class AttnProcessor:
|
|
430 |
if mask is not None:
|
431 |
attn_mask = mask
|
432 |
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
|
433 |
-
attn_mask = attn_mask.expand(
|
|
|
|
|
434 |
else:
|
435 |
attn_mask = None
|
436 |
|
437 |
-
x = F.scaled_dot_product_attention(
|
|
|
|
|
438 |
x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
439 |
x = x.to(query.dtype)
|
440 |
|
@@ -461,12 +497,12 @@ class JointAttnProcessor:
|
|
461 |
def __call__(
|
462 |
self,
|
463 |
attn: Attention,
|
464 |
-
x: float["b n d"],
|
465 |
-
c: float["b nt d"] = None,
|
466 |
mask: bool["b n"] | None = None,
|
467 |
src_mask: bool["b nt"] | None = None,
|
468 |
-
rope=None,
|
469 |
-
c_rope=None,
|
470 |
) -> torch.FloatTensor:
|
471 |
residual = x
|
472 |
batch_size = c.shape[0]
|
@@ -484,14 +520,18 @@ class JointAttnProcessor:
|
|
484 |
# apply rope for x
|
485 |
if rope is not None:
|
486 |
freqs, xpos_scale = rope
|
487 |
-
q_xpos_scale, k_xpos_scale = (
|
|
|
|
|
488 |
query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
|
489 |
key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
|
490 |
|
491 |
# apply rope for c
|
492 |
if c_rope is not None:
|
493 |
freqs, xpos_scale = c_rope
|
494 |
-
q_xpos_scale, k_xpos_scale = (
|
|
|
|
|
495 |
c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
|
496 |
c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)
|
497 |
|
@@ -515,17 +555,23 @@ class JointAttnProcessor:
|
|
515 |
attn_mask_c = F.pad(src_mask, (x.shape[1], 0), value=True)
|
516 |
attn_mask = attn_mask & attn_mask_c
|
517 |
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)
|
518 |
-
attn_mask = attn_mask.expand(
|
|
|
|
|
519 |
else:
|
520 |
if src_mask is not None:
|
521 |
# if there's no mask for x but there's src_mask
|
522 |
attn_mask = F.pad(src_mask, (x.shape[1], 0), value=True)
|
523 |
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)
|
524 |
-
attn_mask = attn_mask.expand(
|
|
|
|
|
525 |
else:
|
526 |
attn_mask = None
|
527 |
|
528 |
-
x = F.scaled_dot_product_attention(
|
|
|
|
|
529 |
x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
530 |
x = x.to(query.dtype)
|
531 |
|
@@ -546,7 +592,6 @@ class JointAttnProcessor:
|
|
546 |
return x, c
|
547 |
|
548 |
|
549 |
-
|
550 |
# DiT Block
|
551 |
|
552 |
|
@@ -564,7 +609,9 @@ class DiTBlock(nn.Module):
|
|
564 |
)
|
565 |
|
566 |
self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
567 |
-
self.ff = FeedForward(
|
|
|
|
|
568 |
|
569 |
def forward(self, x, t, mask=None, rope=None): # x: noised input, t: time embedding
|
570 |
# pre-norm & modulation for attention input
|
@@ -596,12 +643,16 @@ class MMDiTBlock(nn.Module):
|
|
596 |
context_pre_only: last layer only do prenorm + modulation cuz no more ffn
|
597 |
"""
|
598 |
|
599 |
-
def __init__(
|
|
|
|
|
600 |
super().__init__()
|
601 |
|
602 |
self.context_pre_only = context_pre_only
|
603 |
|
604 |
-
self.attn_norm_c =
|
|
|
|
|
605 |
self.attn_norm_x = AdaLayerNormZero(dim)
|
606 |
self.attn = Attention(
|
607 |
processor=JointAttnProcessor(),
|
@@ -615,23 +666,35 @@ class MMDiTBlock(nn.Module):
|
|
615 |
|
616 |
if not context_pre_only:
|
617 |
self.ff_norm_c = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
618 |
-
self.ff_c = FeedForward(
|
|
|
|
|
619 |
else:
|
620 |
self.ff_norm_c = None
|
621 |
self.ff_c = None
|
622 |
self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
623 |
-
self.ff_x = FeedForward(
|
|
|
|
|
624 |
|
625 |
-
def forward(
|
|
|
|
|
626 |
# pre-norm & modulation for attention input
|
627 |
if self.context_pre_only:
|
628 |
norm_c = self.attn_norm_c(c, t)
|
629 |
else:
|
630 |
-
norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(
|
631 |
-
|
|
|
|
|
|
|
|
|
632 |
|
633 |
# attention
|
634 |
-
x_attn_output, c_attn_output = self.attn(
|
|
|
|
|
635 |
|
636 |
# process attention output for context c
|
637 |
if self.context_pre_only:
|
@@ -639,7 +702,9 @@ class MMDiTBlock(nn.Module):
|
|
639 |
else: # if not last layer
|
640 |
c = c + c_gate_msa.unsqueeze(1) * c_attn_output
|
641 |
|
642 |
-
norm_c =
|
|
|
|
|
643 |
c_ff_output = self.ff_c(norm_c)
|
644 |
c = c + c_gate_mlp.unsqueeze(1) * c_ff_output
|
645 |
|
@@ -660,7 +725,9 @@ class TimestepEmbedding(nn.Module):
|
|
660 |
def __init__(self, dim, freq_embed_dim=256):
|
661 |
super().__init__()
|
662 |
self.time_embed = SinusPositionEmbedding(freq_embed_dim)
|
663 |
-
self.time_mlp = nn.Sequential(
|
|
|
|
|
664 |
|
665 |
def forward(self, timestep: float["b"]): # noqa: F821
|
666 |
time_hidden = self.time_embed(timestep)
|
|
|
19 |
from torch import nn
|
20 |
from x_transformers.x_transformers import apply_rotary_pos_emb
|
21 |
|
|
|
22 |
# raw wav to mel spec
|
23 |
|
24 |
|
|
|
41 |
key = f"{n_fft}_{n_mel_channels}_{target_sample_rate}_{hop_length}_{win_length}_{fmin}_{fmax}_{device}"
|
42 |
|
43 |
if key not in mel_basis_cache:
|
44 |
+
mel = librosa_mel_fn(
|
45 |
+
sr=target_sample_rate,
|
46 |
+
n_fft=n_fft,
|
47 |
+
n_mels=n_mel_channels,
|
48 |
+
fmin=fmin,
|
49 |
+
fmax=fmax,
|
50 |
+
)
|
51 |
+
mel_basis_cache[key] = (
|
52 |
+
torch.from_numpy(mel).float().to(device)
|
53 |
+
) # TODO: why they need .float()?
|
54 |
hann_window_cache[key] = torch.hann_window(win_length).to(device)
|
55 |
|
56 |
mel_basis = mel_basis_cache[key]
|
57 |
hann_window = hann_window_cache[key]
|
58 |
|
59 |
padding = (n_fft - hop_length) // 2
|
60 |
+
waveform = torch.nn.functional.pad(
|
61 |
+
waveform.unsqueeze(1), (padding, padding), mode="reflect"
|
62 |
+
).squeeze(1)
|
63 |
|
64 |
spec = torch.stft(
|
65 |
waveform,
|
|
|
121 |
mel_spec_type="vocos",
|
122 |
):
|
123 |
super().__init__()
|
124 |
+
assert mel_spec_type in ["vocos", "bigvgan"], print(
|
125 |
+
"We only support two extract mel backend: vocos or bigvgan"
|
126 |
+
)
|
127 |
|
128 |
self.n_fft = n_fft
|
129 |
self.hop_length = hop_length
|
|
|
204 |
# rotary positional embedding related
|
205 |
|
206 |
|
207 |
+
def precompute_freqs_cis(
|
208 |
+
dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0
|
209 |
+
):
|
210 |
# proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
|
211 |
# has some connection to NTK literature
|
212 |
# https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
|
|
|
222 |
|
223 |
def get_pos_embed_indices(start, length, max_pos, scale=1.0):
|
224 |
# length = length if isinstance(length, int) else length.max()
|
225 |
+
scale = scale * torch.ones_like(
|
226 |
+
start, dtype=torch.float32
|
227 |
+
) # in case scale is a scalar
|
228 |
pos = (
|
229 |
start.unsqueeze(1)
|
230 |
+
+ (
|
231 |
+
torch.arange(length, device=start.device, dtype=torch.float32).unsqueeze(0)
|
232 |
+
* scale.unsqueeze(1)
|
233 |
+
).long()
|
234 |
)
|
235 |
# avoid extra long error.
|
236 |
pos = torch.where(pos < max_pos, pos, max_pos - 1)
|
|
|
269 |
dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
|
270 |
) # depthwise conv
|
271 |
self.norm = nn.LayerNorm(dim, eps=1e-6)
|
272 |
+
self.pwconv1 = nn.Linear(
|
273 |
+
dim, intermediate_dim
|
274 |
+
) # pointwise/1x1 convs, implemented with linear layers
|
275 |
self.act = nn.GELU()
|
276 |
self.grn = GRN(intermediate_dim)
|
277 |
self.pwconv2 = nn.Linear(intermediate_dim, dim)
|
|
|
304 |
|
305 |
def forward(self, x, emb=None):
|
306 |
emb = self.linear(self.silu(emb))
|
307 |
+
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(
|
308 |
+
emb, 6, dim=1
|
309 |
+
)
|
310 |
|
311 |
x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
|
312 |
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
|
|
|
337 |
|
338 |
|
339 |
class FeedForward(nn.Module):
|
340 |
+
def __init__(
|
341 |
+
self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"
|
342 |
+
):
|
343 |
super().__init__()
|
344 |
inner_dim = int(dim * mult)
|
345 |
dim_out = dim_out if dim_out is not None else dim
|
346 |
|
347 |
activation = nn.GELU(approximate=approximate)
|
348 |
project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
|
349 |
+
self.ff = nn.Sequential(
|
350 |
+
project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
|
351 |
+
)
|
352 |
|
353 |
def forward(self, x):
|
354 |
return self.ff(x)
|
|
|
372 |
super().__init__()
|
373 |
|
374 |
if not hasattr(F, "scaled_dot_product_attention"):
|
375 |
+
raise ImportError(
|
376 |
+
"Attention equires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
|
377 |
+
)
|
378 |
|
379 |
self.processor = processor
|
380 |
|
|
|
413 |
c_rope=None, # rotary position embedding for c
|
414 |
) -> torch.Tensor:
|
415 |
if c is not None:
|
416 |
+
return self.processor(
|
417 |
+
self, x, c=c, mask=mask, src_mask=src_mask, rope=rope, c_rope=c_rope
|
418 |
+
)
|
419 |
else:
|
420 |
return self.processor(self, x, mask=mask, rope=rope)
|
421 |
|
|
|
444 |
# apply rotary position embedding
|
445 |
if rope is not None:
|
446 |
freqs, xpos_scale = rope
|
447 |
+
q_xpos_scale, k_xpos_scale = (
|
448 |
+
(xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
|
449 |
+
)
|
450 |
|
451 |
query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
|
452 |
key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
|
|
|
462 |
if mask is not None:
|
463 |
attn_mask = mask
|
464 |
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
|
465 |
+
attn_mask = attn_mask.expand(
|
466 |
+
batch_size, attn.heads, query.shape[-2], key.shape[-2]
|
467 |
+
)
|
468 |
else:
|
469 |
attn_mask = None
|
470 |
|
471 |
+
x = F.scaled_dot_product_attention(
|
472 |
+
query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False
|
473 |
+
)
|
474 |
x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
475 |
x = x.to(query.dtype)
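Above, a per-token boolean mask of shape (b, n) is broadcast to the (b, heads, n, n) layout expected by F.scaled_dot_product_attention, where True means "may attend". A toy version of that expansion with made-up sizes (requires PyTorch 2.0+, as the code itself asserts):

import torch
import torch.nn.functional as F

batch, heads, n, head_dim = 2, 4, 6, 8
query = torch.randn(batch, heads, n, head_dim)
key = torch.randn(batch, heads, n, head_dim)
value = torch.randn(batch, heads, n, head_dim)

mask = torch.ones(batch, n, dtype=torch.bool)
mask[1, 4:] = False  # pretend the last two positions of sample 1 are padding

attn_mask = mask.unsqueeze(1).unsqueeze(1)                                  # 'b n' -> 'b 1 1 n'
attn_mask = attn_mask.expand(batch, heads, query.shape[-2], key.shape[-2])  # (b, h, n, n)

x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask)
print(x.shape)  # (batch, heads, n, head_dim)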
|
476 |
|
|
|
497 |
def __call__(
|
498 |
self,
|
499 |
attn: Attention,
|
500 |
+
x: float["b n d"], # noised input x
|
501 |
+
c: float["b nt d"] = None, # context c, here text
|
502 |
mask: bool["b n"] | None = None,
|
503 |
src_mask: bool["b nt"] | None = None,
|
504 |
+
rope=None, # rotary position embedding for x
|
505 |
+
c_rope=None, # rotary position embedding for c
|
506 |
) -> torch.FloatTensor:
|
507 |
residual = x
|
508 |
batch_size = c.shape[0]
|
|
|
520 |
# apply rope for x
|
521 |
if rope is not None:
|
522 |
freqs, xpos_scale = rope
|
523 |
+
q_xpos_scale, k_xpos_scale = (
|
524 |
+
(xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
|
525 |
+
)
|
526 |
query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
|
527 |
key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
|
528 |
|
529 |
# apply rope for c
|
530 |
if c_rope is not None:
|
531 |
freqs, xpos_scale = c_rope
|
532 |
+
q_xpos_scale, k_xpos_scale = (
|
533 |
+
(xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
|
534 |
+
)
|
535 |
c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
|
536 |
c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)
|
537 |
|
|
|
555 |
attn_mask_c = F.pad(src_mask, (x.shape[1], 0), value=True)
|
556 |
attn_mask = attn_mask & attn_mask_c
|
557 |
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)
|
558 |
+
attn_mask = attn_mask.expand(
|
559 |
+
batch_size, attn.heads, query.shape[-2], key.shape[-2]
|
560 |
+
)
|
561 |
else:
|
562 |
if src_mask is not None:
|
563 |
# if there's no mask for x but there's src_mask
|
564 |
attn_mask = F.pad(src_mask, (x.shape[1], 0), value=True)
|
565 |
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)
|
566 |
+
attn_mask = attn_mask.expand(
|
567 |
+
batch_size, attn.heads, query.shape[-2], key.shape[-2]
|
568 |
+
)
|
569 |
else:
|
570 |
attn_mask = None
|
571 |
|
572 |
+
x = F.scaled_dot_product_attention(
|
573 |
+
query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False
|
574 |
+
)
|
575 |
x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
576 |
x = x.to(query.dtype)
|
577 |
|
|
|
592 |
return x, c
|
593 |
|
594 |
|
|
|
595 |
# DiT Block
|
596 |
|
597 |
|
|
|
609 |
)
|
610 |
|
611 |
self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
612 |
+
self.ff = FeedForward(
|
613 |
+
dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh"
|
614 |
+
)
|
615 |
|
616 |
def forward(self, x, t, mask=None, rope=None): # x: noised input, t: time embedding
|
617 |
# pre-norm & modulation for attention input
|
|
|
643 |
context_pre_only: last layer only do prenorm + modulation cuz no more ffn
|
644 |
"""
|
645 |
|
646 |
+
def __init__(
|
647 |
+
self, dim, heads, dim_head, ff_mult=4, dropout=0.1, context_pre_only=False
|
648 |
+
):
|
649 |
super().__init__()
|
650 |
|
651 |
self.context_pre_only = context_pre_only
|
652 |
|
653 |
+
self.attn_norm_c = (
|
654 |
+
AdaLayerNormZero_Final(dim) if context_pre_only else AdaLayerNormZero(dim)
|
655 |
+
)
|
656 |
self.attn_norm_x = AdaLayerNormZero(dim)
|
657 |
self.attn = Attention(
|
658 |
processor=JointAttnProcessor(),
|
|
|
666 |
|
667 |
if not context_pre_only:
|
668 |
self.ff_norm_c = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
669 |
+
self.ff_c = FeedForward(
|
670 |
+
dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh"
|
671 |
+
)
|
672 |
else:
|
673 |
self.ff_norm_c = None
|
674 |
self.ff_c = None
|
675 |
self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
676 |
+
self.ff_x = FeedForward(
|
677 |
+
dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh"
|
678 |
+
)
|
679 |
|
680 |
+
def forward(
|
681 |
+
self, x, c, t, mask=None, src_mask=None, rope=None, c_rope=None
|
682 |
+
): # x: noised input, c: context, t: time embedding
|
683 |
# pre-norm & modulation for attention input
|
684 |
if self.context_pre_only:
|
685 |
norm_c = self.attn_norm_c(c, t)
|
686 |
else:
|
687 |
+
norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(
|
688 |
+
c, emb=t
|
689 |
+
)
|
690 |
+
norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(
|
691 |
+
x, emb=t
|
692 |
+
)
|
693 |
|
694 |
# attention
|
695 |
+
x_attn_output, c_attn_output = self.attn(
|
696 |
+
x=norm_x, c=norm_c, mask=mask, src_mask=src_mask, rope=rope, c_rope=c_rope
|
697 |
+
)
|
698 |
|
699 |
# process attention output for context c
|
700 |
if self.context_pre_only:
|
|
|
702 |
else: # if not last layer
|
703 |
c = c + c_gate_msa.unsqueeze(1) * c_attn_output
|
704 |
|
705 |
+
norm_c = (
|
706 |
+
self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
|
707 |
+
)
|
708 |
c_ff_output = self.ff_c(norm_c)
|
709 |
c = c + c_gate_mlp.unsqueeze(1) * c_ff_output
|
710 |
|
|
|
725 |
def __init__(self, dim, freq_embed_dim=256):
|
726 |
super().__init__()
|
727 |
self.time_embed = SinusPositionEmbedding(freq_embed_dim)
|
728 |
+
self.time_mlp = nn.Sequential(
|
729 |
+
nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim)
|
730 |
+
)
|
731 |
|
732 |
def forward(self, timestep: float["b"]): # noqa: F821
|
733 |
time_hidden = self.time_embed(timestep)
|
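TimestepEmbedding above is a sinusoidal frequency embedding followed by a Linear-SiLU-Linear MLP. A self-contained sketch of that pattern, with the sinusoidal part written inline and illustrative dimensions:

import math
import torch
from torch import nn

class ToyTimestepEmbedding(nn.Module):
    def __init__(self, dim, freq_embed_dim=256):
        super().__init__()
        self.freq_embed_dim = freq_embed_dim
        self.time_mlp = nn.Sequential(
            nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim)
        )

    def forward(self, timestep):  # timestep: (b,)
        half = self.freq_embed_dim // 2
        freqs = torch.exp(
            -math.log(10000.0) * torch.arange(half, dtype=torch.float32) / half
        ).to(timestep.device)
        args = timestep.float().unsqueeze(1) * freqs.unsqueeze(0)            # (b, half)
        time_hidden = torch.cat([torch.sin(args), torch.cos(args)], dim=-1)  # (b, freq_embed_dim)
        return self.time_mlp(time_hidden)

emb = ToyTimestepEmbedding(dim=512)
print(emb(torch.rand(4)).shape)  # torch.Size([4, 512])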
f5_tts/model/trainer.py
CHANGED
@@ -67,7 +67,13 @@ class Trainer:
|
|
67 |
self.logger = logger
|
68 |
if self.logger == "wandb":
|
69 |
if exists(wandb_resume_id):
|
70 |
-
init_kwargs = {
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
else:
|
72 |
init_kwargs = {"wandb": {"resume": "allow", "name": wandb_run_name}}
|
73 |
|
@@ -102,7 +108,9 @@ class Trainer:
|
|
102 |
self.epochs = epochs
|
103 |
self.num_warmup_updates = num_warmup_updates
|
104 |
self.save_per_updates = save_per_updates
|
105 |
-
self.last_per_steps = default(
|
|
|
|
|
106 |
self.checkpoint_path = default(checkpoint_path, "ckpts/test_e2-tts")
|
107 |
|
108 |
self.batch_size = batch_size
|
@@ -126,8 +134,10 @@ class Trainer:
|
|
126 |
self.optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate)
|
127 |
else:
|
128 |
self.optimizer = AdamW(model.parameters(), lr=learning_rate)
|
129 |
-
self.model, self.optimizer = self.accelerator.prepare(
|
130 |
-
|
|
|
|
|
131 |
self.scale = None
|
132 |
self.count = 0
|
133 |
|
@@ -137,10 +147,12 @@ class Trainer:
|
|
137 |
|
138 |
def save_checkpoint(self, step, last=False):
|
139 |
self.accelerator.wait_for_everyone()
|
140 |
-
if self.is_main:
|
141 |
checkpoint = dict(
|
142 |
model_state_dict=self.accelerator.unwrap_model(self.model).state_dict(),
|
143 |
-
optimizer_state_dict=self.accelerator.unwrap_model(
|
|
|
|
|
144 |
ema_model_state_dict=self.ema_model.state_dict(),
|
145 |
scheduler_state_dict=self.scheduler.state_dict(),
|
146 |
step=step,
|
@@ -150,16 +162,23 @@ class Trainer:
|
|
150 |
if not os.path.exists(self.checkpoint_path):
|
151 |
os.makedirs(self.checkpoint_path)
|
152 |
if last:
|
153 |
-
self.accelerator.save(
|
|
|
|
|
154 |
print(f"Saved last checkpoint at step {step}")
|
155 |
else:
|
156 |
-
self.accelerator.save(
|
|
|
|
|
157 |
|
158 |
def load_checkpoint(self):
|
159 |
if (
|
160 |
not exists(self.checkpoint_path)
|
161 |
or not os.path.exists(self.checkpoint_path)
|
162 |
-
or not any(
|
|
|
|
|
|
|
163 |
):
|
164 |
return 0
|
165 |
|
@@ -172,10 +191,17 @@ class Trainer:
|
|
172 |
key=lambda x: int("".join(filter(str.isdigit, x))),
|
173 |
)[-1]
|
174 |
# checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", map_location=self.accelerator.device) # rather use accelerator.load_state ಥ_ಥ
|
175 |
-
checkpoint = torch.load(
|
|
|
|
|
|
|
|
|
176 |
|
177 |
# patch for backward compatibility, 305e3ea
|
178 |
-
for key in [
|
|
|
|
|
|
|
179 |
if key in checkpoint["ema_model_state_dict"]:
|
180 |
del checkpoint["ema_model_state_dict"][key]
|
181 |
|
@@ -184,12 +210,19 @@ class Trainer:
|
|
184 |
|
185 |
if "step" in checkpoint:
|
186 |
# patch for backward compatibility, 305e3ea
|
187 |
-
for key in [
|
|
|
|
|
|
|
188 |
if key in checkpoint["model_state_dict"]:
|
189 |
del checkpoint["model_state_dict"][key]
|
190 |
|
191 |
-
self.accelerator.unwrap_model(self.model).load_state_dict(
|
192 |
-
|
|
|
|
|
|
|
|
|
193 |
if self.scheduler:
|
194 |
self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
|
195 |
step = checkpoint["step"]
|
@@ -199,28 +232,37 @@ class Trainer:
|
|
199 |
for k, v in checkpoint["ema_model_state_dict"].items()
|
200 |
if k not in ["initted", "step"]
|
201 |
}
|
202 |
-
self.accelerator.unwrap_model(self.model).load_state_dict(
|
|
|
|
|
203 |
step = 0
|
204 |
|
205 |
if "scale" in checkpoint:
|
206 |
self.scale = float(checkpoint["scale"])
|
207 |
self.model.scale = self.scale
|
208 |
-
|
209 |
if "count" in checkpoint:
|
210 |
self.count = int(checkpoint["count"])
|
211 |
-
|
212 |
del checkpoint
|
213 |
gc.collect()
|
214 |
return step
|
215 |
|
216 |
-
def train(
|
|
|
|
|
217 |
if self.log_samples:
|
218 |
-
from f5_tts.infer.utils_infer import cfg_strength, load_vocoder, nfe_step, sway_sampling_coef
|
|
|
219 |
|
220 |
vocoder = load_vocoder(
|
221 |
-
vocoder_name=self.vocoder_name,
|
|
|
|
|
222 |
)
|
223 |
-
target_sample_rate = self.accelerator.unwrap_model(
|
|
|
|
|
224 |
log_samples_path = f"{self.checkpoint_path}/samples"
|
225 |
os.makedirs(log_samples_path, exist_ok=True)
|
226 |
|
@@ -245,7 +287,11 @@ class Trainer:
|
|
245 |
self.accelerator.even_batches = False
|
246 |
sampler = SequentialSampler(train_dataset)
|
247 |
batch_sampler = DynamicBatchSampler(
|
248 |
-
sampler,
|
|
|
|
|
|
|
|
|
249 |
)
|
250 |
train_dataloader = DataLoader(
|
251 |
train_dataset,
|
@@ -256,7 +302,9 @@ class Trainer:
|
|
256 |
batch_sampler=batch_sampler,
|
257 |
)
|
258 |
else:
|
259 |
-
raise ValueError(
|
|
|
|
|
260 |
|
261 |
# accelerator.prepare() dispatches batches to devices;
|
262 |
# which means the length of dataloader calculated before, should consider the number of devices
|
@@ -266,10 +314,16 @@ class Trainer:
|
|
266 |
# otherwise by default with split_batches=False, warmup steps change with num_processes
|
267 |
total_steps = len(train_dataloader) * self.epochs / self.grad_accumulation_steps
|
268 |
decay_steps = total_steps - warmup_steps
|
269 |
-
warmup_scheduler = LinearLR(
|
270 |
-
|
|
|
|
|
|
|
|
|
271 |
self.scheduler = SequentialLR(
|
272 |
-
self.optimizer,
|
|
|
|
|
273 |
)
|
274 |
train_dataloader, self.scheduler = self.accelerator.prepare(
|
275 |
train_dataloader, self.scheduler
|
@@ -281,7 +335,9 @@ class Trainer:
|
|
281 |
orig_epoch_step = len(train_dataloader)
|
282 |
skipped_epoch = int(start_step // orig_epoch_step)
|
283 |
skipped_batch = start_step % orig_epoch_step
|
284 |
-
skipped_dataloader = self.accelerator.skip_first_batches(
|
|
|
|
|
285 |
else:
|
286 |
skipped_epoch = 0
|
287 |
|
@@ -309,28 +365,40 @@ class Trainer:
|
|
309 |
text_inputs = batch["text"]
|
310 |
mel_spec = batch["mel"].permute(0, 2, 1)
|
311 |
mel_lengths = batch["mel_lengths"]
|
312 |
-
|
313 |
self.count += 1
|
314 |
-
|
315 |
if self.scale is None:
|
316 |
self.scale = mel_spec.std()
|
317 |
else:
|
318 |
self.scale += (mel_spec.std() - self.scale) / self.count
|
319 |
-
|
320 |
-
mel_spec = mel_spec / self.scale
|
321 |
-
|
322 |
# TODO. add duration predictor training
|
323 |
-
if
|
324 |
-
|
325 |
-
self.accelerator.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
326 |
|
327 |
loss, cond, pred, t = self.model(
|
328 |
-
mel_spec,
|
|
|
|
|
|
|
329 |
)
|
330 |
self.accelerator.backward(loss)
|
331 |
|
332 |
if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
|
333 |
-
self.accelerator.clip_grad_norm_(
|
|
|
|
|
334 |
|
335 |
self.optimizer.step()
|
336 |
self.scheduler.step()
|
@@ -342,18 +410,30 @@ class Trainer:
|
|
342 |
global_step += 1
|
343 |
|
344 |
if self.accelerator.is_local_main_process:
|
345 |
-
self.accelerator.log(
|
|
|
|
|
|
|
346 |
if self.logger == "tensorboard":
|
347 |
self.writer.add_scalar("loss", loss.item(), global_step)
|
348 |
-
self.writer.add_scalar(
|
|
|
|
|
349 |
|
350 |
progress_bar.set_postfix(step=str(global_step), loss=loss.item())
|
351 |
|
352 |
-
if
|
|
|
|
|
|
|
353 |
self.save_checkpoint(global_step)
|
354 |
if self.log_samples and self.accelerator.is_local_main_process:
|
355 |
-
gen_mel_spec =
|
356 |
-
|
|
|
|
|
|
|
|
|
357 |
with torch.inference_mode():
|
358 |
if self.vocoder_name == "vocos":
|
359 |
gen_audio = vocoder.decode(gen_mel_spec).cpu()
|
@@ -361,51 +441,56 @@ class Trainer:
|
|
361 |
elif self.vocoder_name == "bigvgan":
|
362 |
gen_audio = vocoder(gen_mel_spec).squeeze(0).cpu()
|
363 |
ref_audio = vocoder(ref_mel_spec).squeeze(0).cpu()
|
364 |
-
|
365 |
gen_audio = wandb.Audio(
|
366 |
gen_audio.float().numpy().squeeze(),
|
367 |
sample_rate=24000,
|
368 |
-
caption="time: "
|
|
|
369 |
)
|
370 |
ref_audio = wandb.Audio(
|
371 |
ref_audio.float().numpy().squeeze(),
|
372 |
sample_rate=24000,
|
373 |
-
caption="time: "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
374 |
)
|
375 |
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
#
|
382 |
-
#
|
383 |
-
#
|
384 |
-
#
|
385 |
-
#
|
386 |
-
#
|
387 |
-
# #
|
388 |
-
# #
|
389 |
-
# #
|
390 |
-
# #
|
391 |
-
# #
|
392 |
-
# #
|
393 |
-
#
|
394 |
-
#
|
395 |
-
#
|
396 |
-
#
|
397 |
-
#
|
398 |
-
#
|
399 |
-
#
|
400 |
-
#
|
401 |
-
|
402 |
-
#
|
403 |
-
#
|
404 |
-
# gen_audio = vocoder(gen_mel_spec).squeeze(0).cpu()
|
405 |
-
# ref_audio = vocoder(ref_mel_spec).squeeze(0).cpu()
|
406 |
-
|
407 |
-
# torchaudio.save(f"{log_samples_path}/step_{global_step}_gen.wav", gen_audio, target_sample_rate)
|
408 |
-
# torchaudio.save(f"{log_samples_path}/step_{global_step}_ref.wav", ref_audio, target_sample_rate)
|
409 |
|
410 |
if global_step % self.last_per_steps == 0:
|
411 |
self.save_checkpoint(global_step, last=True)
|
|
|
67 |
self.logger = logger
|
68 |
if self.logger == "wandb":
|
69 |
if exists(wandb_resume_id):
|
70 |
+
init_kwargs = {
|
71 |
+
"wandb": {
|
72 |
+
"resume": "allow",
|
73 |
+
"name": wandb_run_name,
|
74 |
+
"id": wandb_resume_id,
|
75 |
+
}
|
76 |
+
}
|
77 |
else:
|
78 |
init_kwargs = {"wandb": {"resume": "allow", "name": wandb_run_name}}
|
79 |
|
|
|
108 |
self.epochs = epochs
|
109 |
self.num_warmup_updates = num_warmup_updates
|
110 |
self.save_per_updates = save_per_updates
|
111 |
+
self.last_per_steps = default(
|
112 |
+
last_per_steps, save_per_updates * grad_accumulation_steps
|
113 |
+
)
|
114 |
self.checkpoint_path = default(checkpoint_path, "ckpts/test_e2-tts")
|
115 |
|
116 |
self.batch_size = batch_size
|
|
|
134 |
self.optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate)
|
135 |
else:
|
136 |
self.optimizer = AdamW(model.parameters(), lr=learning_rate)
|
137 |
+
self.model, self.optimizer = self.accelerator.prepare(
|
138 |
+
self.model, self.optimizer
|
139 |
+
)
|
140 |
+
|
141 |
self.scale = None
|
142 |
self.count = 0
|
143 |
|
|
|
147 |
|
148 |
def save_checkpoint(self, step, last=False):
|
149 |
self.accelerator.wait_for_everyone()
|
150 |
+
if self.is_main:
|
151 |
checkpoint = dict(
|
152 |
model_state_dict=self.accelerator.unwrap_model(self.model).state_dict(),
|
153 |
+
optimizer_state_dict=self.accelerator.unwrap_model(
|
154 |
+
self.optimizer
|
155 |
+
).state_dict(),
|
156 |
ema_model_state_dict=self.ema_model.state_dict(),
|
157 |
scheduler_state_dict=self.scheduler.state_dict(),
|
158 |
step=step,
|
|
|
162 |
if not os.path.exists(self.checkpoint_path):
|
163 |
os.makedirs(self.checkpoint_path)
|
164 |
if last:
|
165 |
+
self.accelerator.save(
|
166 |
+
checkpoint, f"{self.checkpoint_path}/model_last.pt"
|
167 |
+
)
|
168 |
print(f"Saved last checkpoint at step {step}")
|
169 |
else:
|
170 |
+
self.accelerator.save(
|
171 |
+
checkpoint, f"{self.checkpoint_path}/model_{step}.pt"
|
172 |
+
)
|
173 |
|
174 |
def load_checkpoint(self):
|
175 |
if (
|
176 |
not exists(self.checkpoint_path)
|
177 |
or not os.path.exists(self.checkpoint_path)
|
178 |
+
or not any(
|
179 |
+
filename.endswith(".pt")
|
180 |
+
for filename in os.listdir(self.checkpoint_path)
|
181 |
+
)
|
182 |
):
|
183 |
return 0
|
184 |
|
|
|
191 |
key=lambda x: int("".join(filter(str.isdigit, x))),
|
192 |
)[-1]
|
193 |
# checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", map_location=self.accelerator.device) # rather use accelerator.load_state ಥ_ಥ
|
194 |
+
checkpoint = torch.load(
|
195 |
+
f"{self.checkpoint_path}/{latest_checkpoint}",
|
196 |
+
weights_only=True,
|
197 |
+
map_location="cpu",
|
198 |
+
)
|
199 |
|
200 |
# patch for backward compatibility, 305e3ea
|
201 |
+
for key in [
|
202 |
+
"ema_model.mel_spec.mel_stft.mel_scale.fb",
|
203 |
+
"ema_model.mel_spec.mel_stft.spectrogram.window",
|
204 |
+
]:
|
205 |
if key in checkpoint["ema_model_state_dict"]:
|
206 |
del checkpoint["ema_model_state_dict"][key]
|
207 |
|
|
|
210 |
|
211 |
if "step" in checkpoint:
|
212 |
# patch for backward compatibility, 305e3ea
|
213 |
+
for key in [
|
214 |
+
"mel_spec.mel_stft.mel_scale.fb",
|
215 |
+
"mel_spec.mel_stft.spectrogram.window",
|
216 |
+
]:
|
217 |
if key in checkpoint["model_state_dict"]:
|
218 |
del checkpoint["model_state_dict"][key]
|
219 |
|
220 |
+
self.accelerator.unwrap_model(self.model).load_state_dict(
|
221 |
+
checkpoint["model_state_dict"]
|
222 |
+
)
|
223 |
+
self.accelerator.unwrap_model(self.optimizer).load_state_dict(
|
224 |
+
checkpoint["optimizer_state_dict"]
|
225 |
+
)
|
226 |
if self.scheduler:
|
227 |
self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
|
228 |
step = checkpoint["step"]
|
|
|
232 |
for k, v in checkpoint["ema_model_state_dict"].items()
|
233 |
if k not in ["initted", "step"]
|
234 |
}
|
235 |
+
self.accelerator.unwrap_model(self.model).load_state_dict(
|
236 |
+
checkpoint["model_state_dict"]
|
237 |
+
)
|
238 |
step = 0
|
239 |
|
240 |
if "scale" in checkpoint:
|
241 |
self.scale = float(checkpoint["scale"])
|
242 |
self.model.scale = self.scale
|
243 |
+
|
244 |
if "count" in checkpoint:
|
245 |
self.count = int(checkpoint["count"])
|
246 |
+
|
247 |
del checkpoint
|
248 |
gc.collect()
|
249 |
return step
|
250 |
|
251 |
+
def train(
|
252 |
+
self, train_dataset: Dataset, num_workers=16, resumable_with_seed: int = None
|
253 |
+
):
|
254 |
if self.log_samples:
|
255 |
+
from f5_tts.infer.utils_infer import (cfg_strength, load_vocoder,
|
256 |
+
nfe_step, sway_sampling_coef)
|
257 |
|
258 |
vocoder = load_vocoder(
|
259 |
+
vocoder_name=self.vocoder_name,
|
260 |
+
is_local=self.is_local_vocoder,
|
261 |
+
local_path=self.local_vocoder_path,
|
262 |
)
|
263 |
+
target_sample_rate = self.accelerator.unwrap_model(
|
264 |
+
self.model
|
265 |
+
).mel_spec.target_sample_rate
|
266 |
log_samples_path = f"{self.checkpoint_path}/samples"
|
267 |
os.makedirs(log_samples_path, exist_ok=True)
|
268 |
|
|
|
287 |
self.accelerator.even_batches = False
|
288 |
sampler = SequentialSampler(train_dataset)
|
289 |
batch_sampler = DynamicBatchSampler(
|
290 |
+
sampler,
|
291 |
+
self.batch_size,
|
292 |
+
max_samples=self.max_samples,
|
293 |
+
random_seed=resumable_with_seed,
|
294 |
+
drop_last=False,
|
295 |
)
|
296 |
train_dataloader = DataLoader(
|
297 |
train_dataset,
|
|
|
302 |
batch_sampler=batch_sampler,
|
303 |
)
|
304 |
else:
|
305 |
+
raise ValueError(
|
306 |
+
f"batch_size_type must be either 'sample' or 'frame', but received {self.batch_size_type}"
|
307 |
+
)
|
308 |
|
309 |
# accelerator.prepare() dispatches batches to devices;
|
310 |
# which means the length of dataloader calculated before, should consider the number of devices
|
|
|
314 |
# otherwise by default with split_batches=False, warmup steps change with num_processes
|
315 |
total_steps = len(train_dataloader) * self.epochs / self.grad_accumulation_steps
|
316 |
decay_steps = total_steps - warmup_steps
|
317 |
+
warmup_scheduler = LinearLR(
|
318 |
+
self.optimizer, start_factor=1e-8, end_factor=1.0, total_iters=warmup_steps
|
319 |
+
)
|
320 |
+
decay_scheduler = LinearLR(
|
321 |
+
self.optimizer, start_factor=1.0, end_factor=1e-8, total_iters=decay_steps
|
322 |
+
)
|
323 |
self.scheduler = SequentialLR(
|
324 |
+
self.optimizer,
|
325 |
+
schedulers=[warmup_scheduler, decay_scheduler],
|
326 |
+
milestones=[warmup_steps],
|
327 |
)
|
328 |
train_dataloader, self.scheduler = self.accelerator.prepare(
|
329 |
train_dataloader, self.scheduler
|
|
|
335 |
orig_epoch_step = len(train_dataloader)
|
336 |
skipped_epoch = int(start_step // orig_epoch_step)
|
337 |
skipped_batch = start_step % orig_epoch_step
|
338 |
+
skipped_dataloader = self.accelerator.skip_first_batches(
|
339 |
+
train_dataloader, num_batches=skipped_batch
|
340 |
+
)
|
341 |
else:
|
342 |
skipped_epoch = 0
|
343 |
|
|
|
365 |
text_inputs = batch["text"]
|
366 |
mel_spec = batch["mel"].permute(0, 2, 1)
|
367 |
mel_lengths = batch["mel_lengths"]
|
368 |
+
|
369 |
self.count += 1
|
370 |
+
|
371 |
if self.scale is None:
|
372 |
self.scale = mel_spec.std()
|
373 |
else:
|
374 |
self.scale += (mel_spec.std() - self.scale) / self.count
|
375 |
+
|
376 |
+
mel_spec = mel_spec / self.scale # normalize mel spectrogram
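The two lines above keep a running mean of the per-batch mel standard deviation and use it to normalize the spectrograms. The same incremental-mean update in isolation, with toy numbers:

values = [2.0, 4.0, 9.0]  # e.g. mel_spec.std() from successive batches

scale, count = None, 0
for v in values:
    count += 1
    if scale is None:
        scale = v
    else:
        scale += (v - scale) / count  # incremental mean: new_mean = old + (x - old) / n

print(scale)  # 5.0, the mean of the observed values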
|
377 |
+
|
378 |
# TODO. add duration predictor training
|
379 |
+
if (
|
380 |
+
self.duration_predictor is not None
|
381 |
+
and self.accelerator.is_local_main_process
|
382 |
+
):
|
383 |
+
dur_loss = self.duration_predictor(
|
384 |
+
mel_spec, lens=batch.get("durations")
|
385 |
+
)
|
386 |
+
self.accelerator.log(
|
387 |
+
{"duration loss": dur_loss.item()}, step=global_step
|
388 |
+
)
|
389 |
|
390 |
loss, cond, pred, t = self.model(
|
391 |
+
mel_spec,
|
392 |
+
text=text_inputs,
|
393 |
+
lens=mel_lengths,
|
394 |
+
noise_scheduler=self.noise_scheduler,
|
395 |
)
|
396 |
self.accelerator.backward(loss)
|
397 |
|
398 |
if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
|
399 |
+
self.accelerator.clip_grad_norm_(
|
400 |
+
self.model.parameters(), self.max_grad_norm
|
401 |
+
)
|
402 |
|
403 |
self.optimizer.step()
|
404 |
self.scheduler.step()
|
|
|
410 |
global_step += 1
|
411 |
|
412 |
if self.accelerator.is_local_main_process:
|
413 |
+
self.accelerator.log(
|
414 |
+
{"loss": loss.item(), "lr": self.scheduler.get_last_lr()[0]},
|
415 |
+
step=global_step,
|
416 |
+
)
|
417 |
if self.logger == "tensorboard":
|
418 |
self.writer.add_scalar("loss", loss.item(), global_step)
|
419 |
+
self.writer.add_scalar(
|
420 |
+
"lr", self.scheduler.get_last_lr()[0], global_step
|
421 |
+
)
|
422 |
|
423 |
progress_bar.set_postfix(step=str(global_step), loss=loss.item())
|
424 |
|
425 |
+
if (
|
426 |
+
global_step % (self.save_per_updates * self.grad_accumulation_steps)
|
427 |
+
== 0
|
428 |
+
):
|
429 |
self.save_checkpoint(global_step)
|
430 |
if self.log_samples and self.accelerator.is_local_main_process:
|
431 |
+
gen_mel_spec = (
|
432 |
+
pred[0].unsqueeze(0).permute(0, 2, 1) * self.scale
|
433 |
+
)
|
434 |
+
ref_mel_spec = (
|
435 |
+
cond[0].unsqueeze(0).permute(0, 2, 1) * self.scale
|
436 |
+
)
|
437 |
with torch.inference_mode():
|
438 |
if self.vocoder_name == "vocos":
|
439 |
gen_audio = vocoder.decode(gen_mel_spec).cpu()
|
|
|
441 |
elif self.vocoder_name == "bigvgan":
|
442 |
gen_audio = vocoder(gen_mel_spec).squeeze(0).cpu()
|
443 |
ref_audio = vocoder(ref_mel_spec).squeeze(0).cpu()
|
444 |
+
|
445 |
gen_audio = wandb.Audio(
|
446 |
gen_audio.float().numpy().squeeze(),
|
447 |
sample_rate=24000,
|
448 |
+
caption="time: "
|
449 |
+
+ str(t[0].squeeze().float().cpu().numpy()),
|
450 |
)
|
451 |
ref_audio = wandb.Audio(
|
452 |
ref_audio.float().numpy().squeeze(),
|
453 |
sample_rate=24000,
|
454 |
+
caption="time: "
|
455 |
+
+ str(t[0].squeeze().float().cpu().numpy()),
|
456 |
+
)
|
457 |
+
|
458 |
+
self.accelerator.log(
|
459 |
+
{
|
460 |
+
"gen_audio": gen_audio,
|
461 |
+
"ref_audio": ref_audio,
|
462 |
+
},
|
463 |
+
step=global_step,
|
464 |
)
|
465 |
|
466 |
+
# if self.log_samples and self.accelerator.is_local_main_process:
|
467 |
+
# ref_audio_len = mel_lengths[0]
|
468 |
+
# infer_text = [
|
469 |
+
# text_inputs[0] + ([" "] if isinstance(text_inputs[0], list) else " ") + text_inputs[0]
|
470 |
+
# ]
|
471 |
+
# with torch.inference_mode():
|
472 |
+
# # generated, _ = self.accelerator.unwrap_model(self.model).sample(
|
473 |
+
# # cond=mel_spec[0][:ref_audio_len].unsqueeze(0),
|
474 |
+
# # text=infer_text,
|
475 |
+
# # duration=ref_audio_len * 2,
|
476 |
+
# # steps=nfe_step,
|
477 |
+
# # cfg_strength=cfg_strength,
|
478 |
+
# # sway_sampling_coef=sway_sampling_coef,
|
479 |
+
# # )
|
480 |
+
# # generated = generated.to(torch.float32)
|
481 |
+
# # gen_mel_spec = generated[:, ref_audio_len:, :].permute(0, 2, 1).to(self.accelerator.device)
|
482 |
+
# # ref_mel_spec = batch["mel"][0].unsqueeze(0)
|
483 |
+
# gen_mel_spec = pred[0].unsqueeze(0).permute(0, 2, 1)
|
484 |
+
# ref_mel_spec = cond[0].unsqueeze(0).permute(0, 2, 1)
|
485 |
+
# if self.vocoder_name == "vocos":
|
486 |
+
# gen_audio = vocoder.decode(gen_mel_spec).cpu()
|
487 |
+
# ref_audio = vocoder.decode(ref_mel_spec).cpu()
|
488 |
+
# elif self.vocoder_name == "bigvgan":
|
489 |
+
# gen_audio = vocoder(gen_mel_spec).squeeze(0).cpu()
|
490 |
+
# ref_audio = vocoder(ref_mel_spec).squeeze(0).cpu()
|
491 |
+
|
492 |
+
# torchaudio.save(f"{log_samples_path}/step_{global_step}_gen.wav", gen_audio, target_sample_rate)
|
493 |
+
# torchaudio.save(f"{log_samples_path}/step_{global_step}_ref.wav", ref_audio, target_sample_rate)
|
|
|
|
|
|
|
|
|
|
|
494 |
|
495 |
if global_step % self.last_per_steps == 0:
|
496 |
self.save_checkpoint(global_step, last=True)
|
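Earlier in this trainer.py diff, the learning-rate schedule is reformatted into a linear warmup followed by a linear decay, chained with SequentialLR. A runnable sketch of that schedule with a toy model and hypothetical step counts (the trainer derives its counts from the dataloader and epochs):

import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR, SequentialLR

model = torch.nn.Linear(4, 4)        # stand-in model
optimizer = AdamW(model.parameters(), lr=1e-4)

warmup_steps, decay_steps = 10, 90   # hypothetical values
warmup = LinearLR(optimizer, start_factor=1e-8, end_factor=1.0, total_iters=warmup_steps)
decay = LinearLR(optimizer, start_factor=1.0, end_factor=1e-8, total_iters=decay_steps)
scheduler = SequentialLR(optimizer, schedulers=[warmup, decay], milestones=[warmup_steps])

for step in range(warmup_steps + decay_steps):
    optimizer.step()      # normally preceded by backward()
    scheduler.step()
print(scheduler.get_last_lr())  # close to 1e-4 * 1e-8 after the full decay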
f5_tts/model/utils.py
CHANGED
@@ -5,13 +5,11 @@ import random
from collections import defaultdict
from importlib.resources import files

import jieba
import torch
from pypinyin import Style, lazy_pinyin
from torch.nn.utils.rnn import pad_sequence

# seed everything

@@ -39,7 +37,9 @@ def default(v, d):
# tensor helpers


def lens_to_mask(
    t: int["b"], length: int | None = None
) -> bool["b n"]:  # noqa: F722 F821
    if not exists(length):
        length = t.amax()

@@ -47,7 +47,9 @@ def lens_to_mask(t: int["b"], length: int | None = None) -> bool["b n"]: # noqa
    return seq[None, :] < t[:, None]


def mask_from_start_end_indices(
    seq_len: int["b"], start: int["b"], end: int["b"]
):  # noqa: F722 F821
    max_seq_len = seq_len.max().item()
    seq = torch.arange(max_seq_len, device=start.device).long()
    start_mask = seq[None, :] >= start[:, None]

@@ -55,7 +57,9 @@ def mask_from_start_end_indices(seq_len: int["b"], start: int["b"], end: int["b"
    return start_mask & end_mask


def mask_from_frac_lengths(
    seq_len: int["b"], frac_lengths: float["b"]
):  # noqa: F722 F821
    lengths = (frac_lengths * seq_len).long()
    max_start = seq_len - lengths

@@ -66,7 +70,9 @@ def mask_from_frac_lengths(seq_len: int["b"], frac_lengths: float["b"]): # noqa
    return mask_from_start_end_indices(seq_len, start, end)


def maybe_masked_mean(
    t: float["b n d"], mask: bool["b n"] = None
) -> float["b d"]:  # noqa: F722
    if not exists(mask):
        return t.mean(dim=1)

@@ -90,7 +96,9 @@ def list_str_to_idx(
    vocab_char_map: dict[str, int],  # {char: idx}
    padding_value=-1,
) -> int["b nt"]:  # noqa: F722
    list_idx_tensors = [
        torch.tensor([vocab_char_map.get(c, 0) for c in t]) for t in text
    ]  # pinyin or char style
    text = pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
    return text

@@ -109,13 +117,17 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
    - if use "byte", set to 256 (unicode byte range)
    """
    if tokenizer in ["pinyin", "char"]:
        tokenizer_path = os.path.join(
            files("f5_tts").joinpath("../data"), f"{dataset_name}_{tokenizer}/vocab.txt"
        )
        with open(tokenizer_path, "r", encoding="utf-8") as f:
            vocab_char_map = {}
            for i, char in enumerate(f):
                vocab_char_map[char[:-1]] = i
        vocab_size = len(vocab_char_map)
        assert (
            vocab_char_map[" "] == 0
        ), "make sure space is of idx 0 in vocab.txt, cuz 0 is used for unknown char"

    elif tokenizer == "byte":
        vocab_char_map = None

@@ -131,7 +143,6 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
    return vocab_char_map, vocab_size


# convert char to pinyin

jieba.initialize()

@@ -145,9 +156,7 @@ def convert_char_to_pinyin(text_list, polyphone=True):
    )  # add custom trans here, to address oov

    def is_chinese(c):
        return "\u3100" <= c <= "\u9fff"  # common chinese characters

    for text in text_list:
        char_list = []

@@ -158,7 +167,9 @@ def convert_char_to_pinyin(text_list, polyphone=True):
                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
                    char_list.append(" ")
                char_list.extend(seg)
            elif polyphone and seg_byte_len == 3 * len(
                seg
            ):  # if pure east asian characters
                seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
                for i, c in enumerate(seg):
                    if is_chinese(c):

@@ -170,7 +181,9 @@ def convert_char_to_pinyin(text_list, polyphone=True):
                        char_list.extend(c)
                    elif is_chinese(c):
                        char_list.append(" ")
                        char_list.extend(
                            lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True)
                        )
                    else:
                        char_list.append(c)
        final_text_list.append(char_list)

@@ -224,7 +237,7 @@ def load_checkpoint(model, ckpt_path, device, use_ema=True):
def sample_consecutive_steps(float_list):
    idx = torch.randint(0, len(float_list), size=(1,))
    next_idx = idx - 1

    if next_idx < 0:
        next_idx = 0
    else:
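A small usage sketch of two helpers reformatted above (assumes `f5_tts`, `jieba`, and `pypinyin` are installed; the pinyin output in the comment is illustrative, not exact).

import torch
from f5_tts.model.utils import convert_char_to_pinyin, lens_to_mask

tokens = convert_char_to_pinyin(["Hello 世界"])  # e.g. ['H', 'e', 'l', 'l', 'o', ' ', 'shi4', ' ', 'jie4']
mask = lens_to_mask(torch.tensor([2, 4]))        # bool mask of shape (2, 4): [[T, T, F, F], [T, T, T, T]]
print(tokens, mask)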
f5_tts/model_new/__init__.py
CHANGED
@@ -4,5 +4,4 @@ from f5_tts.model_new.backbones.unett import UNetT
from f5_tts.model_new.cfm import CFM
from f5_tts.model_new.trainer import Trainer

__all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"]
f5_tts/model_new/backbones/dit.py
CHANGED
@@ -14,40 +14,49 @@ import torch.nn.functional as F
from torch import nn
from x_transformers.x_transformers import RotaryEmbedding

from f5_tts.model_new.modules import (AdaLayerNorm_Final, ConvNeXtV2Block,
                                      ConvPositionEmbedding, DiTBlock,
                                      TimestepEmbedding, get_pos_embed_indices,
                                      precompute_freqs_cis)

# Text embedding


class TextEmbedding(nn.Module):
    def __init__(
        self, text_num_embeds, text_dim, mask_padding=True, conv_layers=0, conv_mult=2
    ):
        super().__init__()
        self.text_embed = nn.Embedding(
            text_num_embeds + 1, text_dim
        )  # use 0 as filler token

        self.mask_padding = mask_padding  # mask filler and batch padding tokens or not

        if conv_layers > 0:
            self.extra_modeling = True
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            self.register_buffer(
                "freqs_cis",
                precompute_freqs_cis(text_dim, self.precompute_max_pos),
                persistent=False,
            )
            self.text_blocks = nn.Sequential(
                *[
                    ConvNeXtV2Block(text_dim, text_dim * conv_mult)
                    for _ in range(conv_layers)
                ]
            )
        else:
            self.extra_modeling = False

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        text = (
            text + 1
        )  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
        text = text[
            :, :seq_len
        ]  # curtail if character tokens are more than the mel spec tokens
        batch, text_len = text.shape[0], text.shape[1]
        text = F.pad(text, (0, seq_len - text_len), value=0)
        if self.mask_padding:

@@ -62,16 +71,22 @@ class TextEmbedding(nn.Module):
        if self.extra_modeling:
            # sinus pos emb
            batch_start = torch.zeros((batch,), dtype=torch.long)
            pos_idx = get_pos_embed_indices(
                batch_start, seq_len, max_pos=self.precompute_max_pos
            )
            text_pos_embed = self.freqs_cis[pos_idx]
            text = text + text_pos_embed

            # convnextv2 blocks
            if self.mask_padding:
                text = text.masked_fill(
                    text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0
                )
                for block in self.text_blocks:
                    text = block(text)
                    text = text.masked_fill(
                        text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0
                    )
            else:
                text = self.text_blocks(text)

@@ -87,7 +102,13 @@ class InputEmbedding(nn.Module):
        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(
        self,
        x: float["b n d"],
        cond: float["b n d"],
        text_embed: float["b n d"],
        drop_audio_cond=False,
    ):  # noqa: F722
        if drop_audio_cond:  # cfg for cond audio
            cond = torch.zeros_like(cond)

@@ -127,7 +148,10 @@ class DiT(nn.Module):
        if text_dim is None:
            text_dim = mel_dim
        self.text_embed = TextEmbedding(
            text_num_embeds,
            text_dim,
            mask_padding=text_mask_padding,
            conv_layers=conv_layers,
        )
        self.text_cond, self.text_uncond = None, None  # text cache
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)

@@ -153,7 +177,9 @@ class DiT(nn.Module):
                for _ in range(depth)
            ]
        )
        self.long_skip_connection = (
            nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None
        )

        self.norm_out = AdaLayerNorm_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

@@ -230,13 +256,24 @@ class DiT(nn.Module):
        # t: conditioning time, text: text, x: noised audio + cond audio + text
        t = self.time_embed(time)
        if cfg_infer:  # pack cond & uncond forward: b n d -> 2b n d
            x_cond = self.get_input_embed(
                x, cond, text, drop_audio_cond=False, drop_text=False, cache=cache
            )
            x_uncond = self.get_input_embed(
                x, cond, text, drop_audio_cond=True, drop_text=True, cache=cache
            )
            x = torch.cat((x_cond, x_uncond), dim=0)
            t = torch.cat((t, t), dim=0)
            mask = torch.cat((mask, mask), dim=0) if mask is not None else None
        else:
            x = self.get_input_embed(
                x,
                cond,
                text,
                drop_audio_cond=drop_audio_cond,
                drop_text=drop_text,
                cache=cache,
            )

        rope = self.rotary_embed.forward_from_seq_len(seq_len)

@@ -246,7 +283,9 @@ class DiT(nn.Module):
        for block in self.transformer_blocks:
            if self.checkpoint_activations:
                # https://pytorch.org/docs/stable/checkpoint.html#torch.utils.checkpoint.checkpoint
                x = torch.utils.checkpoint.checkpoint(
                    self.ckpt_wrapper(block), x, t, mask, rope, use_reentrant=False
                )
            else:
                x = block(x, t, mask=mask, rope=rope)
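The `cfg_infer` branch above packs the conditional and unconditional passes into one batch. A sketch of how such a packed prediction is typically split and combined downstream (the actual guidance step lives in the CFM sampling code, not in this file):

import torch

def cfg_combine(pred_packed: torch.Tensor, cfg_strength: float) -> torch.Tensor:
    # first half of the batch is conditional, second half unconditional,
    # mirroring torch.cat((x_cond, x_uncond), dim=0) in the hunk above
    pred_cond, pred_uncond = torch.chunk(pred_packed, 2, dim=0)
    return pred_cond + (pred_cond - pred_uncond) * cfg_strength

print(cfg_combine(torch.randn(4, 10, 100), cfg_strength=2.0).shape)  # torch.Size([2, 10, 100])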
f5_tts/model_new/backbones/mmdit.py
CHANGED
@@ -13,15 +13,10 @@ import torch
from torch import nn
from x_transformers.x_transformers import RotaryEmbedding

from f5_tts.model_new.modules import (AdaLayerNorm_Final,
                                      ConvPositionEmbedding, MMDiTBlock,
                                      TimestepEmbedding, get_pos_embed_indices,
                                      precompute_freqs_cis)

# text embedding

@@ -29,15 +24,25 @@ from f5_tts.model_new.modules import (
class TextEmbedding(nn.Module):
    def __init__(self, out_dim, text_num_embeds, mask_padding=True):
        super().__init__()
        self.text_embed = nn.Embedding(
            text_num_embeds + 1, out_dim
        )  # will use 0 as filler token

        self.mask_padding = mask_padding  # mask filler and batch padding tokens or not

        self.precompute_max_pos = 1024
        self.register_buffer(
            "freqs_cis",
            precompute_freqs_cis(out_dim, self.precompute_max_pos),
            persistent=False,
        )

    def forward(
        self, text: int["b nt"], drop_text=False
    ) -> int["b nt d"]:  # noqa: F722
        text = (
            text + 1
        )  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
        if self.mask_padding:
            text_mask = text == 0

@@ -49,13 +54,17 @@ class TextEmbedding(nn.Module):
        # sinus pos emb
        batch_start = torch.zeros((text.shape[0],), dtype=torch.long)
        batch_text_len = text.shape[1]
        pos_idx = get_pos_embed_indices(
            batch_start, batch_text_len, max_pos=self.precompute_max_pos
        )
        text_pos_embed = self.freqs_cis[pos_idx]

        text = text + text_pos_embed

        if self.mask_padding:
            text = text.masked_fill(
                text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0
            )

        return text

@@ -69,7 +78,9 @@ class AudioEmbedding(nn.Module):
        self.linear = nn.Linear(2 * in_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(out_dim)

    def forward(
        self, x: float["b n d"], cond: float["b n d"], drop_audio_cond=False
    ):  # noqa: F722
        if drop_audio_cond:
            cond = torch.zeros_like(cond)
        x = torch.cat((x, cond), dim=-1)

@@ -99,7 +110,9 @@ class MMDiT(nn.Module):
        super().__init__()

        self.time_embed = TimestepEmbedding(dim)
        self.text_embed = TextEmbedding(
            dim, text_num_embeds, mask_padding=text_mask_padding
        )
        self.text_cond, self.text_uncond = None, None  # text cache
        self.audio_embed = AudioEmbedding(mel_dim, dim)

@@ -187,15 +200,24 @@ class MMDiT(nn.Module):
        # t: conditioning (time), c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        if cfg_infer:  # pack cond & uncond forward: b n d -> 2b n d
            x_cond, c_cond = self.get_input_embed(
                x, cond, text, drop_audio_cond=False, drop_text=False, cache=cache
            )
            x_uncond, c_uncond = self.get_input_embed(
                x, cond, text, drop_audio_cond=True, drop_text=True, cache=cache
            )
            x = torch.cat((x_cond, x_uncond), dim=0)
            c = torch.cat((c_cond, c_uncond), dim=0)
            t = torch.cat((t, t), dim=0)
            mask = torch.cat((mask, mask), dim=0) if mask is not None else None
        else:
            x, c = self.get_input_embed(
                x,
                cond,
                text,
                drop_audio_cond=drop_audio_cond,
                drop_text=drop_text,
                cache=cache,
            )

        seq_len = x.shape[1]
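A toy illustration of the padding convention `TextEmbedding` above relies on: batches are padded with -1 (see `list_str_to_idx`) and then shifted by +1 so that index 0 becomes the reserved filler token. The three-entry vocab is hypothetical.

import torch
from torch.nn.utils.rnn import pad_sequence

vocab = {" ": 0, "a": 1, "b": 2}                               # hypothetical toy vocab
seqs = [torch.tensor([vocab[c] for c in s]) for s in ["ab", "a"]]
text = pad_sequence(seqs, padding_value=-1, batch_first=True)  # pad with -1
print(text + 1)                                                # tensor([[2, 3], [2, 0]]) -> 0 is the filler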
f5_tts/model_new/backbones/unett.py
CHANGED
@@ -17,41 +17,50 @@ from torch import nn
from x_transformers import RMSNorm
from x_transformers.x_transformers import RotaryEmbedding

from f5_tts.model_new.modules import (Attention, AttnProcessor,
                                      ConvNeXtV2Block, ConvPositionEmbedding,
                                      FeedForward, TimestepEmbedding,
                                      get_pos_embed_indices,
                                      precompute_freqs_cis)

# Text embedding


class TextEmbedding(nn.Module):
    def __init__(
        self, text_num_embeds, text_dim, mask_padding=True, conv_layers=0, conv_mult=2
    ):
        super().__init__()
        self.text_embed = nn.Embedding(
            text_num_embeds + 1, text_dim
        )  # use 0 as filler token

        self.mask_padding = mask_padding  # mask filler and batch padding tokens or not

        if conv_layers > 0:
            self.extra_modeling = True
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            self.register_buffer(
                "freqs_cis",
                precompute_freqs_cis(text_dim, self.precompute_max_pos),
                persistent=False,
            )
            self.text_blocks = nn.Sequential(
                *[
                    ConvNeXtV2Block(text_dim, text_dim * conv_mult)
                    for _ in range(conv_layers)
                ]
            )
        else:
            self.extra_modeling = False

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        text = (
            text + 1
        )  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
        text = text[
            :, :seq_len
        ]  # curtail if character tokens are more than the mel spec tokens
        batch, text_len = text.shape[0], text.shape[1]
        text = F.pad(text, (0, seq_len - text_len), value=0)
        if self.mask_padding:

@@ -66,16 +75,22 @@ class TextEmbedding(nn.Module):
        if self.extra_modeling:
            # sinus pos emb
            batch_start = torch.zeros((batch,), dtype=torch.long)
            pos_idx = get_pos_embed_indices(
                batch_start, seq_len, max_pos=self.precompute_max_pos
            )
            text_pos_embed = self.freqs_cis[pos_idx]
            text = text + text_pos_embed

            # convnextv2 blocks
            if self.mask_padding:
                text = text.masked_fill(
                    text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0
                )
                for block in self.text_blocks:
                    text = block(text)
                    text = text.masked_fill(
                        text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0
                    )
            else:
                text = self.text_blocks(text)

@@ -91,7 +106,13 @@ class InputEmbedding(nn.Module):
        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(
        self,
        x: float["b n d"],
        cond: float["b n d"],
        text_embed: float["b n d"],
        drop_audio_cond=False,
    ):  # noqa: F722
        if drop_audio_cond:  # cfg for cond audio
            cond = torch.zeros_like(cond)

@@ -129,7 +150,10 @@ class UNetT(nn.Module):
        if text_dim is None:
            text_dim = mel_dim
        self.text_embed = TextEmbedding(
            text_num_embeds,
            text_dim,
            mask_padding=text_mask_padding,
            conv_layers=conv_layers,
        )
        self.text_cond, self.text_uncond = None, None  # text cache
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)

@@ -161,7 +185,11 @@ class UNetT(nn.Module):
            ff_norm = RMSNorm(dim)
            ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

            skip_proj = (
                nn.Linear(dim * 2, dim, bias=False)
                if needs_skip_proj and is_later_half
                else None
            )

            self.layers.append(
                nn.ModuleList(

@@ -226,13 +254,24 @@ class UNetT(nn.Module):
        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        if cfg_infer:  # pack cond & uncond forward: b n d -> 2b n d
            x_cond = self.get_input_embed(
                x, cond, text, drop_audio_cond=False, drop_text=False, cache=cache
            )
            x_uncond = self.get_input_embed(
                x, cond, text, drop_audio_cond=True, drop_text=True, cache=cache
            )
            x = torch.cat((x_cond, x_uncond), dim=0)
            t = torch.cat((t, t), dim=0)
            mask = torch.cat((mask, mask), dim=0) if mask is not None else None
        else:
            x = self.get_input_embed(
                x,
                cond,
                text,
                drop_audio_cond=drop_audio_cond,
                drop_text=drop_text,
                cache=cache,
            )

        # postfix time t to input x, [b n d] -> [b n+1 d]
        x = torch.cat([t.unsqueeze(1), x], dim=1)  # pack t to x

@@ -244,7 +283,9 @@ class UNetT(nn.Module):
        # flat unet transformer
        skip_connect_type = self.skip_connect_type
        skips = []
        for idx, (maybe_skip_proj, attn_norm, attn, ff_norm, ff) in enumerate(
            self.layers
        ):
            layer = idx + 1

            # skip connection logic
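A minimal sketch of the "concat" skip connection the layer list above prepares: a later-half layer concatenates the activation saved by its mirrored early-half layer and projects the result back to `dim`.

import torch
from torch import nn

dim = 8
skip_proj = nn.Linear(dim * 2, dim, bias=False)  # same shape as skip_proj above
x = torch.randn(2, 16, dim)                      # current hidden states
skip = torch.randn(2, 16, dim)                   # activation popped from the skips stack
x = skip_proj(torch.cat((x, skip), dim=-1))
print(x.shape)  # torch.Size([2, 16, 8])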
f5_tts/model_new/cfm.py
CHANGED
@@ -19,15 +19,9 @@ from torch.nn.utils.rnn import pad_sequence
from torchdiffeq import odeint

from f5_tts.model_new.modules import MelSpec
from f5_tts.model_new.utils import (default, exists, get_epss_timesteps,
                                    lens_to_mask, list_str_to_idx,
                                    list_str_to_tensor, mask_from_frac_lengths)


class CFM(nn.Module):

@@ -139,13 +133,17 @@ class CFM(nn.Module):
        # duplicate test corner for inner time step oberservation
        if duplicate_test:
            test_cond = F.pad(
                cond, (0, 0, cond_seq_len, max_duration - 2 * cond_seq_len), value=0.0
            )

        cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0)
        if no_ref_audio:
            cond = torch.zeros_like(cond)

        cond_mask = F.pad(
            cond_mask, (0, max_duration - cond_mask.shape[-1]), value=False
        )
        cond_mask = cond_mask.unsqueeze(-1)
        step_cond = torch.where(
            cond_mask, cond, torch.zeros_like(cond)

@@ -196,7 +194,11 @@ class CFM(nn.Module):
        for dur in duration:
            if exists(seed):
                torch.manual_seed(seed)
            y0.append(
                torch.randn(
                    dur, self.num_channels, device=self.device, dtype=step_cond.dtype
                )
            )
        y0 = pad_sequence(y0, padding_value=0, batch_first=True)

        t_start = 0

@@ -207,10 +209,14 @@ class CFM(nn.Module):
            y0 = (1 - t_start) * y0 + t_start * test_cond
            steps = int(steps * (1 - t_start))

        if (
            t_start == 0 and use_epss
        ):  # use Empirically Pruned Step Sampling for low NFE
            t = get_epss_timesteps(steps, device=self.device, dtype=step_cond.dtype)
        else:
            t = torch.linspace(
                t_start, 1, steps + 1, device=self.device, dtype=step_cond.dtype
            )
        if sway_sampling_coef is not None:
            t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)

@@ -241,7 +247,12 @@ class CFM(nn.Module):
        inp = inp.permute(0, 2, 1)
        assert inp.shape[-1] == self.num_channels

        batch, seq_len, dtype, device, _σ1 = (
            *inp.shape[:2],
            inp.dtype,
            self.device,
            self.sigma,
        )

        # handle text as string
        if isinstance(text, list):

@@ -255,10 +266,16 @@ class CFM(nn.Module):
        if not exists(lens):
            lens = torch.full((batch,), seq_len, device=device)

        mask = lens_to_mask(
            lens, length=seq_len
        )  # useless here, as collate_fn will pad to max length in batch

        # get a random span to mask out for training conditionally
        frac_lengths = (
            torch.zeros((batch,), device=self.device)
            .float()
            .uniform_(*self.frac_lengths_mask)
        )
        rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)

        if exists(mask):

@@ -292,7 +309,13 @@ class CFM(nn.Module):
        # apply mask will use more memory; might adjust batchsize or batchsampler long sequence threshold
        pred = self.transformer(
            x=φ,
            cond=cond,
            text=text,
            time=time,
            drop_audio_cond=drop_audio_cond,
            drop_text=drop_text,
            mask=mask,
        )

        # flow matching loss
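The sway-sampling line above bends the timestep grid; computed in isolation, a negative coefficient concentrates steps near t = 0 while keeping both endpoints fixed.

import torch

steps, sway_sampling_coef = 8, -1.0
t = torch.linspace(0, 1, steps + 1)
t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
print(t)  # runs from 0.0 to 1.0, with values pulled toward 0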
f5_tts/model_new/dataset.py
CHANGED
@@ -62,7 +62,9 @@ class HFDataset(Dataset):
        audio_tensor = torch.from_numpy(audio).float()

        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(
                sample_rate, self.target_sample_rate
            )
            audio_tensor = resampler(audio_tensor)

        audio_tensor = audio_tensor.unsqueeze(0)  # 't -> 1 t')

@@ -149,7 +151,9 @@ class CustomDataset(Dataset):
        # resample if necessary
        if source_sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(
                source_sample_rate, self.target_sample_rate
            )
            audio = resampler(audio)

        # to mel spectrogram

@@ -173,7 +177,12 @@ class DynamicBatchSampler(Sampler[list[int]]):
    """

    def __init__(
        self,
        sampler: Sampler[int],
        frames_threshold: int,
        max_samples=0,
        random_seed=None,
        drop_residual: bool = False,
    ):
        self.sampler = sampler
        self.frames_threshold = frames_threshold

@@ -185,7 +194,8 @@ class DynamicBatchSampler(Sampler[list[int]]):
        data_source = self.sampler.data_source

        for idx in tqdm(
            self.sampler,
            desc="Sorting with sampler... if slow, check whether dataset is provided with duration",
        ):
            indices.append((idx, data_source.get_frame_len(idx)))
        indices.sort(key=lambda elem: elem[1])

@@ -193,9 +203,12 @@ class DynamicBatchSampler(Sampler[list[int]]):
        batch = []
        batch_frames = 0
        for idx, frame_len in tqdm(
            indices,
            desc=f"Creating dynamic batches with {frames_threshold} audio frames per gpu",
        ):
            if batch_frames + frame_len <= self.frames_threshold and (
                max_samples == 0 or len(batch) < max_samples
            ):
                batch.append(idx)
                batch_frames += frame_len
            else:

@@ -256,7 +269,9 @@ def load_dataset(
    print("Loading dataset ...")

    if dataset_type == "CustomDataset":
        rel_data_path = str(
            files("f5_tts").joinpath(f"../../data/{dataset_name}_{tokenizer}")
        )
        if audio_type == "raw":
            try:
                train_dataset = load_from_disk(f"{rel_data_path}/raw")

@@ -287,7 +302,10 @@ def load_dataset(
            data_dict = json.load(f)
        durations = data_dict["duration"]
        train_dataset = CustomDataset(
            train_dataset,
            durations=durations,
            preprocessed_mel=preprocessed_mel,
            **mel_spec_kwargs,
        )

    elif dataset_type == "HFDataset":

@@ -297,7 +315,11 @@ def load_dataset(
        )
        pre, post = dataset_name.split("_")
        train_dataset = HFDataset(
            load_dataset(
                f"{pre}/{pre}",
                split=f"train.{post}",
                cache_dir=str(files("f5_tts").joinpath("../../data")),
            ),
        )

    return train_dataset
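A usage sketch for the `DynamicBatchSampler` signature shown above; the toy dataset and its `get_frame_len` are stand-ins for a real `CustomDataset`, and the exact iteration behaviour depends on parts of the class not visible in this hunk.

import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler

from f5_tts.model_new.dataset import DynamicBatchSampler  # import path is an assumption


class ToyFrameDataset(Dataset):
    # stand-in: only __len__, __getitem__, and get_frame_len are needed here
    def __init__(self, frame_lens):
        self.frame_lens = frame_lens

    def __len__(self):
        return len(self.frame_lens)

    def __getitem__(self, idx):
        return torch.zeros(self.frame_lens[idx])

    def get_frame_len(self, idx):
        return self.frame_lens[idx]


ds = ToyFrameDataset([120, 300, 450, 200, 90, 610])
batch_sampler = DynamicBatchSampler(
    SequentialSampler(ds), frames_threshold=600, max_samples=4, random_seed=666
)
for batch in DataLoader(ds, batch_sampler=batch_sampler, collate_fn=list):
    print(len(batch))  # variable batch sizes, each capped near 600 total frames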
f5_tts/model_new/modules.py
CHANGED
@@ -6,6 +6,7 @@ nt - text sequence
|
|
6 |
nw - raw wave length
|
7 |
d - dimension
|
8 |
"""
|
|
|
9 |
# flake8: noqa
|
10 |
|
11 |
from __future__ import annotations
|
@@ -22,7 +23,6 @@ from x_transformers.x_transformers import apply_rotary_pos_emb
|
|
22 |
|
23 |
from f5_tts.model_new.utils import is_package_available
|
24 |
|
25 |
-
|
26 |
# raw wav to mel spec
|
27 |
|
28 |
|
@@ -45,15 +45,25 @@ def get_bigvgan_mel_spectrogram(
|
|
45 |
key = f"{n_fft}_{n_mel_channels}_{target_sample_rate}_{hop_length}_{win_length}_{fmin}_{fmax}_{device}"
|
46 |
|
47 |
if key not in mel_basis_cache:
|
48 |
-
mel = librosa_mel_fn(
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
hann_window_cache[key] = torch.hann_window(win_length).to(device)
|
51 |
|
52 |
mel_basis = mel_basis_cache[key]
|
53 |
hann_window = hann_window_cache[key]
|
54 |
|
55 |
padding = (n_fft - hop_length) // 2
|
56 |
-
waveform = torch.nn.functional.pad(
|
|
|
|
|
57 |
|
58 |
spec = torch.stft(
|
59 |
waveform,
|
@@ -115,7 +125,9 @@ class MelSpec(nn.Module):
|
|
115 |
mel_spec_type="vocos",
|
116 |
):
|
117 |
super().__init__()
|
118 |
-
assert mel_spec_type in ["vocos", "bigvgan"], print(
|
|
|
|
|
119 |
|
120 |
self.n_fft = n_fft
|
121 |
self.hop_length = hop_length
|
@@ -196,7 +208,9 @@ class ConvPositionEmbedding(nn.Module):
|
|
196 |
# rotary positional embedding related
|
197 |
|
198 |
|
199 |
-
def precompute_freqs_cis(
|
|
|
|
|
200 |
# proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
|
201 |
# has some connection to NTK literature
|
202 |
# https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
|
@@ -212,10 +226,15 @@ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_resca
|
|
212 |
|
213 |
def get_pos_embed_indices(start, length, max_pos, scale=1.0):
|
214 |
# length = length if isinstance(length, int) else length.max()
|
215 |
-
scale = scale * torch.ones_like(
|
|
|
|
|
216 |
pos = (
|
217 |
start.unsqueeze(1)
|
218 |
-
+ (
|
|
|
|
|
|
|
219 |
)
|
220 |
# avoid extra long error.
|
221 |
pos = torch.where(pos < max_pos, pos, max_pos - 1)
|
@@ -254,7 +273,9 @@ class ConvNeXtV2Block(nn.Module):
|
|
254 |
dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
|
255 |
) # depthwise conv
|
256 |
self.norm = nn.LayerNorm(dim, eps=1e-6)
|
257 |
-
self.pwconv1 = nn.Linear(
|
|
|
|
|
258 |
self.act = nn.GELU()
|
259 |
self.grn = GRN(intermediate_dim)
|
260 |
self.pwconv2 = nn.Linear(intermediate_dim, dim)
|
@@ -286,7 +307,9 @@ class RMSNorm(nn.Module):
|
|
286 |
if self.native_rms_norm:
|
287 |
if self.weight.dtype in [torch.float16, torch.bfloat16]:
|
288 |
x = x.to(self.weight.dtype)
|
289 |
-
x = F.rms_norm(
|
|
|
|
|
290 |
else:
|
291 |
variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
|
292 |
x = x * torch.rsqrt(variance + self.eps)
|
@@ -312,7 +335,9 @@ class AdaLayerNorm(nn.Module):
|
|
312 |
|
313 |
def forward(self, x, emb=None):
|
314 |
emb = self.linear(self.silu(emb))
|
315 |
-
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(
|
|
|
|
|
316 |
|
317 |
x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
|
318 |
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
|
@@ -343,14 +368,18 @@ class AdaLayerNorm_Final(nn.Module):
|
|
343 |
|
344 |
|
345 |
class FeedForward(nn.Module):
|
346 |
-
def __init__(
|
|
|
|
|
347 |
super().__init__()
|
348 |
inner_dim = int(dim * mult)
|
349 |
dim_out = dim_out if dim_out is not None else dim
|
350 |
|
351 |
activation = nn.GELU(approximate=approximate)
|
352 |
project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
|
353 |
-
self.ff = nn.Sequential(
|
|
|
|
|
354 |
|
355 |
def forward(self, x):
|
356 |
return self.ff(x)
|
@@ -375,7 +404,9 @@ class Attention(nn.Module):
|
|
375 |
super().__init__()
|
376 |
|
377 |
if not hasattr(F, "scaled_dot_product_attention"):
|
378 |
-
raise ImportError(
|
|
|
|
|
379 |
|
380 |
self.processor = processor
|
381 |
|
@@ -435,19 +466,23 @@ class Attention(nn.Module):
|
|
435 |
# Attention processor
|
436 |
|
437 |
if is_package_available("flash_attn"):
|
|
|
438 |
from flash_attn.bert_padding import pad_input, unpad_input
|
439 |
-
from flash_attn import flash_attn_varlen_func, flash_attn_func
|
440 |
|
441 |
|
442 |
class AttnProcessor:
|
443 |
def __init__(
|
444 |
self,
|
445 |
-
pe_attn_head:
|
|
|
|
|
446 |
attn_backend: str = "torch", # "torch" or "flash_attn"
|
447 |
attn_mask_enabled: bool = True,
|
448 |
):
|
449 |
if attn_backend == "flash_attn":
|
450 |
-
assert is_package_available(
|
|
|
|
|
451 |
|
452 |
self.pe_attn_head = pe_attn_head
|
453 |
self.attn_backend = attn_backend
|
@@ -483,12 +518,18 @@ class AttnProcessor:
|
|
483 |
# apply rotary position embedding
|
484 |
if rope is not None:
|
485 |
freqs, xpos_scale = rope
|
486 |
-
q_xpos_scale, k_xpos_scale = (
|
|
|
|
|
487 |
|
488 |
if self.pe_attn_head is not None:
|
489 |
pn = self.pe_attn_head
|
490 |
-
query[:, :pn, :, :] = apply_rotary_pos_emb(
|
491 |
-
|
|
|
|
|
|
|
|
|
492 |
else:
|
493 |
query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
|
494 |
key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
|
@@ -498,10 +539,14 @@ class AttnProcessor:
|
|
498 |
if self.attn_mask_enabled and mask is not None:
|
499 |
attn_mask = mask
|
500 |
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
|
501 |
-
attn_mask = attn_mask.expand(
|
|
|
|
|
502 |
else:
|
503 |
attn_mask = None
|
504 |
-
x = F.scaled_dot_product_attention(
|
|
|
|
|
505 |
x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
506 |
|
507 |
elif self.attn_backend == "flash_attn":
|
@@ -509,7 +554,9 @@ class AttnProcessor:
|
|
509 |
key = key.transpose(1, 2)
|
510 |
value = value.transpose(1, 2)
|
511 |
if self.attn_mask_enabled and mask is not None:
|
512 |
-
query, indices, q_cu_seqlens, q_max_seqlen_in_batch, _ = unpad_input(
|
|
|
|
|
513 |
key, _, k_cu_seqlens, k_max_seqlen_in_batch, _ = unpad_input(key, mask)
|
514 |
value, _, _, _, _ = unpad_input(value, mask)
|
515 |
x = flash_attn_varlen_func(
|
@@ -595,12 +642,16 @@ class JointAttnProcessor:
|
|
595 |
# apply rope for context and noised input independently
|
596 |
if rope is not None:
|
597 |
freqs, xpos_scale = rope
|
598 |
-
q_xpos_scale, k_xpos_scale = (
|
|
|
|
|
599 |
query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
|
600 |
key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
|
601 |
if c_rope is not None:
|
602 |
freqs, xpos_scale = c_rope
|
603 |
-
q_xpos_scale, k_xpos_scale = (
|
|
|
|
|
604 |
c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
|
605 |
c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)
|
606 |
|
@@ -613,11 +664,15 @@ class JointAttnProcessor:
|
|
613 |
if mask is not None:
|
614 |
attn_mask = F.pad(mask, (0, c.shape[1]), value=True) # no mask for c (text)
|
615 |
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
|
616 |
-
attn_mask = attn_mask.expand(
|
|
|
|
|
617 |
else:
|
618 |
attn_mask = None
|
619 |
|
620 |
-
x = F.scaled_dot_product_attention(
|
|
|
|
|
621 |
x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
622 |
x = x.to(query.dtype)
|
623 |
|
@@ -675,7 +730,9 @@ class DiTBlock(nn.Module):
|
|
675 |
)
|
676 |
|
677 |
self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
678 |
-
self.ff = FeedForward(
|
|
|
|
|
679 |
|
680 |
def forward(self, x, t, mask=None, rope=None): # x: noised input, t: time embedding
|
681 |
# pre-norm & modulation for attention input
|
@@ -708,14 +765,26 @@ class MMDiTBlock(nn.Module):
|
|
708 |
"""
|
709 |
|
710 |
def __init__(
|
711 |
-
self,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
712 |
):
|
713 |
super().__init__()
|
714 |
if context_dim is None:
|
715 |
context_dim = dim
|
716 |
self.context_pre_only = context_pre_only
|
717 |
|
718 |
-
self.attn_norm_c =
|
|
|
|
|
|
|
|
|
719 |
self.attn_norm_x = AdaLayerNorm(dim)
|
720 |
self.attn = Attention(
|
721 |
processor=JointAttnProcessor(),
|
@@ -729,24 +798,38 @@ class MMDiTBlock(nn.Module):
|
|
729 |
)
|
730 |
|
731 |
if not context_pre_only:
|
732 |
-
self.ff_norm_c = nn.LayerNorm(
|
733 |
-
|
|
|
|
|
|
|
|
|
734 |
else:
|
735 |
self.ff_norm_c = None
|
736 |
self.ff_c = None
|
737 |
self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
738 |
-
self.ff_x = FeedForward(
|
|
|
|
|
739 |
|
740 |
-
def forward(
|
|
|
|
|
741 |
# pre-norm & modulation for attention input
|
742 |
if self.context_pre_only:
|
743 |
norm_c = self.attn_norm_c(c, t)
|
744 |
else:
|
745 |
-
norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(
|
746 |
-
|
|
|
|
|
|
|
|
|
747 |
|
748 |
# attention
|
749 |
-
x_attn_output, c_attn_output = self.attn(
|
|
|
|
|
750 |
|
751 |
# process attention output for context c
|
752 |
if self.context_pre_only:
|
@@ -754,7 +837,9 @@ class MMDiTBlock(nn.Module):
|
|
754 |
else: # if not last layer
|
755 |
c = c + c_gate_msa.unsqueeze(1) * c_attn_output
|
756 |
|
757 |
-
norm_c =
|
|
|
|
|
758 |
c_ff_output = self.ff_c(norm_c)
|
759 |
c = c + c_gate_mlp.unsqueeze(1) * c_ff_output
|
760 |
|
@@ -775,7 +860,9 @@ class TimestepEmbedding(nn.Module):
|
|
775 |
def __init__(self, dim, freq_embed_dim=256):
|
776 |
super().__init__()
|
777 |
self.time_embed = SinusPositionEmbedding(freq_embed_dim)
|
778 |
-
self.time_mlp = nn.Sequential(
|
|
|
|
|
779 |
|
780 |
def forward(self, timestep: float["b"]):
|
781 |
time_hidden = self.time_embed(timestep)
|
|
|
6 |
nw - raw wave length
|
7 |
d - dimension
|
8 |
"""
|
9 |
+
|
10 |
# flake8: noqa
|
11 |
|
12 |
from __future__ import annotations
|
|
|
23 |
|
24 |
from f5_tts.model_new.utils import is_package_available
|
25 |
|
|
|
26 |
# raw wav to mel spec
|
27 |
|
28 |
|
|
|
45 |
key = f"{n_fft}_{n_mel_channels}_{target_sample_rate}_{hop_length}_{win_length}_{fmin}_{fmax}_{device}"
|
46 |
|
47 |
if key not in mel_basis_cache:
|
48 |
+
mel = librosa_mel_fn(
|
49 |
+
sr=target_sample_rate,
|
50 |
+
n_fft=n_fft,
|
51 |
+
n_mels=n_mel_channels,
|
52 |
+
fmin=fmin,
|
53 |
+
fmax=fmax,
|
54 |
+
)
|
55 |
+
mel_basis_cache[key] = (
|
56 |
+
torch.from_numpy(mel).float().to(device)
|
57 |
+
) # TODO: why they need .float()?
|
58 |
hann_window_cache[key] = torch.hann_window(win_length).to(device)
|
59 |
|
60 |
mel_basis = mel_basis_cache[key]
|
61 |
hann_window = hann_window_cache[key]
|
62 |
|
63 |
padding = (n_fft - hop_length) // 2
|
64 |
+
waveform = torch.nn.functional.pad(
|
65 |
+
waveform.unsqueeze(1), (padding, padding), mode="reflect"
|
66 |
+
).squeeze(1)
|
67 |
|
68 |
spec = torch.stft(
|
69 |
waveform,
|
|
|
125 |
mel_spec_type="vocos",
|
126 |
):
|
127 |
super().__init__()
|
128 |
+
assert mel_spec_type in ["vocos", "bigvgan"], print(
|
129 |
+
"We only support two extract mel backend: vocos or bigvgan"
|
130 |
+
)
|
131 |
|
132 |
self.n_fft = n_fft
|
133 |
self.hop_length = hop_length
|
|
|
208 |
# rotary positional embedding related
|
209 |
|
210 |
|
211 |
+
def precompute_freqs_cis(
|
212 |
+
dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0
|
213 |
+
):
|
214 |
# proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
|
215 |
# has some connection to NTK literature
|
216 |
# https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
|
|
|
226 |
|
227 |
def get_pos_embed_indices(start, length, max_pos, scale=1.0):
|
228 |
# length = length if isinstance(length, int) else length.max()
|
229 |
+
scale = scale * torch.ones_like(
|
230 |
+
start, dtype=torch.float32
|
231 |
+
) # in case scale is a scalar
|
232 |
pos = (
|
233 |
start.unsqueeze(1)
|
234 |
+
+ (
|
235 |
+
torch.arange(length, device=start.device, dtype=torch.float32).unsqueeze(0)
|
236 |
+
* scale.unsqueeze(1)
|
237 |
+
).long()
|
238 |
)
|
239 |
# avoid extra long error.
|
240 |
pos = torch.where(pos < max_pos, pos, max_pos - 1)
|
|
|
273 |
dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
|
274 |
) # depthwise conv
|
275 |
self.norm = nn.LayerNorm(dim, eps=1e-6)
|
276 |
+
self.pwconv1 = nn.Linear(
|
277 |
+
dim, intermediate_dim
|
278 |
+
) # pointwise/1x1 convs, implemented with linear layers
|
279 |
self.act = nn.GELU()
|
280 |
self.grn = GRN(intermediate_dim)
|
281 |
self.pwconv2 = nn.Linear(intermediate_dim, dim)
|
|
|
307 |
if self.native_rms_norm:
|
308 |
if self.weight.dtype in [torch.float16, torch.bfloat16]:
|
309 |
x = x.to(self.weight.dtype)
|
310 |
+
x = F.rms_norm(
|
311 |
+
x, normalized_shape=(x.shape[-1],), weight=self.weight, eps=self.eps
|
312 |
+
)
|
313 |
else:
|
314 |
variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
|
315 |
x = x * torch.rsqrt(variance + self.eps)
|
|
|
335 |
|
336 |
def forward(self, x, emb=None):
|
337 |
emb = self.linear(self.silu(emb))
|
338 |
+
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(
|
339 |
+
emb, 6, dim=1
|
340 |
+
)
|
341 |
|
342 |
x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
|
343 |
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
|
|
|
368 |
|
369 |
|
370 |
class FeedForward(nn.Module):
|
371 |
+
def __init__(
|
372 |
+
self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"
|
373 |
+
):
|
374 |
super().__init__()
|
375 |
inner_dim = int(dim * mult)
|
376 |
dim_out = dim_out if dim_out is not None else dim
|
377 |
|
378 |
activation = nn.GELU(approximate=approximate)
|
379 |
project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
|
380 |
+
self.ff = nn.Sequential(
|
381 |
+
project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
|
382 |
+
)
|
383 |
|
384 |
def forward(self, x):
|
385 |
return self.ff(x)
|
|
|
404 |
super().__init__()
|
405 |
|
406 |
if not hasattr(F, "scaled_dot_product_attention"):
|
407 |
+
raise ImportError(
|
408 |
+
"Attention equires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
|
409 |
+
)
|
410 |
|
411 |
self.processor = processor
|
412 |
|
|
|
466 |
# Attention processor
|
467 |
|
468 |
if is_package_available("flash_attn"):
|
469 |
+
from flash_attn import flash_attn_func, flash_attn_varlen_func
|
470 |
from flash_attn.bert_padding import pad_input, unpad_input
|
|
|
471 |
|
472 |
|
473 |
class AttnProcessor:
|
474 |
def __init__(
|
475 |
self,
|
476 |
+
pe_attn_head: (
|
477 |
+
int | None
|
478 |
+
) = None, # number of attention head to apply rope, None for all
|
479 |
attn_backend: str = "torch", # "torch" or "flash_attn"
|
480 |
attn_mask_enabled: bool = True,
|
481 |
):
|
482 |
if attn_backend == "flash_attn":
|
483 |
+
assert is_package_available(
|
484 |
+
"flash_attn"
|
485 |
+
), "Please install flash-attn first."
|
486 |
|
487 |
self.pe_attn_head = pe_attn_head
|
488 |
self.attn_backend = attn_backend
|
|
|
518 |
# apply rotary position embedding
|
519 |
if rope is not None:
|
520 |
freqs, xpos_scale = rope
|
521 |
+
q_xpos_scale, k_xpos_scale = (
|
522 |
+
(xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
|
523 |
+
)
|
524 |
|
525 |
if self.pe_attn_head is not None:
|
526 |
pn = self.pe_attn_head
|
527 |
+
query[:, :pn, :, :] = apply_rotary_pos_emb(
|
528 |
+
query[:, :pn, :, :], freqs, q_xpos_scale
|
529 |
+
)
|
530 |
+
key[:, :pn, :, :] = apply_rotary_pos_emb(
|
531 |
+
key[:, :pn, :, :], freqs, k_xpos_scale
|
532 |
+
)
|
533 |
else:
|
534 |
query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
|
535 |
key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
|
|
|
539 |
if self.attn_mask_enabled and mask is not None:
|
540 |
attn_mask = mask
|
541 |
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
|
542 |
+
attn_mask = attn_mask.expand(
|
543 |
+
batch_size, attn.heads, query.shape[-2], key.shape[-2]
|
544 |
+
)
|
545 |
else:
|
546 |
attn_mask = None
|
547 |
+
x = F.scaled_dot_product_attention(
|
548 |
+
query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False
|
549 |
+
)
|
550 |
x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
551 |
|
552 |
elif self.attn_backend == "flash_attn":
|
|
|
554 |
key = key.transpose(1, 2)
|
555 |
value = value.transpose(1, 2)
|
556 |
if self.attn_mask_enabled and mask is not None:
|
557 |
+
query, indices, q_cu_seqlens, q_max_seqlen_in_batch, _ = unpad_input(
|
558 |
+
query, mask
|
559 |
+
)
|
560 |
key, _, k_cu_seqlens, k_max_seqlen_in_batch, _ = unpad_input(key, mask)
|
561 |
value, _, _, _, _ = unpad_input(value, mask)
|
562 |
x = flash_attn_varlen_func(
|
|
|
642 |
# apply rope for context and noised input independently
|
643 |
if rope is not None:
|
644 |
freqs, xpos_scale = rope
|
645 |
+
q_xpos_scale, k_xpos_scale = (
|
646 |
+
(xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
|
647 |
+
)
|
648 |
query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
|
649 |
key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
|
650 |
if c_rope is not None:
|
651 |
freqs, xpos_scale = c_rope
|
652 |
+
q_xpos_scale, k_xpos_scale = (
|
653 |
+
(xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
|
654 |
+
)
|
655 |
c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
|
656 |
c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)
|
657 |
|
|
|
664 |
if mask is not None:
|
665 |
attn_mask = F.pad(mask, (0, c.shape[1]), value=True) # no mask for c (text)
|
666 |
attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
|
667 |
+
attn_mask = attn_mask.expand(
|
668 |
+
batch_size, attn.heads, query.shape[-2], key.shape[-2]
|
669 |
+
)
|
670 |
else:
|
671 |
attn_mask = None
|
672 |
|
673 |
+
x = F.scaled_dot_product_attention(
|
674 |
+
query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False
|
675 |
+
)
|
676 |
x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
677 |
x = x.to(query.dtype)
|
678 |
|
|
|
730 |
)
|
731 |
|
732 |
self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
733 |
+
self.ff = FeedForward(
|
734 |
+
dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh"
|
735 |
+
)
|
736 |
|
737 |
def forward(self, x, t, mask=None, rope=None): # x: noised input, t: time embedding
|
738 |
# pre-norm & modulation for attention input
|
|
|
765 |
"""
|
766 |
|
767 |
def __init__(
|
768 |
+
self,
|
769 |
+
dim,
|
770 |
+
heads,
|
771 |
+
dim_head,
|
772 |
+
ff_mult=4,
|
773 |
+
dropout=0.1,
|
774 |
+
context_dim=None,
|
775 |
+
context_pre_only=False,
|
776 |
+
qk_norm=None,
|
777 |
):
|
778 |
super().__init__()
|
779 |
if context_dim is None:
|
780 |
context_dim = dim
|
781 |
self.context_pre_only = context_pre_only
|
782 |
|
783 |
+
self.attn_norm_c = (
|
784 |
+
AdaLayerNorm_Final(context_dim)
|
785 |
+
if context_pre_only
|
786 |
+
else AdaLayerNorm(context_dim)
|
787 |
+
)
|
788 |
self.attn_norm_x = AdaLayerNorm(dim)
|
789 |
self.attn = Attention(
|
790 |
processor=JointAttnProcessor(),
|
|
|
798 |
)
|
799 |
|
800 |
if not context_pre_only:
|
801 |
+
self.ff_norm_c = nn.LayerNorm(
|
802 |
+
context_dim, elementwise_affine=False, eps=1e-6
|
803 |
+
)
|
804 |
+
self.ff_c = FeedForward(
|
805 |
+
dim=context_dim, mult=ff_mult, dropout=dropout, approximate="tanh"
|
806 |
+
)
|
807 |
else:
|
808 |
self.ff_norm_c = None
|
809 |
self.ff_c = None
|
810 |
self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
811 |
+
self.ff_x = FeedForward(
|
812 |
+
dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh"
|
813 |
+
)
|
814 |
|
815 |
+
def forward(
|
816 |
+
self, x, c, t, mask=None, rope=None, c_rope=None
|
817 |
+
): # x: noised input, c: context, t: time embedding
|
818 |
# pre-norm & modulation for attention input
|
819 |
if self.context_pre_only:
|
820 |
norm_c = self.attn_norm_c(c, t)
|
821 |
else:
|
822 |
+
norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(
|
823 |
+
c, emb=t
|
824 |
+
)
|
825 |
+
norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(
|
826 |
+
x, emb=t
|
827 |
+
)
|
828 |
|
829 |
# attention
|
830 |
+
x_attn_output, c_attn_output = self.attn(
|
831 |
+
x=norm_x, c=norm_c, mask=mask, rope=rope, c_rope=c_rope
|
832 |
+
)
|
833 |
|
834 |
# process attention output for context c
|
835 |
if self.context_pre_only:
|
|
|
837 |
else: # if not last layer
|
838 |
c = c + c_gate_msa.unsqueeze(1) * c_attn_output
|
839 |
|
840 |
+
norm_c = (
|
841 |
+
self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
|
842 |
+
)
|
843 |
c_ff_output = self.ff_c(norm_c)
|
844 |
c = c + c_gate_mlp.unsqueeze(1) * c_ff_output
|
845 |
|
|
|
860 |
def __init__(self, dim, freq_embed_dim=256):
|
861 |
super().__init__()
|
862 |
self.time_embed = SinusPositionEmbedding(freq_embed_dim)
|
863 |
+
self.time_mlp = nn.Sequential(
|
864 |
+
nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim)
|
865 |
+
)
|
866 |
|
867 |
def forward(self, timestep: float["b"]):
|
868 |
time_hidden = self.time_embed(timestep)
|
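The last modules.py hunk above wraps the `TimestepEmbedding` MLP. The underlying idea is a fixed sinusoidal encoding of the flow-matching timestep followed by a small Linear-SiLU-Linear projection. A standalone sketch, assuming a 256-dim frequency embedding and a 1000x time scale (these constants are illustrative, not read from the repo config):

```python
import math
import torch
import torch.nn as nn

class SinusTimeEmbedding(nn.Module):
    """Sinusoidal timestep encoding + MLP, mirroring the pattern in the hunk above."""

    def __init__(self, dim, freq_embed_dim=256):
        super().__init__()
        self.freq_embed_dim = freq_embed_dim
        self.mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))

    def forward(self, t):  # t: (b,) timesteps in [0, 1]
        half = self.freq_embed_dim // 2
        freqs = torch.exp(-math.log(10000) * torch.arange(half, device=t.device) / (half - 1))
        args = t[:, None].float() * freqs[None] * 1000.0  # assumed scale factor
        emb = torch.cat([args.sin(), args.cos()], dim=-1)
        return self.mlp(emb)

print(SinusTimeEmbedding(dim=512)(torch.rand(4)).shape)  # torch.Size([4, 512])
```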
f5_tts/model_new/trainer.py
CHANGED
@@ -19,7 +19,6 @@ from f5_tts.model import CFM
|
|
19 |
from f5_tts.model.dataset import DynamicBatchSampler, collate_fn
|
20 |
from f5_tts.model.utils import default, exists
|
21 |
|
22 |
-
|
23 |
# trainer
|
24 |
|
25 |
|
@@ -70,7 +69,13 @@ class Trainer:
|
|
70 |
self.logger = logger
|
71 |
if self.logger == "wandb":
|
72 |
if exists(wandb_resume_id):
|
73 |
-
init_kwargs = {
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
else:
|
75 |
init_kwargs = {"wandb": {"resume": "allow", "name": wandb_run_name}}
|
76 |
|
@@ -138,7 +143,9 @@ class Trainer:
|
|
138 |
self.optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate)
|
139 |
else:
|
140 |
self.optimizer = AdamW(model.parameters(), lr=learning_rate)
|
141 |
-
self.model, self.optimizer = self.accelerator.prepare(
|
|
|
|
|
142 |
|
143 |
@property
|
144 |
def is_main(self):
|
@@ -157,12 +164,16 @@ class Trainer:
|
|
157 |
if not os.path.exists(self.checkpoint_path):
|
158 |
os.makedirs(self.checkpoint_path)
|
159 |
if last:
|
160 |
-
self.accelerator.save(
|
|
|
|
|
161 |
print(f"Saved last checkpoint at update {update}")
|
162 |
else:
|
163 |
if self.keep_last_n_checkpoints == 0:
|
164 |
return
|
165 |
-
self.accelerator.save(
|
|
|
|
|
166 |
if self.keep_last_n_checkpoints > 0:
|
167 |
# Updated logic to exclude pretrained model from rotation
|
168 |
checkpoints = [
|
@@ -183,7 +194,10 @@ class Trainer:
|
|
183 |
if (
|
184 |
not exists(self.checkpoint_path)
|
185 |
or not os.path.exists(self.checkpoint_path)
|
186 |
-
or not any(
|
|
|
|
|
|
|
187 |
):
|
188 |
return 0
|
189 |
|
@@ -195,11 +209,16 @@ class Trainer:
|
|
195 |
all_checkpoints = [
|
196 |
f
|
197 |
for f in os.listdir(self.checkpoint_path)
|
198 |
-
if (f.startswith("model_") or f.startswith("pretrained_"))
|
|
|
199 |
]
|
200 |
|
201 |
# First try to find regular training checkpoints
|
202 |
-
training_checkpoints = [
|
|
|
|
|
|
|
|
|
203 |
if training_checkpoints:
|
204 |
latest_checkpoint = sorted(
|
205 |
training_checkpoints,
|
@@ -207,21 +226,30 @@ class Trainer:
|
|
207 |
)[-1]
|
208 |
else:
|
209 |
# If no training checkpoints, use pretrained model
|
210 |
-
latest_checkpoint = next(
|
|
|
|
|
211 |
|
212 |
if latest_checkpoint.endswith(".safetensors"): # always a pretrained checkpoint
|
213 |
from safetensors.torch import load_file
|
214 |
|
215 |
-
checkpoint = load_file(
|
|
|
|
|
216 |
checkpoint = {"ema_model_state_dict": checkpoint}
|
217 |
elif latest_checkpoint.endswith(".pt"):
|
218 |
# checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", map_location=self.accelerator.device) # rather use accelerator.load_state ಥ_ಥ
|
219 |
checkpoint = torch.load(
|
220 |
-
f"{self.checkpoint_path}/{latest_checkpoint}",
|
|
|
|
|
221 |
)
|
222 |
|
223 |
# patch for backward compatibility, 305e3ea
|
224 |
-
for key in [
|
|
|
|
|
|
|
225 |
if key in checkpoint["ema_model_state_dict"]:
|
226 |
del checkpoint["ema_model_state_dict"][key]
|
227 |
|
@@ -231,17 +259,24 @@ class Trainer:
|
|
231 |
if "update" in checkpoint or "step" in checkpoint:
|
232 |
# patch for backward compatibility, with before f992c4e
|
233 |
if "step" in checkpoint:
|
234 |
-
checkpoint["update"] =
|
|
|
|
|
235 |
if self.grad_accumulation_steps > 1 and self.is_main:
|
236 |
print(
|
237 |
"F5-TTS WARNING: Loading checkpoint saved with per_steps logic (before f992c4e), will convert to per_updates according to grad_accumulation_steps setting, may have unexpected behaviour."
|
238 |
)
|
239 |
# patch for backward compatibility, 305e3ea
|
240 |
-
for key in [
|
|
|
|
|
|
|
241 |
if key in checkpoint["model_state_dict"]:
|
242 |
del checkpoint["model_state_dict"][key]
|
243 |
|
244 |
-
self.accelerator.unwrap_model(self.model).load_state_dict(
|
|
|
|
|
245 |
self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
|
246 |
if self.scheduler:
|
247 |
self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
|
@@ -252,21 +287,30 @@ class Trainer:
|
|
252 |
for k, v in checkpoint["ema_model_state_dict"].items()
|
253 |
if k not in ["initted", "update", "step"]
|
254 |
}
|
255 |
-
self.accelerator.unwrap_model(self.model).load_state_dict(
|
|
|
|
|
256 |
update = 0
|
257 |
|
258 |
del checkpoint
|
259 |
gc.collect()
|
260 |
return update
|
261 |
|
262 |
-
def train(
|
|
|
|
|
263 |
if self.log_samples:
|
264 |
-
from f5_tts.infer.utils_infer import cfg_strength, load_vocoder,
|
|
|
265 |
|
266 |
vocoder = load_vocoder(
|
267 |
-
vocoder_name=self.vocoder_name,
|
|
|
|
|
268 |
)
|
269 |
-
target_sample_rate = self.accelerator.unwrap_model(
|
|
|
|
|
270 |
log_samples_path = f"{self.checkpoint_path}/samples"
|
271 |
os.makedirs(log_samples_path, exist_ok=True)
|
272 |
|
@@ -306,7 +350,9 @@ class Trainer:
|
|
306 |
batch_sampler=batch_sampler,
|
307 |
)
|
308 |
else:
|
309 |
-
raise ValueError(
|
|
|
|
|
310 |
|
311 |
# accelerator.prepare() dispatches batches to devices;
|
312 |
# which means the length of dataloader calculated before, should consider the number of devices
|
@@ -314,12 +360,24 @@ class Trainer:
|
|
314 |
self.num_warmup_updates * self.accelerator.num_processes
|
315 |
) # consider a fixed warmup steps while using accelerate multi-gpu ddp
|
316 |
# otherwise by default with split_batches=False, warmup steps change with num_processes
|
317 |
-
total_updates =
|
|
|
|
|
|
|
318 |
decay_updates = total_updates - warmup_updates
|
319 |
-
warmup_scheduler = LinearLR(
|
320 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
321 |
self.scheduler = SequentialLR(
|
322 |
-
self.optimizer,
|
|
|
|
|
323 |
)
|
324 |
train_dataloader, self.scheduler = self.accelerator.prepare(
|
325 |
train_dataloader, self.scheduler
|
@@ -332,21 +390,27 @@ class Trainer:
|
|
332 |
start_step = start_update * self.grad_accumulation_steps
|
333 |
skipped_epoch = int(start_step // orig_epoch_step)
|
334 |
skipped_batch = start_step % orig_epoch_step
|
335 |
-
skipped_dataloader = self.accelerator.skip_first_batches(
|
|
|
|
|
336 |
else:
|
337 |
skipped_epoch = 0
|
338 |
|
339 |
for epoch in range(skipped_epoch, self.epochs):
|
340 |
self.model.train()
|
341 |
if exists(resumable_with_seed) and epoch == skipped_epoch:
|
342 |
-
progress_bar_initial = math.ceil(
|
|
|
|
|
343 |
current_dataloader = skipped_dataloader
|
344 |
else:
|
345 |
progress_bar_initial = 0
|
346 |
current_dataloader = train_dataloader
|
347 |
|
348 |
# Set epoch for the batch sampler if it exists
|
349 |
-
if hasattr(train_dataloader, "batch_sampler") and hasattr(
|
|
|
|
|
350 |
train_dataloader.batch_sampler.set_epoch(epoch)
|
351 |
|
352 |
progress_bar = tqdm(
|
@@ -364,17 +428,29 @@ class Trainer:
|
|
364 |
mel_lengths = batch["mel_lengths"]
|
365 |
|
366 |
# TODO. add duration predictor training
|
367 |
-
if
|
368 |
-
|
369 |
-
self.accelerator.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
370 |
|
371 |
loss, cond, pred = self.model(
|
372 |
-
mel_spec,
|
|
|
|
|
|
|
373 |
)
|
374 |
self.accelerator.backward(loss)
|
375 |
|
376 |
if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
|
377 |
-
self.accelerator.clip_grad_norm_(
|
|
|
|
|
378 |
|
379 |
self.optimizer.step()
|
380 |
self.scheduler.step()
|
@@ -386,29 +462,44 @@ class Trainer:
|
|
386 |
|
387 |
global_update += 1
|
388 |
progress_bar.update(1)
|
389 |
-
progress_bar.set_postfix(
|
|
|
|
|
390 |
|
391 |
if self.accelerator.is_local_main_process:
|
392 |
self.accelerator.log(
|
393 |
-
{"loss": loss.item(), "lr": self.scheduler.get_last_lr()[0]},
|
|
|
394 |
)
|
395 |
if self.logger == "tensorboard":
|
396 |
self.writer.add_scalar("loss", loss.item(), global_update)
|
397 |
-
self.writer.add_scalar(
|
|
|
|
|
398 |
|
399 |
-
if
|
|
|
|
|
|
|
400 |
self.save_checkpoint(global_update, last=True)
|
401 |
|
402 |
-
if
|
|
|
|
|
|
|
403 |
self.save_checkpoint(global_update)
|
404 |
|
405 |
if self.log_samples and self.accelerator.is_local_main_process:
|
406 |
ref_audio_len = mel_lengths[0]
|
407 |
infer_text = [
|
408 |
-
text_inputs[0]
|
|
|
|
|
409 |
]
|
410 |
with torch.inference_mode():
|
411 |
-
generated, _ = self.accelerator.unwrap_model(
|
|
|
|
|
412 |
cond=mel_spec[0][:ref_audio_len].unsqueeze(0),
|
413 |
text=infer_text,
|
414 |
duration=ref_audio_len * 2,
|
@@ -417,7 +508,11 @@ class Trainer:
|
|
417 |
sway_sampling_coef=sway_sampling_coef,
|
418 |
)
|
419 |
generated = generated.to(torch.float32)
|
420 |
-
gen_mel_spec =
|
|
|
|
|
|
|
|
|
421 |
ref_mel_spec = batch["mel"][0].unsqueeze(0)
|
422 |
if self.vocoder_name == "vocos":
|
423 |
gen_audio = vocoder.decode(gen_mel_spec).cpu()
|
@@ -427,10 +522,14 @@ class Trainer:
|
|
427 |
ref_audio = vocoder(ref_mel_spec).squeeze(0).cpu()
|
428 |
|
429 |
torchaudio.save(
|
430 |
-
f"{log_samples_path}/update_{global_update}_gen.wav",
|
|
|
|
|
431 |
)
|
432 |
torchaudio.save(
|
433 |
-
f"{log_samples_path}/update_{global_update}_ref.wav",
|
|
|
|
|
434 |
)
|
435 |
self.model.train()
|
436 |
|
|
|
19 |
from f5_tts.model.dataset import DynamicBatchSampler, collate_fn
|
20 |
from f5_tts.model.utils import default, exists
|
21 |
|
|
|
22 |
# trainer
|
23 |
|
24 |
|
|
|
69 |
self.logger = logger
|
70 |
if self.logger == "wandb":
|
71 |
if exists(wandb_resume_id):
|
72 |
+
init_kwargs = {
|
73 |
+
"wandb": {
|
74 |
+
"resume": "allow",
|
75 |
+
"name": wandb_run_name,
|
76 |
+
"id": wandb_resume_id,
|
77 |
+
}
|
78 |
+
}
|
79 |
else:
|
80 |
init_kwargs = {"wandb": {"resume": "allow", "name": wandb_run_name}}
|
81 |
|
|
|
143 |
self.optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate)
|
144 |
else:
|
145 |
self.optimizer = AdamW(model.parameters(), lr=learning_rate)
|
146 |
+
self.model, self.optimizer = self.accelerator.prepare(
|
147 |
+
self.model, self.optimizer
|
148 |
+
)
|
149 |
|
150 |
@property
|
151 |
def is_main(self):
|
|
|
164 |
if not os.path.exists(self.checkpoint_path):
|
165 |
os.makedirs(self.checkpoint_path)
|
166 |
if last:
|
167 |
+
self.accelerator.save(
|
168 |
+
checkpoint, f"{self.checkpoint_path}/model_last.pt"
|
169 |
+
)
|
170 |
print(f"Saved last checkpoint at update {update}")
|
171 |
else:
|
172 |
if self.keep_last_n_checkpoints == 0:
|
173 |
return
|
174 |
+
self.accelerator.save(
|
175 |
+
checkpoint, f"{self.checkpoint_path}/model_{update}.pt"
|
176 |
+
)
|
177 |
if self.keep_last_n_checkpoints > 0:
|
178 |
# Updated logic to exclude pretrained model from rotation
|
179 |
checkpoints = [
|
|
|
194 |
if (
|
195 |
not exists(self.checkpoint_path)
|
196 |
or not os.path.exists(self.checkpoint_path)
|
197 |
+
or not any(
|
198 |
+
filename.endswith((".pt", ".safetensors"))
|
199 |
+
for filename in os.listdir(self.checkpoint_path)
|
200 |
+
)
|
201 |
):
|
202 |
return 0
|
203 |
|
|
|
209 |
all_checkpoints = [
|
210 |
f
|
211 |
for f in os.listdir(self.checkpoint_path)
|
212 |
+
if (f.startswith("model_") or f.startswith("pretrained_"))
|
213 |
+
and f.endswith((".pt", ".safetensors"))
|
214 |
]
|
215 |
|
216 |
# First try to find regular training checkpoints
|
217 |
+
training_checkpoints = [
|
218 |
+
f
|
219 |
+
for f in all_checkpoints
|
220 |
+
if f.startswith("model_") and f != "model_last.pt"
|
221 |
+
]
|
222 |
if training_checkpoints:
|
223 |
latest_checkpoint = sorted(
|
224 |
training_checkpoints,
|
|
|
226 |
)[-1]
|
227 |
else:
|
228 |
# If no training checkpoints, use pretrained model
|
229 |
+
latest_checkpoint = next(
|
230 |
+
f for f in all_checkpoints if f.startswith("pretrained_")
|
231 |
+
)
|
232 |
|
233 |
if latest_checkpoint.endswith(".safetensors"): # always a pretrained checkpoint
|
234 |
from safetensors.torch import load_file
|
235 |
|
236 |
+
checkpoint = load_file(
|
237 |
+
f"{self.checkpoint_path}/{latest_checkpoint}", device="cpu"
|
238 |
+
)
|
239 |
checkpoint = {"ema_model_state_dict": checkpoint}
|
240 |
elif latest_checkpoint.endswith(".pt"):
|
241 |
# checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", map_location=self.accelerator.device) # rather use accelerator.load_state ಥ_ಥ
|
242 |
checkpoint = torch.load(
|
243 |
+
f"{self.checkpoint_path}/{latest_checkpoint}",
|
244 |
+
weights_only=True,
|
245 |
+
map_location="cpu",
|
246 |
)
|
247 |
|
248 |
# patch for backward compatibility, 305e3ea
|
249 |
+
for key in [
|
250 |
+
"ema_model.mel_spec.mel_stft.mel_scale.fb",
|
251 |
+
"ema_model.mel_spec.mel_stft.spectrogram.window",
|
252 |
+
]:
|
253 |
if key in checkpoint["ema_model_state_dict"]:
|
254 |
del checkpoint["ema_model_state_dict"][key]
|
255 |
|
|
|
259 |
if "update" in checkpoint or "step" in checkpoint:
|
260 |
# patch for backward compatibility, with before f992c4e
|
261 |
if "step" in checkpoint:
|
262 |
+
checkpoint["update"] = (
|
263 |
+
checkpoint["step"] // self.grad_accumulation_steps
|
264 |
+
)
|
265 |
if self.grad_accumulation_steps > 1 and self.is_main:
|
266 |
print(
|
267 |
"F5-TTS WARNING: Loading checkpoint saved with per_steps logic (before f992c4e), will convert to per_updates according to grad_accumulation_steps setting, may have unexpected behaviour."
|
268 |
)
|
269 |
# patch for backward compatibility, 305e3ea
|
270 |
+
for key in [
|
271 |
+
"mel_spec.mel_stft.mel_scale.fb",
|
272 |
+
"mel_spec.mel_stft.spectrogram.window",
|
273 |
+
]:
|
274 |
if key in checkpoint["model_state_dict"]:
|
275 |
del checkpoint["model_state_dict"][key]
|
276 |
|
277 |
+
self.accelerator.unwrap_model(self.model).load_state_dict(
|
278 |
+
checkpoint["model_state_dict"]
|
279 |
+
)
|
280 |
self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
|
281 |
if self.scheduler:
|
282 |
self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
|
|
|
287 |
for k, v in checkpoint["ema_model_state_dict"].items()
|
288 |
if k not in ["initted", "update", "step"]
|
289 |
}
|
290 |
+
self.accelerator.unwrap_model(self.model).load_state_dict(
|
291 |
+
checkpoint["model_state_dict"]
|
292 |
+
)
|
293 |
update = 0
|
294 |
|
295 |
del checkpoint
|
296 |
gc.collect()
|
297 |
return update
|
298 |
|
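The `load_checkpoint` hunks above reformat two behaviours: `.safetensors` files are read with safetensors and wrapped as an EMA state dict, `.pt` files are read with `torch.load(weights_only=True)` on CPU, and the old mel-spectrogram buffers are dropped for backward compatibility. A condensed sketch of that branch (the example path is illustrative, not a real checkpoint):

```python
import torch

def load_ckpt_state(path):
    """Condensed sketch of the checkpoint-loading branch shown in the diff above."""
    if path.endswith(".safetensors"):
        from safetensors.torch import load_file
        state = {"ema_model_state_dict": load_file(path, device="cpu")}
    else:
        state = torch.load(path, weights_only=True, map_location="cpu")
    # drop mel-spectrogram buffers kept only for backward compatibility
    for key in ("mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window"):
        state.get("model_state_dict", {}).pop(key, None)
    return state

# e.g. state = load_ckpt_state("ckpts/model_last.pt")  # hypothetical path
```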
299 |
+
def train(
|
300 |
+
self, train_dataset: Dataset, num_workers=16, resumable_with_seed: int = None
|
301 |
+
):
|
302 |
if self.log_samples:
|
303 |
+
from f5_tts.infer.utils_infer import (cfg_strength, load_vocoder,
|
304 |
+
nfe_step, sway_sampling_coef)
|
305 |
|
306 |
vocoder = load_vocoder(
|
307 |
+
vocoder_name=self.vocoder_name,
|
308 |
+
is_local=self.is_local_vocoder,
|
309 |
+
local_path=self.local_vocoder_path,
|
310 |
)
|
311 |
+
target_sample_rate = self.accelerator.unwrap_model(
|
312 |
+
self.model
|
313 |
+
).mel_spec.target_sample_rate
|
314 |
log_samples_path = f"{self.checkpoint_path}/samples"
|
315 |
os.makedirs(log_samples_path, exist_ok=True)
|
316 |
|
|
|
350 |
batch_sampler=batch_sampler,
|
351 |
)
|
352 |
else:
|
353 |
+
raise ValueError(
|
354 |
+
f"batch_size_type must be either 'sample' or 'frame', but received {self.batch_size_type}"
|
355 |
+
)
|
356 |
|
357 |
# accelerator.prepare() dispatches batches to devices;
|
358 |
# which means the length of dataloader calculated before, should consider the number of devices
|
|
|
360 |
self.num_warmup_updates * self.accelerator.num_processes
|
361 |
) # consider a fixed warmup steps while using accelerate multi-gpu ddp
|
362 |
# otherwise by default with split_batches=False, warmup steps change with num_processes
|
363 |
+
total_updates = (
|
364 |
+
math.ceil(len(train_dataloader) / self.grad_accumulation_steps)
|
365 |
+
* self.epochs
|
366 |
+
)
|
367 |
decay_updates = total_updates - warmup_updates
|
368 |
+
warmup_scheduler = LinearLR(
|
369 |
+
self.optimizer,
|
370 |
+
start_factor=1e-8,
|
371 |
+
end_factor=1.0,
|
372 |
+
total_iters=warmup_updates,
|
373 |
+
)
|
374 |
+
decay_scheduler = LinearLR(
|
375 |
+
self.optimizer, start_factor=1.0, end_factor=1e-8, total_iters=decay_updates
|
376 |
+
)
|
377 |
self.scheduler = SequentialLR(
|
378 |
+
self.optimizer,
|
379 |
+
schedulers=[warmup_scheduler, decay_scheduler],
|
380 |
+
milestones=[warmup_updates],
|
381 |
)
|
382 |
train_dataloader, self.scheduler = self.accelerator.prepare(
|
383 |
train_dataloader, self.scheduler
|
|
|
390 |
start_step = start_update * self.grad_accumulation_steps
|
391 |
skipped_epoch = int(start_step // orig_epoch_step)
|
392 |
skipped_batch = start_step % orig_epoch_step
|
393 |
+
skipped_dataloader = self.accelerator.skip_first_batches(
|
394 |
+
train_dataloader, num_batches=skipped_batch
|
395 |
+
)
|
396 |
else:
|
397 |
skipped_epoch = 0
|
398 |
|
399 |
for epoch in range(skipped_epoch, self.epochs):
|
400 |
self.model.train()
|
401 |
if exists(resumable_with_seed) and epoch == skipped_epoch:
|
402 |
+
progress_bar_initial = math.ceil(
|
403 |
+
skipped_batch / self.grad_accumulation_steps
|
404 |
+
)
|
405 |
current_dataloader = skipped_dataloader
|
406 |
else:
|
407 |
progress_bar_initial = 0
|
408 |
current_dataloader = train_dataloader
|
409 |
|
410 |
# Set epoch for the batch sampler if it exists
|
411 |
+
if hasattr(train_dataloader, "batch_sampler") and hasattr(
|
412 |
+
train_dataloader.batch_sampler, "set_epoch"
|
413 |
+
):
|
414 |
train_dataloader.batch_sampler.set_epoch(epoch)
|
415 |
|
416 |
progress_bar = tqdm(
|
|
|
428 |
mel_lengths = batch["mel_lengths"]
|
429 |
|
430 |
# TODO. add duration predictor training
|
431 |
+
if (
|
432 |
+
self.duration_predictor is not None
|
433 |
+
and self.accelerator.is_local_main_process
|
434 |
+
):
|
435 |
+
dur_loss = self.duration_predictor(
|
436 |
+
mel_spec, lens=batch.get("durations")
|
437 |
+
)
|
438 |
+
self.accelerator.log(
|
439 |
+
{"duration loss": dur_loss.item()}, step=global_update
|
440 |
+
)
|
441 |
|
442 |
loss, cond, pred = self.model(
|
443 |
+
mel_spec,
|
444 |
+
text=text_inputs,
|
445 |
+
lens=mel_lengths,
|
446 |
+
noise_scheduler=self.noise_scheduler,
|
447 |
)
|
448 |
self.accelerator.backward(loss)
|
449 |
|
450 |
if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
|
451 |
+
self.accelerator.clip_grad_norm_(
|
452 |
+
self.model.parameters(), self.max_grad_norm
|
453 |
+
)
|
454 |
|
455 |
self.optimizer.step()
|
456 |
self.scheduler.step()
|
|
|
462 |
|
463 |
global_update += 1
|
464 |
progress_bar.update(1)
|
465 |
+
progress_bar.set_postfix(
|
466 |
+
update=str(global_update), loss=loss.item()
|
467 |
+
)
|
468 |
|
469 |
if self.accelerator.is_local_main_process:
|
470 |
self.accelerator.log(
|
471 |
+
{"loss": loss.item(), "lr": self.scheduler.get_last_lr()[0]},
|
472 |
+
step=global_update,
|
473 |
)
|
474 |
if self.logger == "tensorboard":
|
475 |
self.writer.add_scalar("loss", loss.item(), global_update)
|
476 |
+
self.writer.add_scalar(
|
477 |
+
"lr", self.scheduler.get_last_lr()[0], global_update
|
478 |
+
)
|
479 |
|
480 |
+
if (
|
481 |
+
global_update % self.last_per_updates == 0
|
482 |
+
and self.accelerator.sync_gradients
|
483 |
+
):
|
484 |
self.save_checkpoint(global_update, last=True)
|
485 |
|
486 |
+
if (
|
487 |
+
global_update % self.save_per_updates == 0
|
488 |
+
and self.accelerator.sync_gradients
|
489 |
+
):
|
490 |
self.save_checkpoint(global_update)
|
491 |
|
492 |
if self.log_samples and self.accelerator.is_local_main_process:
|
493 |
ref_audio_len = mel_lengths[0]
|
494 |
infer_text = [
|
495 |
+
text_inputs[0]
|
496 |
+
+ ([" "] if isinstance(text_inputs[0], list) else " ")
|
497 |
+
+ text_inputs[0]
|
498 |
]
|
499 |
with torch.inference_mode():
|
500 |
+
generated, _ = self.accelerator.unwrap_model(
|
501 |
+
self.model
|
502 |
+
).sample(
|
503 |
cond=mel_spec[0][:ref_audio_len].unsqueeze(0),
|
504 |
text=infer_text,
|
505 |
duration=ref_audio_len * 2,
|
|
|
508 |
sway_sampling_coef=sway_sampling_coef,
|
509 |
)
|
510 |
generated = generated.to(torch.float32)
|
511 |
+
gen_mel_spec = (
|
512 |
+
generated[:, ref_audio_len:, :]
|
513 |
+
.permute(0, 2, 1)
|
514 |
+
.to(self.accelerator.device)
|
515 |
+
)
|
516 |
ref_mel_spec = batch["mel"][0].unsqueeze(0)
|
517 |
if self.vocoder_name == "vocos":
|
518 |
gen_audio = vocoder.decode(gen_mel_spec).cpu()
|
|
|
522 |
ref_audio = vocoder(ref_mel_spec).squeeze(0).cpu()
|
523 |
|
524 |
torchaudio.save(
|
525 |
+
f"{log_samples_path}/update_{global_update}_gen.wav",
|
526 |
+
gen_audio,
|
527 |
+
target_sample_rate,
|
528 |
)
|
529 |
torchaudio.save(
|
530 |
+
f"{log_samples_path}/update_{global_update}_ref.wav",
|
531 |
+
ref_audio,
|
532 |
+
target_sample_rate,
|
533 |
)
|
534 |
self.model.train()
|
535 |
|
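The scheduler hunk above chains a linear warmup with a linear decay through `SequentialLR`, taking one `scheduler.step()` per optimizer update. A standalone sketch of that schedule, using a placeholder model and made-up update counts (the real counts come from the dataloader length, epochs, and grad-accumulation setting):

```python
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR, SequentialLR

model = torch.nn.Linear(8, 8)                      # placeholder model
optimizer = AdamW(model.parameters(), lr=7.5e-5)   # illustrative learning rate

warmup_updates, decay_updates = 1000, 9000
warmup = LinearLR(optimizer, start_factor=1e-8, end_factor=1.0, total_iters=warmup_updates)
decay = LinearLR(optimizer, start_factor=1.0, end_factor=1e-8, total_iters=decay_updates)
scheduler = SequentialLR(optimizer, schedulers=[warmup, decay], milestones=[warmup_updates])

for _ in range(5):       # one scheduler step per optimizer update
    optimizer.step()
    scheduler.step()
print(optimizer.param_groups[0]["lr"])  # still ramping up during warmup
```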
f5_tts/model_new/utils.py
CHANGED
@@ -10,7 +10,6 @@ import torch
|
|
10 |
from pypinyin import Style, lazy_pinyin
|
11 |
from torch.nn.utils.rnn import pad_sequence
|
12 |
|
13 |
-
|
14 |
# seed everything
|
15 |
|
16 |
|
@@ -48,7 +47,9 @@ def is_package_available(package_name: str) -> bool:
|
|
48 |
# tensor helpers
|
49 |
|
50 |
|
51 |
-
def lens_to_mask(
|
|
|
|
|
52 |
if not exists(length):
|
53 |
length = t.amax()
|
54 |
|
@@ -56,7 +57,9 @@ def lens_to_mask(t: int["b"], length: int | None = None) -> bool["b n"]: # noqa
|
|
56 |
return seq[None, :] < t[:, None]
|
57 |
|
58 |
|
59 |
-
def mask_from_start_end_indices(
|
|
|
|
|
60 |
max_seq_len = seq_len.max().item()
|
61 |
seq = torch.arange(max_seq_len, device=start.device).long()
|
62 |
start_mask = seq[None, :] >= start[:, None]
|
@@ -64,7 +67,9 @@ def mask_from_start_end_indices(seq_len: int["b"], start: int["b"], end: int["b"
|
|
64 |
return start_mask & end_mask
|
65 |
|
66 |
|
67 |
-
def mask_from_frac_lengths(
|
|
|
|
|
68 |
lengths = (frac_lengths * seq_len).long()
|
69 |
max_start = seq_len - lengths
|
70 |
|
@@ -75,7 +80,9 @@ def mask_from_frac_lengths(seq_len: int["b"], frac_lengths: float["b"]): # noqa
|
|
75 |
return mask_from_start_end_indices(seq_len, start, end)
|
76 |
|
77 |
|
78 |
-
def maybe_masked_mean(
|
|
|
|
|
79 |
if not exists(mask):
|
80 |
return t.mean(dim=1)
|
81 |
|
@@ -99,7 +106,9 @@ def list_str_to_idx(
|
|
99 |
vocab_char_map: dict[str, int], # {char: idx}
|
100 |
padding_value=-1,
|
101 |
) -> int["b nt"]: # noqa: F722
|
102 |
-
list_idx_tensors = [
|
|
|
|
|
103 |
text = pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
|
104 |
return text
|
105 |
|
@@ -118,13 +127,18 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
|
|
118 |
- if use "byte", set to 256 (unicode byte range)
|
119 |
"""
|
120 |
if tokenizer in ["pinyin", "char"]:
|
121 |
-
tokenizer_path = os.path.join(
|
|
|
|
|
|
|
122 |
with open(tokenizer_path, "r", encoding="utf-8") as f:
|
123 |
vocab_char_map = {}
|
124 |
for i, char in enumerate(f):
|
125 |
vocab_char_map[char[:-1]] = i
|
126 |
vocab_size = len(vocab_char_map)
|
127 |
-
assert
|
|
|
|
|
128 |
|
129 |
elif tokenizer == "byte":
|
130 |
vocab_char_map = None
|
@@ -154,9 +168,7 @@ def convert_char_to_pinyin(text_list, polyphone=True):
|
|
154 |
) # add custom trans here, to address oov
|
155 |
|
156 |
def is_chinese(c):
|
157 |
-
return
|
158 |
-
"\u3100" <= c <= "\u9fff" # common chinese characters
|
159 |
-
)
|
160 |
|
161 |
for text in text_list:
|
162 |
char_list = []
|
@@ -167,7 +179,9 @@ def convert_char_to_pinyin(text_list, polyphone=True):
|
|
167 |
if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
|
168 |
char_list.append(" ")
|
169 |
char_list.extend(seg)
|
170 |
-
elif polyphone and seg_byte_len == 3 * len(
|
|
|
|
|
171 |
seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
|
172 |
for i, c in enumerate(seg):
|
173 |
if is_chinese(c):
|
@@ -179,7 +193,9 @@ def convert_char_to_pinyin(text_list, polyphone=True):
|
|
179 |
char_list.extend(c)
|
180 |
elif is_chinese(c):
|
181 |
char_list.append(" ")
|
182 |
-
char_list.extend(
|
|
|
|
|
183 |
else:
|
184 |
char_list.append(c)
|
185 |
final_text_list.append(char_list)
|
|
|
10 |
from pypinyin import Style, lazy_pinyin
|
11 |
from torch.nn.utils.rnn import pad_sequence
|
12 |
|
|
|
13 |
# seed everything
|
14 |
|
15 |
|
|
|
47 |
# tensor helpers
|
48 |
|
49 |
|
50 |
+
def lens_to_mask(
|
51 |
+
t: int["b"], length: int | None = None
|
52 |
+
) -> bool["b n"]: # noqa: F722 F821
|
53 |
if not exists(length):
|
54 |
length = t.amax()
|
55 |
|
|
|
57 |
return seq[None, :] < t[:, None]
|
58 |
|
59 |
|
60 |
+
def mask_from_start_end_indices(
|
61 |
+
seq_len: int["b"], start: int["b"], end: int["b"]
|
62 |
+
): # noqa: F722 F821
|
63 |
max_seq_len = seq_len.max().item()
|
64 |
seq = torch.arange(max_seq_len, device=start.device).long()
|
65 |
start_mask = seq[None, :] >= start[:, None]
|
|
|
67 |
return start_mask & end_mask
|
68 |
|
69 |
|
70 |
+
def mask_from_frac_lengths(
|
71 |
+
seq_len: int["b"], frac_lengths: float["b"]
|
72 |
+
): # noqa: F722 F821
|
73 |
lengths = (frac_lengths * seq_len).long()
|
74 |
max_start = seq_len - lengths
|
75 |
|
|
|
80 |
return mask_from_start_end_indices(seq_len, start, end)
|
81 |
|
82 |
|
83 |
+
def maybe_masked_mean(
|
84 |
+
t: float["b n d"], mask: bool["b n"] = None
|
85 |
+
) -> float["b d"]: # noqa: F722
|
86 |
if not exists(mask):
|
87 |
return t.mean(dim=1)
|
88 |
|
|
|
106 |
vocab_char_map: dict[str, int], # {char: idx}
|
107 |
padding_value=-1,
|
108 |
) -> int["b nt"]: # noqa: F722
|
109 |
+
list_idx_tensors = [
|
110 |
+
torch.tensor([vocab_char_map.get(c, 0) for c in t]) for t in text
|
111 |
+
] # pinyin or char style
|
112 |
text = pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
|
113 |
return text
|
114 |
|
|
|
127 |
- if use "byte", set to 256 (unicode byte range)
|
128 |
"""
|
129 |
if tokenizer in ["pinyin", "char"]:
|
130 |
+
tokenizer_path = os.path.join(
|
131 |
+
files("f5_tts").joinpath("../../data"),
|
132 |
+
f"{dataset_name}_{tokenizer}/vocab.txt",
|
133 |
+
)
|
134 |
with open(tokenizer_path, "r", encoding="utf-8") as f:
|
135 |
vocab_char_map = {}
|
136 |
for i, char in enumerate(f):
|
137 |
vocab_char_map[char[:-1]] = i
|
138 |
vocab_size = len(vocab_char_map)
|
139 |
+
assert (
|
140 |
+
vocab_char_map[" "] == 0
|
141 |
+
), "make sure space is of idx 0 in vocab.txt, cuz 0 is used for unknown char"
|
142 |
|
143 |
elif tokenizer == "byte":
|
144 |
vocab_char_map = None
|
|
|
168 |
) # add custom trans here, to address oov
|
169 |
|
170 |
def is_chinese(c):
|
171 |
+
return "\u3100" <= c <= "\u9fff" # common chinese characters
|
|
|
|
|
172 |
|
173 |
for text in text_list:
|
174 |
char_list = []
|
|
|
179 |
if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
|
180 |
char_list.append(" ")
|
181 |
char_list.extend(seg)
|
182 |
+
elif polyphone and seg_byte_len == 3 * len(
|
183 |
+
seg
|
184 |
+
): # if pure east asian characters
|
185 |
seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
|
186 |
for i, c in enumerate(seg):
|
187 |
if is_chinese(c):
|
|
|
193 |
char_list.extend(c)
|
194 |
elif is_chinese(c):
|
195 |
char_list.append(" ")
|
196 |
+
char_list.extend(
|
197 |
+
lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True)
|
198 |
+
)
|
199 |
else:
|
200 |
char_list.append(c)
|
201 |
final_text_list.append(char_list)
|
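The utils.py hunks above only reflow the signatures of the mask helpers. The central one, `lens_to_mask`, turns per-sample lengths into a boolean padding mask; a runnable sketch with toy lengths:

```python
import torch

def lens_to_mask(lens, max_len=None):
    """Per-sample lengths (b,) -> boolean mask (b, n), True for valid frames."""
    if max_len is None:
        max_len = int(lens.amax())
    seq = torch.arange(max_len, device=lens.device)
    return seq[None, :] < lens[:, None]

print(lens_to_mask(torch.tensor([3, 5])))
# tensor([[ True,  True,  True, False, False],
#         [ True,  True,  True,  True,  True]])
```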
f5_tts/runtime/triton_trtllm/benchmark.py
CHANGED
@@ -51,7 +51,6 @@ from torch.utils.data import DataLoader, DistributedSampler
|
|
51 |
from tqdm import tqdm
|
52 |
from vocos import Vocos
|
53 |
|
54 |
-
|
55 |
torch.manual_seed(0)
|
56 |
|
57 |
|
@@ -64,7 +63,9 @@ def get_args():
|
|
64 |
choices=["wenetspeech4tts", "test_zh", "test_en", "test_hard"],
|
65 |
help="huggingface dataset split name",
|
66 |
)
|
67 |
-
parser.add_argument(
|
|
|
|
|
68 |
parser.add_argument(
|
69 |
"--vocab-file",
|
70 |
required=True,
|
@@ -89,8 +90,12 @@ def get_args():
|
|
89 |
type=int,
|
90 |
help="batch size (per-device) for inference",
|
91 |
)
|
92 |
-
parser.add_argument(
|
93 |
-
|
|
|
|
|
|
|
|
|
94 |
parser.add_argument(
|
95 |
"--vocoder",
|
96 |
default="vocos",
|
@@ -105,8 +110,16 @@ def get_args():
|
|
105 |
)
|
106 |
parser.add_argument("--enable-warmup", action="store_true")
|
107 |
parser.add_argument("--remove-input-padding", action="store_true")
|
108 |
-
parser.add_argument(
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
args = parser.parse_args()
|
111 |
return args
|
112 |
|
@@ -126,7 +139,13 @@ def data_collator(batch, vocab_char_map, device="cuda", use_perf=False):
|
|
126 |
torch.cuda.nvtx.range_push("data_collator")
|
127 |
target_sample_rate = 24000
|
128 |
target_rms = 0.1
|
129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
[],
|
131 |
[],
|
132 |
[],
|
@@ -170,7 +189,14 @@ def data_collator(batch, vocab_char_map, device="cuda", use_perf=False):
|
|
170 |
ref_mel_len_list.append(ref_mel_len)
|
171 |
|
172 |
estimated_reference_target_mel_len.append(
|
173 |
-
int(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
)
|
175 |
|
176 |
max_seq_len = max(estimated_reference_target_mel_len)
|
@@ -182,12 +208,22 @@ def data_collator(batch, vocab_char_map, device="cuda", use_perf=False):
|
|
182 |
|
183 |
for i, item in enumerate(text_pad_sequence):
|
184 |
text_pad_sequence[i] = F.pad(
|
185 |
-
item,
|
|
|
|
|
|
|
186 |
)
|
187 |
-
text_pad_sequence[
|
188 |
-
|
|
|
|
|
|
|
|
|
189 |
text_pad_sequence = F.pad(
|
190 |
-
text_pad_sequence,
|
|
|
|
|
|
|
191 |
)
|
192 |
if use_perf:
|
193 |
torch.cuda.nvtx.range_pop()
|
@@ -252,7 +288,9 @@ def convert_char_to_pinyin(reference_target_texts_list, polyphone=True):
|
|
252 |
if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
|
253 |
char_list.append(" ")
|
254 |
char_list.extend(seg)
|
255 |
-
elif polyphone and seg_byte_len == 3 * len(
|
|
|
|
|
256 |
seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
|
257 |
for i, c in enumerate(seg):
|
258 |
if is_chinese(c):
|
@@ -264,7 +302,9 @@ def convert_char_to_pinyin(reference_target_texts_list, polyphone=True):
|
|
264 |
char_list.extend(c)
|
265 |
elif is_chinese(c):
|
266 |
char_list.append(" ")
|
267 |
-
char_list.extend(
|
|
|
|
|
268 |
else:
|
269 |
char_list.append(c)
|
270 |
final_reference_target_texts_list.append(char_list)
|
@@ -277,13 +317,20 @@ def list_str_to_idx(
|
|
277 |
vocab_char_map: Dict[str, int], # {char: idx}
|
278 |
padding_value=-1,
|
279 |
):
|
280 |
-
list_idx_tensors = [
|
|
|
|
|
281 |
# text = pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
|
282 |
return list_idx_tensors
|
283 |
|
284 |
|
285 |
def load_vocoder(
|
286 |
-
vocoder_name="vocos",
|
|
|
|
|
|
|
|
|
|
|
287 |
):
|
288 |
if vocoder_name == "vocos":
|
289 |
if vocoder_trt_engine_path is not None:
|
@@ -297,8 +344,14 @@ def load_vocoder(
|
|
297 |
else:
|
298 |
print("Download Vocos from huggingface charactr/vocos-mel-24khz")
|
299 |
repo_id = "charactr/vocos-mel-24khz"
|
300 |
-
config_path = hf_hub_download(
|
301 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
302 |
vocoder = Vocos.from_hparams(config_path)
|
303 |
state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
|
304 |
from vocos.feature_extractors import EncodecFeatures
|
@@ -343,14 +396,21 @@ class VocosTensorRT:
|
|
343 |
with open(engine_path, "rb") as f:
|
344 |
engine_buffer = f.read()
|
345 |
self.session = Session.from_serialized_engine(engine_buffer)
|
346 |
-
self.stream =
|
|
|
|
|
347 |
|
348 |
def decode(self, mels):
|
349 |
mels = mels.contiguous()
|
350 |
inputs = {"mel": mels}
|
351 |
-
output_info = self.session.infer_shapes(
|
|
|
|
|
352 |
outputs = {
|
353 |
-
t.name: torch.empty(
|
|
|
|
|
|
|
354 |
}
|
355 |
ok = self.session.run(inputs, outputs, self.stream)
|
356 |
|
@@ -376,12 +436,18 @@ def main():
|
|
376 |
config = json.load(f)
|
377 |
if args.backend_type == "trt":
|
378 |
model = F5TTS(
|
379 |
-
config,
|
|
|
|
|
|
|
|
|
380 |
)
|
381 |
elif args.backend_type == "pytorch":
|
382 |
import sys
|
383 |
|
384 |
-
sys.path.append(
|
|
|
|
|
385 |
from f5_tts.infer.utils_infer import load_model
|
386 |
from f5_tts.model import DiT
|
387 |
|
@@ -398,7 +464,9 @@ def main():
|
|
398 |
model = load_model(DiT, F5TTS_model_cfg, args.model_path)
|
399 |
|
400 |
vocoder = load_vocoder(
|
401 |
-
vocoder_name=args.vocoder,
|
|
|
|
|
402 |
)
|
403 |
|
404 |
dataset = load_dataset(
|
@@ -411,7 +479,9 @@ def main():
|
|
411 |
prompt_audio_len = example["prompt_audio"]["array"].shape[0]
|
412 |
scale_factor = 1 + len(example["target_text"]) / len(example["prompt_text"])
|
413 |
estimated_duration = prompt_audio_len * scale_factor
|
414 |
-
example["estimated_duration"] =
|
|
|
|
|
415 |
return example
|
416 |
|
417 |
dataset = dataset.map(add_estimated_duration)
|
@@ -442,12 +512,18 @@ def main():
|
|
442 |
|
443 |
if args.enable_warmup:
|
444 |
for batch in dataloader:
|
445 |
-
ref_mels, ref_mel_lens = batch["ref_mel_batch"].to(device), batch[
|
|
|
|
|
446 |
text_pad_seq = batch["text_pad_sequence"].to(device)
|
447 |
total_mel_lens = batch["estimated_reference_target_mel_len"]
|
448 |
if args.backend_type == "trt":
|
449 |
_ = model.sample(
|
450 |
-
text_pad_seq,
|
|
|
|
|
|
|
|
|
451 |
)
|
452 |
elif args.backend_type == "pytorch":
|
453 |
with torch.inference_mode():
|
@@ -475,7 +551,9 @@ def main():
|
|
475 |
for batch in dataloader:
|
476 |
if args.use_perf:
|
477 |
torch.cuda.nvtx.range_push("data sample")
|
478 |
-
ref_mels, ref_mel_lens = batch["ref_mel_batch"].to(device), batch[
|
|
|
|
|
479 |
text_pad_seq = batch["text_pad_sequence"].to(device)
|
480 |
total_mel_lens = batch["estimated_reference_target_mel_len"]
|
481 |
|
|
|
51 |
from tqdm import tqdm
|
52 |
from vocos import Vocos
|
53 |
|
|
|
54 |
torch.manual_seed(0)
|
55 |
|
56 |
|
|
|
63 |
choices=["wenetspeech4tts", "test_zh", "test_en", "test_hard"],
|
64 |
help="huggingface dataset split name",
|
65 |
)
|
66 |
+
parser.add_argument(
|
67 |
+
"--output-dir", required=True, type=str, help="dir to save result"
|
68 |
+
)
|
69 |
parser.add_argument(
|
70 |
"--vocab-file",
|
71 |
required=True,
|
|
|
90 |
type=int,
|
91 |
help="batch size (per-device) for inference",
|
92 |
)
|
93 |
+
parser.add_argument(
|
94 |
+
"--num-workers", type=int, default=0, help="workers for dataloader"
|
95 |
+
)
|
96 |
+
parser.add_argument(
|
97 |
+
"--prefetch", type=int, default=None, help="prefetch for dataloader"
|
98 |
+
)
|
99 |
parser.add_argument(
|
100 |
"--vocoder",
|
101 |
default="vocos",
|
|
|
110 |
)
|
111 |
parser.add_argument("--enable-warmup", action="store_true")
|
112 |
parser.add_argument("--remove-input-padding", action="store_true")
|
113 |
+
parser.add_argument(
|
114 |
+
"--use-perf", action="store_true", help="use nvtx to record performance"
|
115 |
+
)
|
116 |
+
parser.add_argument(
|
117 |
+
"--backend-type",
|
118 |
+
type=str,
|
119 |
+
default="triton",
|
120 |
+
choices=["trt", "pytorch"],
|
121 |
+
help="backend type",
|
122 |
+
)
|
123 |
args = parser.parse_args()
|
124 |
return args
|
125 |
|
|
|
139 |
torch.cuda.nvtx.range_push("data_collator")
|
140 |
target_sample_rate = 24000
|
141 |
target_rms = 0.1
|
142 |
+
(
|
143 |
+
ids,
|
144 |
+
ref_mel_list,
|
145 |
+
ref_mel_len_list,
|
146 |
+
estimated_reference_target_mel_len,
|
147 |
+
reference_target_texts_list,
|
148 |
+
) = (
|
149 |
[],
|
150 |
[],
|
151 |
[],
|
|
|
189 |
ref_mel_len_list.append(ref_mel_len)
|
190 |
|
191 |
estimated_reference_target_mel_len.append(
|
192 |
+
int(
|
193 |
+
ref_mel.shape[0]
|
194 |
+
* (
|
195 |
+
1
|
196 |
+
+ len(target_text.encode("utf-8"))
|
197 |
+
/ len(prompt_text.encode("utf-8"))
|
198 |
+
)
|
199 |
+
)
|
200 |
)
|
201 |
|
202 |
max_seq_len = max(estimated_reference_target_mel_len)
|
|
|
208 |
|
209 |
for i, item in enumerate(text_pad_sequence):
|
210 |
text_pad_sequence[i] = F.pad(
|
211 |
+
item,
|
212 |
+
(0, estimated_reference_target_mel_len[i] - len(item)),
|
213 |
+
mode="constant",
|
214 |
+
value=-1,
|
215 |
)
|
216 |
+
text_pad_sequence[
|
217 |
+
i
|
218 |
+
] += 1 # WAR: 0 is reserved for padding token, hard coding in F5-TTS
|
219 |
+
text_pad_sequence = pad_sequence(
|
220 |
+
text_pad_sequence, padding_value=-1, batch_first=True
|
221 |
+
).to(device)
|
222 |
text_pad_sequence = F.pad(
|
223 |
+
text_pad_sequence,
|
224 |
+
(0, max_seq_len - text_pad_sequence.shape[1]),
|
225 |
+
mode="constant",
|
226 |
+
value=-1,
|
227 |
)
|
228 |
if use_perf:
|
229 |
torch.cuda.nvtx.range_pop()
|
|
|
288 |
if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
|
289 |
char_list.append(" ")
|
290 |
char_list.extend(seg)
|
291 |
+
elif polyphone and seg_byte_len == 3 * len(
|
292 |
+
seg
|
293 |
+
): # if pure east asian characters
|
294 |
seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
|
295 |
for i, c in enumerate(seg):
|
296 |
if is_chinese(c):
|
|
|
302 |
char_list.extend(c)
|
303 |
elif is_chinese(c):
|
304 |
char_list.append(" ")
|
305 |
+
char_list.extend(
|
306 |
+
lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True)
|
307 |
+
)
|
308 |
else:
|
309 |
char_list.append(c)
|
310 |
final_reference_target_texts_list.append(char_list)
|
|
|
317 |
vocab_char_map: Dict[str, int], # {char: idx}
|
318 |
padding_value=-1,
|
319 |
):
|
320 |
+
list_idx_tensors = [
|
321 |
+
torch.tensor([vocab_char_map.get(c, 0) for c in t]) for t in text
|
322 |
+
] # pinyin or char style
|
323 |
# text = pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
|
324 |
return list_idx_tensors
|
325 |
|
326 |
|
327 |
def load_vocoder(
|
328 |
+
vocoder_name="vocos",
|
329 |
+
is_local=False,
|
330 |
+
local_path="",
|
331 |
+
device="cuda",
|
332 |
+
hf_cache_dir=None,
|
333 |
+
vocoder_trt_engine_path=None,
|
334 |
):
|
335 |
if vocoder_name == "vocos":
|
336 |
if vocoder_trt_engine_path is not None:
|
|
|
344 |
else:
|
345 |
print("Download Vocos from huggingface charactr/vocos-mel-24khz")
|
346 |
repo_id = "charactr/vocos-mel-24khz"
|
347 |
+
config_path = hf_hub_download(
|
348 |
+
repo_id=repo_id, cache_dir=hf_cache_dir, filename="config.yaml"
|
349 |
+
)
|
350 |
+
model_path = hf_hub_download(
|
351 |
+
repo_id=repo_id,
|
352 |
+
cache_dir=hf_cache_dir,
|
353 |
+
filename="pytorch_model.bin",
|
354 |
+
)
|
355 |
vocoder = Vocos.from_hparams(config_path)
|
356 |
state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
|
357 |
from vocos.feature_extractors import EncodecFeatures
|
|
|
396 |
with open(engine_path, "rb") as f:
|
397 |
engine_buffer = f.read()
|
398 |
self.session = Session.from_serialized_engine(engine_buffer)
|
399 |
+
self.stream = (
|
400 |
+
stream if stream is not None else torch.cuda.current_stream().cuda_stream
|
401 |
+
)
|
402 |
|
403 |
def decode(self, mels):
|
404 |
mels = mels.contiguous()
|
405 |
inputs = {"mel": mels}
|
406 |
+
output_info = self.session.infer_shapes(
|
407 |
+
[TensorInfo("mel", trt.DataType.FLOAT, mels.shape)]
|
408 |
+
)
|
409 |
outputs = {
|
410 |
+
t.name: torch.empty(
|
411 |
+
tuple(t.shape), dtype=trt_dtype_to_torch(t.dtype), device="cuda"
|
412 |
+
)
|
413 |
+
for t in output_info
|
414 |
}
|
415 |
ok = self.session.run(inputs, outputs, self.stream)
|
416 |
|
|
|
436 |
config = json.load(f)
|
437 |
if args.backend_type == "trt":
|
438 |
model = F5TTS(
|
439 |
+
config,
|
440 |
+
debug_mode=False,
|
441 |
+
tllm_model_dir=tllm_model_dir,
|
442 |
+
model_path=args.model_path,
|
443 |
+
vocab_size=vocab_size,
|
444 |
)
|
445 |
elif args.backend_type == "pytorch":
|
446 |
import sys
|
447 |
|
448 |
+
sys.path.append(
|
449 |
+
f"{os.path.dirname(os.path.abspath(__file__))}/../../../../src/"
|
450 |
+
)
|
451 |
from f5_tts.infer.utils_infer import load_model
|
452 |
from f5_tts.model import DiT
|
453 |
|
|
|
464 |
model = load_model(DiT, F5TTS_model_cfg, args.model_path)
|
465 |
|
466 |
vocoder = load_vocoder(
|
467 |
+
vocoder_name=args.vocoder,
|
468 |
+
device=device,
|
469 |
+
vocoder_trt_engine_path=args.vocoder_trt_engine_path,
|
470 |
)
|
471 |
|
472 |
dataset = load_dataset(
|
|
|
479 |
prompt_audio_len = example["prompt_audio"]["array"].shape[0]
|
480 |
scale_factor = 1 + len(example["target_text"]) / len(example["prompt_text"])
|
481 |
estimated_duration = prompt_audio_len * scale_factor
|
482 |
+
example["estimated_duration"] = (
|
483 |
+
estimated_duration / example["prompt_audio"]["sampling_rate"]
|
484 |
+
)
|
485 |
return example
|
486 |
|
487 |
dataset = dataset.map(add_estimated_duration)
|
|
|
512 |
|
513 |
if args.enable_warmup:
|
514 |
for batch in dataloader:
|
515 |
+
ref_mels, ref_mel_lens = batch["ref_mel_batch"].to(device), batch[
|
516 |
+
"ref_mel_len_batch"
|
517 |
+
].to(device)
|
518 |
text_pad_seq = batch["text_pad_sequence"].to(device)
|
519 |
total_mel_lens = batch["estimated_reference_target_mel_len"]
|
520 |
if args.backend_type == "trt":
|
521 |
_ = model.sample(
|
522 |
+
text_pad_seq,
|
523 |
+
ref_mels,
|
524 |
+
ref_mel_lens,
|
525 |
+
total_mel_lens,
|
526 |
+
remove_input_padding=args.remove_input_padding,
|
527 |
)
|
528 |
elif args.backend_type == "pytorch":
|
529 |
with torch.inference_mode():
|
|
|
551 |
for batch in dataloader:
|
552 |
if args.use_perf:
|
553 |
torch.cuda.nvtx.range_push("data sample")
|
554 |
+
ref_mels, ref_mel_lens = batch["ref_mel_batch"].to(device), batch[
|
555 |
+
"ref_mel_len_batch"
|
556 |
+
].to(device)
|
557 |
text_pad_seq = batch["text_pad_sequence"].to(device)
|
558 |
total_mel_lens = batch["estimated_reference_target_mel_len"]
|
559 |
|
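The collator hunk in benchmark.py above estimates how many mel frames the generated audio will need by scaling the reference mel length with the byte-length ratio of target text to prompt text. The heuristic in isolation, with made-up numbers:

```python
def estimate_total_mel_len(ref_mel_len, prompt_text, target_text):
    """Reference length scaled by 1 + len(target bytes) / len(prompt bytes)."""
    scale = 1 + len(target_text.encode("utf-8")) / len(prompt_text.encode("utf-8"))
    return int(ref_mel_len * scale)

# a 250-frame reference whose target text is roughly 2.6x the prompt's byte length
print(estimate_total_mel_len(250, "hello there", "hello there, nice to meet you"))  # 909
```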
f5_tts/runtime/triton_trtllm/client_grpc.py
CHANGED
@@ -64,8 +64,12 @@ def write_triton_stats(stats, summary_file):
|
|
64 |
"The log is parsing from triton_client.get_inference_statistics(), to better human readability. \n"
|
65 |
)
|
66 |
summary_f.write("To learn more about the log, please refer to: \n")
|
67 |
-
summary_f.write(
|
68 |
-
|
|
|
|
|
|
|
|
|
69 |
summary_f.write(
|
70 |
"To better improve throughput, we always would like let requests wait in the queue for a while, and then execute them with a larger batch size. \n"
|
71 |
)
|
@@ -86,7 +90,9 @@ def write_triton_stats(stats, summary_file):
|
|
86 |
total_queue_time_s = int(model_inference_stats["queue"]["ns"]) / 1e9
|
87 |
total_infer_time_s = int(model_inference_stats["compute_infer"]["ns"]) / 1e9
|
88 |
total_input_time_s = int(model_inference_stats["compute_input"]["ns"]) / 1e9
|
89 |
-
total_output_time_s =
|
|
|
|
|
90 |
summary_f.write(
|
91 |
f"queue time {total_queue_time_s:<5.2f} s, compute infer time {total_infer_time_s:<5.2f} s, compute input time {total_input_time_s:<5.2f} s, compute output time {total_output_time_s:<5.2f} s \n" # noqa
|
92 |
)
|
@@ -97,7 +103,11 @@ def write_triton_stats(stats, summary_file):
|
|
97 |
compute_output = batch["compute_output"]
|
98 |
compute_infer = batch["compute_infer"]
|
99 |
batch_count = int(compute_infer["count"])
|
100 |
-
assert
|
|
|
|
|
|
|
|
|
101 |
compute_infer_time_ms = int(compute_infer["ns"]) / 1e6
|
102 |
compute_input_time_ms = int(compute_input["ns"]) / 1e6
|
103 |
compute_output_time_ms = int(compute_output["ns"]) / 1e6
|
@@ -113,7 +123,9 @@ def write_triton_stats(stats, summary_file):
|
|
113 |
|
114 |
|
115 |
def get_args():
|
116 |
-
parser = argparse.ArgumentParser(
|
|
|
|
|
117 |
|
118 |
parser.add_argument(
|
119 |
"--server-addr",
|
@@ -254,7 +266,9 @@ async def send(
|
|
254 |
for i, item in enumerate(manifest_item_list):
|
255 |
if i % log_interval == 0:
|
256 |
print(f"{name}: {i}/{len(manifest_item_list)}")
|
257 |
-
waveform, sample_rate = load_audio(
|
|
|
|
|
258 |
duration = len(waveform) / sample_rate
|
259 |
lengths = np.array([[len(waveform)]], dtype=np.int32)
|
260 |
|
@@ -269,7 +283,10 @@ async def send(
|
|
269 |
1,
|
270 |
padding_duration
|
271 |
* sample_rate
|
272 |
-
* (
|
|
|
|
|
|
|
273 |
),
|
274 |
dtype=np.float32,
|
275 |
)
|
@@ -281,8 +298,12 @@ async def send(
|
|
281 |
samples = samples.reshape(1, -1).astype(np.float32)
|
282 |
|
283 |
inputs = [
|
284 |
-
protocol_client.InferInput(
|
285 |
-
|
|
|
|
|
|
|
|
|
286 |
protocol_client.InferInput("reference_text", [1, 1], "BYTES"),
|
287 |
protocol_client.InferInput("target_text", [1, 1], "BYTES"),
|
288 |
]
|
@@ -301,13 +322,17 @@ async def send(
|
|
301 |
|
302 |
sequence_id = 100000000 + i + task_id * 10
|
303 |
start = time.time()
|
304 |
-
response = await triton_client.infer(
|
|
|
|
|
305 |
|
306 |
audio = response.as_numpy("waveform").reshape(-1)
|
307 |
|
308 |
end = time.time() - start
|
309 |
|
310 |
-
audio_save_path = os.path.join(
|
|
|
|
|
311 |
sf.write(audio_save_path, audio, save_sample_rate, "PCM_16")
|
312 |
|
313 |
actual_duration = len(audio) / save_sample_rate
|
@@ -341,7 +366,9 @@ def load_manifests(manifest_path):
|
|
341 |
def split_data(data, k):
|
342 |
n = len(data)
|
343 |
if n < k:
|
344 |
-
print(
|
|
|
|
|
345 |
k = n
|
346 |
|
347 |
quotient = n // k
|
@@ -461,7 +488,9 @@ async def main():
|
|
461 |
stats = await triton_client.get_inference_statistics(model_name="", as_json=True)
|
462 |
write_triton_stats(stats, f"{args.log_dir}/stats_summary-{name}.txt")
|
463 |
|
464 |
-
metadata = await triton_client.get_model_config(
|
|
|
|
|
465 |
with open(f"{args.log_dir}/model_config-{name}.json", "w") as f:
|
466 |
json.dump(metadata, f, indent=4)
|
467 |
|
|
|
64 |
"The log is parsing from triton_client.get_inference_statistics(), to better human readability. \n"
|
65 |
)
|
66 |
summary_f.write("To learn more about the log, please refer to: \n")
|
67 |
+
summary_f.write(
|
68 |
+
"1. https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md \n"
|
69 |
+
)
|
70 |
+
summary_f.write(
|
71 |
+
"2. https://github.com/triton-inference-server/server/issues/5374 \n\n"
|
72 |
+
)
|
73 |
summary_f.write(
|
74 |
"To better improve throughput, we always would like let requests wait in the queue for a while, and then execute them with a larger batch size. \n"
|
75 |
)
|
|
|
90 |
total_queue_time_s = int(model_inference_stats["queue"]["ns"]) / 1e9
|
91 |
total_infer_time_s = int(model_inference_stats["compute_infer"]["ns"]) / 1e9
|
92 |
total_input_time_s = int(model_inference_stats["compute_input"]["ns"]) / 1e9
|
93 |
+
total_output_time_s = (
|
94 |
+
int(model_inference_stats["compute_output"]["ns"]) / 1e9
|
95 |
+
)
|
96 |
summary_f.write(
|
97 |
f"queue time {total_queue_time_s:<5.2f} s, compute infer time {total_infer_time_s:<5.2f} s, compute input time {total_input_time_s:<5.2f} s, compute output time {total_output_time_s:<5.2f} s \n" # noqa
|
98 |
)
|
|
|
103 |
compute_output = batch["compute_output"]
|
104 |
compute_infer = batch["compute_infer"]
|
105 |
batch_count = int(compute_infer["count"])
|
106 |
+
assert (
|
107 |
+
compute_infer["count"]
|
108 |
+
== compute_output["count"]
|
109 |
+
== compute_input["count"]
|
110 |
+
)
|
111 |
compute_infer_time_ms = int(compute_infer["ns"]) / 1e6
|
112 |
compute_input_time_ms = int(compute_input["ns"]) / 1e6
|
113 |
compute_output_time_ms = int(compute_output["ns"]) / 1e6
|
|
|
123 |
|
124 |
|
125 |
def get_args():
|
126 |
+
parser = argparse.ArgumentParser(
|
127 |
+
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
128 |
+
)
|
129 |
|
130 |
parser.add_argument(
|
131 |
"--server-addr",
|
|
|
266 |
for i, item in enumerate(manifest_item_list):
|
267 |
if i % log_interval == 0:
|
268 |
print(f"{name}: {i}/{len(manifest_item_list)}")
|
269 |
+
waveform, sample_rate = load_audio(
|
270 |
+
item["audio_filepath"], target_sample_rate=24000
|
271 |
+
)
|
272 |
duration = len(waveform) / sample_rate
|
273 |
lengths = np.array([[len(waveform)]], dtype=np.int32)
|
274 |
|
|
|
283 |
1,
|
284 |
padding_duration
|
285 |
* sample_rate
|
286 |
+
* (
|
287 |
+
(int(estimated_target_duration + duration) // padding_duration)
|
288 |
+
+ 1
|
289 |
+
),
|
290 |
),
|
291 |
dtype=np.float32,
|
292 |
)
|
|
|
298 |
samples = samples.reshape(1, -1).astype(np.float32)
|
299 |
|
300 |
inputs = [
|
301 |
+
protocol_client.InferInput(
|
302 |
+
"reference_wav", samples.shape, np_to_triton_dtype(samples.dtype)
|
303 |
+
),
|
304 |
+
protocol_client.InferInput(
|
305 |
+
"reference_wav_len", lengths.shape, np_to_triton_dtype(lengths.dtype)
|
306 |
+
),
|
307 |
protocol_client.InferInput("reference_text", [1, 1], "BYTES"),
|
308 |
protocol_client.InferInput("target_text", [1, 1], "BYTES"),
|
309 |
]
|
|
|
322 |
|
323 |
sequence_id = 100000000 + i + task_id * 10
|
324 |
start = time.time()
|
325 |
+
response = await triton_client.infer(
|
326 |
+
model_name, inputs, request_id=str(sequence_id), outputs=outputs
|
327 |
+
)
|
328 |
|
329 |
audio = response.as_numpy("waveform").reshape(-1)
|
330 |
|
331 |
end = time.time() - start
|
332 |
|
333 |
+
audio_save_path = os.path.join(
|
334 |
+
audio_save_dir, f"{item['target_audio_path']}.wav"
|
335 |
+
)
|
336 |
sf.write(audio_save_path, audio, save_sample_rate, "PCM_16")
|
337 |
|
338 |
actual_duration = len(audio) / save_sample_rate
|
|
|
366 |
def split_data(data, k):
|
367 |
n = len(data)
|
368 |
if n < k:
|
369 |
+
print(
|
370 |
+
f"Warning: the length of the input list ({n}) is less than k ({k}). Setting k to {n}."
|
371 |
+
)
|
372 |
k = n
|
373 |
|
374 |
quotient = n // k
|
|
|
488 |
stats = await triton_client.get_inference_statistics(model_name="", as_json=True)
|
489 |
write_triton_stats(stats, f"{args.log_dir}/stats_summary-{name}.txt")
|
490 |
|
491 |
+
metadata = await triton_client.get_model_config(
|
492 |
+
model_name=args.model_name, as_json=True
|
493 |
+
)
|
494 |
with open(f"{args.log_dir}/model_config-{name}.json", "w") as f:
|
495 |
json.dump(metadata, f, indent=4)
|
496 |
|
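The stats writer in client_grpc.py above mostly gained line wrapping; the bookkeeping it performs is converting Triton's cumulative nanosecond counters into seconds before logging. A self-contained sketch with a fabricated stats dict shaped like the fields the script reads:

```python
def summarize_model_stats(model_inference_stats):
    """Convert Triton's cumulative ns counters to seconds (fields as used above)."""
    to_s = lambda ns: int(ns) / 1e9
    return {
        "queue_s": to_s(model_inference_stats["queue"]["ns"]),
        "infer_s": to_s(model_inference_stats["compute_infer"]["ns"]),
        "input_s": to_s(model_inference_stats["compute_input"]["ns"]),
        "output_s": to_s(model_inference_stats["compute_output"]["ns"]),
    }

print(summarize_model_stats({
    "queue": {"ns": "1500000000"}, "compute_infer": {"ns": "750000000"},
    "compute_input": {"ns": "20000000"}, "compute_output": {"ns": "30000000"},
}))  # {'queue_s': 1.5, 'infer_s': 0.75, 'input_s': 0.02, 'output_s': 0.03}
```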
f5_tts/runtime/triton_trtllm/client_http.py
CHANGED
@@ -31,7 +31,9 @@ import soundfile as sf
|
|
31 |
|
32 |
|
33 |
def get_args():
|
34 |
-
parser = argparse.ArgumentParser(
|
|
|
|
|
35 |
|
36 |
parser.add_argument(
|
37 |
"--server-url",
|
@@ -91,15 +93,30 @@ def prepare_request(
|
|
91 |
|
92 |
data = {
|
93 |
"inputs": [
|
94 |
-
{
|
|
|
|
|
|
|
|
|
|
|
95 |
{
|
96 |
"name": "reference_wav_len",
|
97 |
"shape": lengths.shape,
|
98 |
"datatype": "INT32",
|
99 |
"data": lengths.tolist(),
|
100 |
},
|
101 |
-
{
|
102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
]
|
104 |
}
|
105 |
|
@@ -135,7 +152,11 @@ if __name__ == "__main__":
|
|
135 |
data = prepare_request(samples, args.reference_text, args.target_text)
|
136 |
|
137 |
rsp = requests.post(
|
138 |
-
url,
|
|
|
|
|
|
|
|
|
139 |
)
|
140 |
result = rsp.json()
|
141 |
audio = result["outputs"][0]["data"]
|
|
|
31 |
|
32 |
|
33 |
def get_args():
|
34 |
+
parser = argparse.ArgumentParser(
|
35 |
+
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
36 |
+
)
|
37 |
|
38 |
parser.add_argument(
|
39 |
"--server-url",
|
|
|
93 |
|
94 |
data = {
|
95 |
"inputs": [
|
96 |
+
{
|
97 |
+
"name": "reference_wav",
|
98 |
+
"shape": samples.shape,
|
99 |
+
"datatype": "FP32",
|
100 |
+
"data": samples.tolist(),
|
101 |
+
},
|
102 |
{
|
103 |
"name": "reference_wav_len",
|
104 |
"shape": lengths.shape,
|
105 |
"datatype": "INT32",
|
106 |
"data": lengths.tolist(),
|
107 |
},
|
108 |
+
{
|
109 |
+
"name": "reference_text",
|
110 |
+
"shape": [1, 1],
|
111 |
+
"datatype": "BYTES",
|
112 |
+
"data": [reference_text],
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"name": "target_text",
|
116 |
+
"shape": [1, 1],
|
117 |
+
"datatype": "BYTES",
|
118 |
+
"data": [target_text],
|
119 |
+
},
|
120 |
]
|
121 |
}
|
122 |
|
|
|
152 |
data = prepare_request(samples, args.reference_text, args.target_text)
|
153 |
|
154 |
rsp = requests.post(
|
155 |
+
url,
|
156 |
+
headers={"Content-Type": "application/json"},
|
157 |
+
json=data,
|
158 |
+
verify=False,
|
159 |
+
params={"request_id": "0"},
|
160 |
)
|
161 |
result = rsp.json()
|
162 |
audio = result["outputs"][0]["data"]
|
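client_http.py above posts a KServe-style JSON body with four input tensors (FP32 waveform, INT32 length, and two BYTES strings). A sketch of just the payload construction, using a short dummy waveform in place of real reference audio:

```python
import numpy as np

def build_payload(samples, reference_text, target_text):
    """Assemble the Triton HTTP inference body with the four inputs used above."""
    lengths = np.array([[samples.shape[-1]]], dtype=np.int32)
    return {
        "inputs": [
            {"name": "reference_wav", "shape": list(samples.shape), "datatype": "FP32",
             "data": samples.tolist()},
            {"name": "reference_wav_len", "shape": list(lengths.shape), "datatype": "INT32",
             "data": lengths.tolist()},
            {"name": "reference_text", "shape": [1, 1], "datatype": "BYTES", "data": [reference_text]},
            {"name": "target_text", "shape": [1, 1], "datatype": "BYTES", "data": [target_text]},
        ]
    }

payload = build_payload(np.zeros((1, 16), dtype=np.float32), "ref text", "target text")
print(len(payload["inputs"]))  # 4
```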
f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/f5_tts_trtllm.py
CHANGED
@@ -17,7 +17,9 @@ from tensorrt_llm.runtime.session import Session
 def remove_tensor_padding(input_tensor, input_tensor_lengths=None):
     # Audio tensor case: batch, seq_len, feature_len
     # position_ids case: batch, seq_len
-    assert
+    assert (
+        input_tensor_lengths is not None
+    ), "input_tensor_lengths must be provided for 3D input_tensor"

     # Initialize a list to collect valid sequences
     valid_sequences = []
@@ -32,11 +34,29 @@ def remove_tensor_padding(input_tensor, input_tensor_lengths=None):


 class TextEmbedding(nn.Module):
-    def __init__(
+    def __init__(
+        self,
+        text_num_embeds,
+        text_dim,
+        conv_layers=0,
+        conv_mult=2,
+        precompute_max_pos=4096,
+    ):
         super().__init__()
-        self.text_embed = nn.Embedding(
+        self.text_embed = nn.Embedding(
+            text_num_embeds + 1, text_dim
+        )  # use 0 as filler token
+        self.register_buffer(
+            "freqs_cis",
+            precompute_freqs_cis(text_dim, precompute_max_pos),
+            persistent=False,
+        )
+        self.text_blocks = nn.Sequential(
+            *[
+                ConvNeXtV2Block(text_dim, text_dim * conv_mult)
+                for _ in range(conv_layers)
+            ]
+        )

     def forward(self, text):
         # only keep tensors with value not -1
@@ -80,7 +100,9 @@ class ConvNeXtV2Block(nn.Module):
             dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
         )  # depthwise conv
         self.norm = nn.LayerNorm(dim, eps=1e-6)
-        self.pwconv1 = nn.Linear(
+        self.pwconv1 = nn.Linear(
+            dim, intermediate_dim
+        )  # pointwise/1x1 convs, implemented with linear layers
         self.act = nn.GELU()
         self.grn = GRN(intermediate_dim)
         self.pwconv2 = nn.Linear(intermediate_dim, dim)
@@ -98,7 +120,9 @@ class ConvNeXtV2Block(nn.Module):
         return residual + x


-def precompute_freqs_cis(
+def precompute_freqs_cis(
+    dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0
+):
     # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
     # has some connection to NTK literature
     # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
@@ -125,7 +149,9 @@ def load_checkpoint(ckpt_path, use_ema=True):
     for key in dict_state.keys():
         # transformer.text_embed.text_embed.weight -> text_embed.weight
         if "text_embed" in key:
-            text_embed_dict[key.replace("transformer.text_embed.", "")] = dict_state[
+            text_embed_dict[key.replace("transformer.text_embed.", "")] = dict_state[
+                key
+            ]
     return text_embed_dict

@@ -148,7 +174,12 @@ class F5TTS(object):
         pp_size = config["pretrained_config"]["mapping"]["pp_size"]
         assert pp_size == 1
         self.mapping = tensorrt_llm.Mapping(
-            world_size=world_size,
+            world_size=world_size,
+            rank=rank,
+            cp_size=cp_size,
+            tp_size=tp_size,
+            pp_size=1,
+            gpus_per_node=1,
         )

         local_rank = rank % self.mapping.gpus_per_node
@@ -176,10 +207,23 @@ class F5TTS(object):
         self.outputs = {}
         self.buffer_allocated = False

-        expected_tensor_names = [
+        expected_tensor_names = [
+            "noise",
+            "cond",
+            "time",
+            "rope_cos",
+            "rope_sin",
+            "input_lengths",
+            "denoised",
+        ]
+
+        found_tensor_names = [
+            self.session.engine.get_tensor_name(i)
+            for i in range(self.session.engine.num_io_tensors)
+        ]
+        if not self.debug_mode and set(expected_tensor_names) != set(
+            found_tensor_names
+        ):
             logger.error(
                 f"The following expected tensors are not found: {set(expected_tensor_names).difference(set(found_tensor_names))}"
             )
@@ -190,11 +234,16 @@ class F5TTS(object):
             logger.error(f"Found tensor names: {found_tensor_names}")
             raise RuntimeError("Tensor names in engine are not the same as expected.")
         if self.debug_mode:
-            self.debug_tensors = list(
+            self.debug_tensors = list(
+                set(found_tensor_names) - set(expected_tensor_names)
+            )

         self.max_mel_len = 4096
         self.text_embedding = TextEmbedding(
-            text_num_embeds=vocab_size,
+            text_num_embeds=vocab_size,
+            text_dim=512,
+            conv_layers=4,
+            precompute_max_pos=self.max_mel_len,
         ).to(self.device)
         self.text_embedding.load_state_dict(load_checkpoint(model_path), strict=True)
@@ -208,9 +257,16 @@ class F5TTS(object):
         self.head_dim = 64
         self.base_rescale_factor = 1.0
         self.interpolation_factor = 1.0
-        base = 10000.0 * self.base_rescale_factor ** (
+        base = 10000.0 * self.base_rescale_factor ** (
+            self.head_dim / (self.head_dim - 2)
+        )
+        inv_freq = 1.0 / (
+            base ** (torch.arange(0, self.head_dim, 2).float() / self.head_dim)
+        )
+        freqs = (
+            torch.outer(torch.arange(self.max_mel_len, dtype=torch.float32), inv_freq)
+            / self.interpolation_factor
+        )
         self.freqs = freqs.repeat_interleave(2, dim=-1).unsqueeze(0)
         self.rope_cos = self.freqs.cos().half()
         self.rope_sin = self.freqs.sin().half()
@@ -223,7 +279,9 @@ class F5TTS(object):
         time_expand = torch.zeros((1, self.nfe_steps, tmp_dim), dtype=torch.float32)
         half_dim = tmp_dim // 2
         emb_factor = math.log(10000) / (half_dim - 1)
-        emb_factor = 1000.0 * torch.exp(
+        emb_factor = 1000.0 * torch.exp(
+            torch.arange(half_dim, dtype=torch.float32) * -emb_factor
+        )
         for i in range(self.nfe_steps):
             emb = time_step[i] * emb_factor
             time_expand[:, i, :] = torch.cat((emb.sin(), emb.cos()), dim=-1)
@@ -242,7 +300,9 @@ class F5TTS(object):
             shape = list(self.session.engine.get_tensor_shape(name))
             shape[0] = batch_size
             shape[1] = seq_len
-            self.outputs[name] = torch.empty(
+            self.outputs[name] = torch.empty(
+                shape, dtype=self._tensor_dtype(name), device=self.device
+            )

         self.buffer_allocated = True
@@ -356,17 +416,29 @@ class F5TTS(object):
         max_seq_len = ref_mel_batch.shape[1]

         text_pad_sequence_drop = torch.cat(
-            (
+            (
+                text_pad_sequence,
+                torch.zeros((1, text_pad_sequence.shape[1]), dtype=torch.int32).to(
+                    self.device
+                ),
+            ),
+            dim=0,
         )

         text_embedding_drop_list = []
         for i in range(batch + 1):
-            text_embedding_drop_list.append(
+            text_embedding_drop_list.append(
+                self.text_embedding(
+                    text_pad_sequence_drop[i].unsqueeze(0).to(self.device)
+                )
+            )
         text_embedding_drop_condition = torch.cat(text_embedding_drop_list, dim=0)

         text_embedding = text_embedding_drop_condition[:-1]
         # text_embedding_drop B,T,C batch should be the same
-        text_embedding_drop =
+        text_embedding_drop = (
+            text_embedding_drop_condition[-1].unsqueeze(0).repeat(batch, 1, 1)
+        )

         noise = torch.randn_like(ref_mel_batch).to(self.device)
         rope_cos = self.rope_cos[:, :max_seq_len, :].float().repeat(batch, 1, 1)
@@ -375,7 +447,9 @@ class F5TTS(object):
         cat_mel_text = torch.cat((ref_mel_batch, text_embedding), dim=-1)
         cat_mel_text_drop = torch.cat(
             (
-                torch.zeros(
+                torch.zeros(
+                    (batch, max_seq_len, self.n_mel_channels), dtype=torch.float32
+                ).to(self.device),
                 text_embedding_drop,
             ),
             dim=-1,
@@ -384,7 +458,9 @@ class F5TTS(object):
         time_expand = self.time_expand.repeat(2 * batch, 1, 1).contiguous()

         # Convert estimated_reference_target_mel_len to tensor
-        input_lengths = torch.tensor(
+        input_lengths = torch.tensor(
+            estimated_reference_target_mel_len, dtype=torch.int32
+        )

         # combine above along the batch dimension
         inputs = {
@@ -393,20 +469,34 @@ class F5TTS(object):
             "time_expand": time_expand,
             "rope_cos": torch.cat((rope_cos, rope_cos), dim=0).contiguous(),
             "rope_sin": torch.cat((rope_sin, rope_sin), dim=0).contiguous(),
-            "input_lengths": torch.cat(
+            "input_lengths": torch.cat(
+                (input_lengths, input_lengths), dim=0
+            ).contiguous(),
             "delta_t": self.delta_t,
         }
         if use_perf and remove_input_padding:
             torch.cuda.nvtx.range_push("remove input padding")
         if remove_input_padding:
             max_seq_len = inputs["cond"].shape[1]
-            inputs["noise"] = remove_tensor_padding(
+            inputs["noise"] = remove_tensor_padding(
+                inputs["noise"], inputs["input_lengths"]
+            )
+            inputs["cond"] = remove_tensor_padding(
+                inputs["cond"], inputs["input_lengths"]
+            )
             # for time_expand, convert from B,D to B,T,D by repeat
-            inputs["time_expand"] =
-            inputs["
+            inputs["time_expand"] = (
+                inputs["time_expand"].unsqueeze(1).repeat(1, max_seq_len, 1, 1)
+            )
+            inputs["time_expand"] = remove_tensor_padding(
+                inputs["time_expand"], inputs["input_lengths"]
+            )
+            inputs["rope_cos"] = remove_tensor_padding(
+                inputs["rope_cos"], inputs["input_lengths"]
+            )
+            inputs["rope_sin"] = remove_tensor_padding(
+                inputs["rope_sin"], inputs["input_lengths"]
+            )
         if use_perf and remove_input_padding:
             torch.cuda.nvtx.range_pop()
         for key in inputs:
@@ -422,7 +512,9 @@ class F5TTS(object):
             denoised_list = []
             start_idx = 0
             for i in range(batch):
-                denoised_list.append(
+                denoised_list.append(
+                    denoised[start_idx : start_idx + inputs["input_lengths"][i]]
+                )
                 start_idx += inputs["input_lengths"][i]
             if use_perf and remove_input_padding:
                 torch.cuda.nvtx.range_pop()
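The rotary tables built in F5TTS.__init__ above reduce to a few lines of plain PyTorch. A minimal sketch, assuming the same constants as in the diff (head_dim=64, max_mel_len=4096, rescale and interpolation factors of 1.0):

import torch

head_dim, max_mel_len = 64, 4096
base_rescale_factor, interpolation_factor = 1.0, 1.0

# NTK-style rescaled base, then standard rotary inverse frequencies
base = 10000.0 * base_rescale_factor ** (head_dim / (head_dim - 2))
inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(max_mel_len, dtype=torch.float32), inv_freq) / interpolation_factor

# duplicate each frequency so cos/sin tables match the full head dimension
freqs = freqs.repeat_interleave(2, dim=-1).unsqueeze(0)   # (1, max_mel_len, head_dim)
rope_cos, rope_sin = freqs.cos().half(), freqs.sin().half()
print(rope_cos.shape)  # torch.Size([1, 4096, 64])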
f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py
CHANGED
@@ -73,7 +73,9 @@ def convert_char_to_pinyin(reference_target_texts_list, polyphone=True):
            if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
                char_list.append(" ")
            char_list.extend(seg)
-        elif polyphone and seg_byte_len == 3 * len(
+        elif polyphone and seg_byte_len == 3 * len(
+            seg
+        ):  # if pure east asian characters
            seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
            for i, c in enumerate(seg):
                if is_chinese(c):
@@ -85,7 +87,9 @@ def convert_char_to_pinyin(reference_target_texts_list, polyphone=True):
                    char_list.extend(c)
                elif is_chinese(c):
                    char_list.append(" ")
-                    char_list.extend(
+                    char_list.extend(
+                        lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True)
+                    )
                else:
                    char_list.append(c)
    final_reference_target_texts_list.append(char_list)
@@ -98,7 +102,9 @@ def list_str_to_idx(
    vocab_char_map: dict[str, int],  # {char: idx}
    padding_value=-1,
):  # noqa: F722
-    list_idx_tensors = [
+    list_idx_tensors = [
+        torch.tensor([vocab_char_map.get(c, 0) for c in t]) for t in text
+    ]  # pinyin or char style
    return list_idx_tensors

@@ -121,7 +127,9 @@ class TritonPythonModel:

        self.vocab_char_map, self.vocab_size = get_tokenizer(parameters["vocab_file"])
        self.reference_sample_rate = int(parameters["reference_audio_sample_rate"])
-        self.resampler = torchaudio.transforms.Resample(
+        self.resampler = torchaudio.transforms.Resample(
+            self.reference_sample_rate, self.target_audio_sample_rate
+        )

        self.tllm_model_dir = parameters["tllm_model_dir"]
        config_file = os.path.join(self.tllm_model_dir, "config.json")
@@ -163,13 +171,17 @@ class TritonPythonModel:
        input_tensor_0 = pb_utils.Tensor.from_dlpack("mel", to_dlpack(mel))

        inference_request = pb_utils.InferenceRequest(
-            model_name="vocoder",
+            model_name="vocoder",
+            requested_output_names=["waveform"],
+            inputs=[input_tensor_0],
        )
        inference_response = inference_request.exec()
        if inference_response.has_error():
            raise pb_utils.TritonModelException(inference_response.error().message())
        else:
-            waveform = pb_utils.get_output_tensor_by_name(
+            waveform = pb_utils.get_output_tensor_by_name(
+                inference_response, "waveform"
+            )
            waveform = torch.utils.dlpack.from_dlpack(waveform.to_dlpack()).cpu()

        return waveform
@@ -181,7 +193,13 @@ class TritonPythonModel:
            reference_target_texts_list,
            estimated_reference_target_mel_len,
            reference_mel_len,
-        ) =
+        ) = (
+            [],
+            [],
+            [],
+            [],
+            [],
+        )
        mel_features_list = []
        if self.use_perf:
            torch.cuda.nvtx.range_push("preprocess")
@@ -189,10 +207,14 @@ class TritonPythonModel:
            wav_tensor = pb_utils.get_input_tensor_by_name(request, "reference_wav")
            wav_lens = pb_utils.get_input_tensor_by_name(request, "reference_wav_len")

-            reference_text = pb_utils.get_input_tensor_by_name(
+            reference_text = pb_utils.get_input_tensor_by_name(
+                request, "reference_text"
+            ).as_numpy()
            reference_text = reference_text[0][0].decode("utf-8")
            reference_text_list.append(reference_text)
-            target_text = pb_utils.get_input_tensor_by_name(
+            target_text = pb_utils.get_input_tensor_by_name(
+                request, "target_text"
+            ).as_numpy()
            target_text = target_text[0][0].decode("utf-8")
            target_text_list.append(target_text)
@@ -221,30 +243,49 @@ class TritonPythonModel:
            reference_mel_len.append(mel_features.shape[1])
            estimated_reference_target_mel_len.append(
                int(
-                    mel_features.shape[1]
+                    mel_features.shape[1]
+                    * (
+                        1
+                        + len(target_text.encode("utf-8"))
+                        / len(reference_text.encode("utf-8"))
+                    )
                )
            )

        max_seq_len = min(max(estimated_reference_target_mel_len), self.max_mel_len)

        batch = len(requests)
-        mel_features = torch.zeros(
+        mel_features = torch.zeros(
+            (batch, max_seq_len, self.n_mel_channels), dtype=torch.float16
+        ).to(self.device)
        for i, mel in enumerate(mel_features_list):
            mel_features[i, : mel.shape[1], :] = mel

        reference_mel_len_tensor = torch.LongTensor(reference_mel_len).to(self.device)

-        pinyin_list = convert_char_to_pinyin(
+        pinyin_list = convert_char_to_pinyin(
+            reference_target_texts_list, polyphone=True
+        )
        text_pad_sequence = list_str_to_idx(pinyin_list, self.vocab_char_map)

        for i, item in enumerate(text_pad_sequence):
            text_pad_sequence[i] = F.pad(
-                item,
+                item,
+                (0, estimated_reference_target_mel_len[i] - len(item)),
+                mode="constant",
+                value=-1,
            )
-            text_pad_sequence[
+            text_pad_sequence[
+                i
+            ] += 1  # WAR: 0 is reserved for padding token, hard coding in F5-TTS
+        text_pad_sequence = pad_sequence(
+            text_pad_sequence, padding_value=-1, batch_first=True
+        ).to(self.device)
        text_pad_sequence = F.pad(
-            text_pad_sequence,
+            text_pad_sequence,
+            (0, max_seq_len - text_pad_sequence.shape[1]),
+            mode="constant",
+            value=-1,
        )
        if self.use_perf:
            torch.cuda.nvtx.range_pop()
@@ -264,7 +305,11 @@ class TritonPythonModel:
        for i in range(batch):
            ref_me_len = reference_mel_len[i]
            estimated_mel_len = estimated_reference_target_mel_len[i]
-            denoised_one_item =
+            denoised_one_item = (
+                denoised[i, ref_me_len:estimated_mel_len, :]
+                .unsqueeze(0)
+                .transpose(1, 2)
+            )
            audio = self.forward_vocoder(denoised_one_item)
            rms = torch.sqrt(torch.mean(torch.square(audio)))
            if rms < self.target_rms:
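The target duration used above is a simple byte-length heuristic: the reference mel length is scaled by one plus the UTF-8 byte ratio of target to reference text. A minimal sketch of that calculation:

# Sketch of the duration heuristic from TritonPythonModel.execute above.
def estimate_target_mel_len(ref_mel_len: int, reference_text: str, target_text: str) -> int:
    ratio = len(target_text.encode("utf-8")) / len(reference_text.encode("utf-8"))
    return int(ref_mel_len * (1 + ratio))

print(estimate_target_mel_len(250, "short prompt", "a noticeably longer sentence"))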
f5_tts/runtime/triton_trtllm/patch/__init__.py
CHANGED
@@ -13,14 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .baichuan.model import BaichuanForCausalLM
-from .bert.model import (
-    RobertaForQuestionAnswering,
-    RobertaForSequenceClassification,
-    RobertaModel,
-)
+from .bert.model import (BertForQuestionAnswering,
+                         BertForSequenceClassification, BertModel,
+                         RobertaForQuestionAnswering,
+                         RobertaForSequenceClassification, RobertaModel)
 from .bloom.model import BloomForCausalLM, BloomModel
 from .chatglm.config import ChatGLMConfig
 from .chatglm.model import ChatGLMForCausalLM, ChatGLMModel
@@ -51,17 +47,17 @@ from .mamba.model import MambaForCausalLM
 from .medusa.config import MedusaConfig
 from .medusa.model import MedusaForCausalLm
 from .mllama.model import MLLaMAModel
-from .modeling_utils import PretrainedConfig, PretrainedModel,
+from .modeling_utils import (PretrainedConfig, PretrainedModel,
+                             SpeculativeDecodingMode)
 from .mpt.model import MPTForCausalLM, MPTModel
 from .nemotron_nas.model import DeciLMForCausalLM
 from .opt.model import OPTForCausalLM, OPTModel
-from .phi.model import PhiForCausalLM, PhiModel
 from .phi3.model import Phi3ForCausalLM, Phi3Model
+from .phi.model import PhiForCausalLM, PhiModel
 from .qwen.model import QWenForCausalLM
 from .recurrentgemma.model import RecurrentGemmaForCausalLM
 from .redrafter.model import ReDrafterForCausalLM

-
 __all__ = [
     "BertModel",
     "BertForQuestionAnswering",
f5_tts/runtime/triton_trtllm/patch/f5tts/model.py
CHANGED
@@ -13,8 +13,8 @@ from ...layers import Linear
 from ...module import Module, ModuleList
 from ...plugin import current_all_reduce_helper
 from ..modeling_utils import PretrainedConfig, PretrainedModel
-from .modules import AdaLayerNormZero_Final, ConvPositionEmbedding, DiTBlock,
+from .modules import (AdaLayerNormZero_Final, ConvPositionEmbedding, DiTBlock,
+                      TimestepEmbedding)

 current_file_path = os.path.abspath(__file__)
 parent_dir = os.path.dirname(current_file_path)
@@ -38,7 +38,9 @@ class F5TTS(PretrainedModel):
         self.dtype = str_dtype_to_trt(config.dtype)

         self.time_embed = TimestepEmbedding(config.hidden_size)
-        self.input_embed = InputEmbedding(
+        self.input_embed = InputEmbedding(
+            config.mel_dim, config.text_dim, config.hidden_size
+        )

         self.dim = config.hidden_size
         self.depth = config.num_hidden_layers
@@ -71,7 +73,14 @@ class F5TTS(PretrainedModel):
         t = self.time_embed(time)
         x = self.input_embed(noise, cond)
         for block in self.transformer_blocks:
-            x = block(
+            x = block(
+                x,
+                t,
+                rope_cos=rope_cos,
+                rope_sin=rope_sin,
+                input_lengths=input_lengths,
+                scale=scale,
+            )
         denoise = self.proj_out(self.norm_out(x, t))
         denoise.mark_output("denoised", self.dtype)
         return denoise
f5_tts/runtime/triton_trtllm/patch/f5tts/modules.py
CHANGED
@@ -9,28 +9,10 @@ import torch.nn.functional as F
 from tensorrt_llm._common import default_net

 from ..._utils import str_dtype_to_trt, trt_dtype_to_np
-from ...functional import (
-    chunk,
-    concat,
-    constant,
-    expand,
-    expand_dims,
-    expand_dims_like,
-    expand_mask,
-    gelu,
-    matmul,
-    permute,
-    shape,
-    silu,
-    slice,
-    softmax,
-    squeeze,
-    unsqueeze,
-    view,
-)
+from ...functional import (Tensor, bert_attention, cast, chunk, concat,
+                           constant, expand, expand_dims, expand_dims_like,
+                           expand_mask, gelu, matmul, permute, shape, silu,
+                           slice, softmax, squeeze, unsqueeze, view)
 from ...layers import ColumnLinear, Conv1d, LayerNorm, Linear, Mish, RowLinear
 from ...module import Module

@@ -57,7 +39,9 @@ class AdaLayerNormZero(Module):

     def forward(self, x, emb=None):
         emb = self.linear(silu(emb))
-        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = chunk(
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = chunk(
+            emb, 6, dim=1
+        )
         x = self.norm(x)
         ones = constant(np.ones(1, dtype=np.float32)).cast(x.dtype)
         if default_net().plugin_config.remove_input_padding:
@@ -91,8 +75,12 @@ class ConvPositionEmbedding(Module):
     def __init__(self, dim, kernel_size=31, groups=16):
         super().__init__()
         assert kernel_size % 2 != 0
-        self.conv1d1 = Conv1d(
+        self.conv1d1 = Conv1d(
+            dim, dim, kernel_size, groups=groups, padding=kernel_size // 2
+        )
+        self.conv1d2 = Conv1d(
+            dim, dim, kernel_size, groups=groups, padding=kernel_size // 2
+        )
         self.mish = Mish()

     def forward(self, x, mask=None):  # noqa: F722
@@ -120,7 +108,9 @@ class Attention(Module):
         super().__init__()

         if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError(
+            raise ImportError(
+                "Attention equires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )

         self.processor = processor

@@ -191,16 +181,32 @@ class Attention(Module):
         c_rope=None,  # rotary position embedding for c
     ) -> torch.Tensor:
         if c is not None:
-            return self.processor(
+            return self.processor(
+                self,
+                x,
+                c=c,
+                input_lengths=input_lengths,
+                scale=scale,
+                rope=rope,
+                c_rope=c_rope,
+            )
         else:
             return self.processor(
-                self,
+                self,
+                x,
+                rope_cos=rope_cos,
+                rope_sin=rope_sin,
+                input_lengths=input_lengths,
+                scale=scale,
             )


 def rotate_every_two_3dim(tensor: Tensor) -> Tensor:
     shape_tensor = concat(
-        [
+        [
+            shape(tensor, i) / 2 if i == (tensor.ndim() - 1) else shape(tensor, i)
+            for i in range(tensor.ndim())
+        ]
     )
     if default_net().plugin_config.remove_input_padding:
         assert tensor.ndim() == 2
@@ -208,7 +214,9 @@ def rotate_every_two_3dim(tensor: Tensor) -> Tensor:
         x2 = slice(tensor, [0, 1], shape_tensor, [1, 2])
         x1 = expand_dims(x1, 2)
         x2 = expand_dims(x2, 2)
-        zero = constant(
+        zero = constant(
+            np.ascontiguousarray(np.zeros([1], dtype=trt_dtype_to_np(tensor.dtype)))
+        )
         x2 = zero - x2
         x = concat([x2, x1], 2)
         out = view(x, concat([shape(x, 0), shape(x, 1) * 2]))
@@ -219,7 +227,9 @@ def rotate_every_two_3dim(tensor: Tensor) -> Tensor:
         x2 = slice(tensor, [0, 0, 1], shape_tensor, [1, 1, 2])
         x1 = expand_dims(x1, 3)
         x2 = expand_dims(x2, 3)
-        zero = constant(
+        zero = constant(
+            np.ascontiguousarray(np.zeros([1], dtype=trt_dtype_to_np(tensor.dtype)))
+        )
         x2 = zero - x2
         x = concat([x2, x1], 3)
         out = view(x, concat([shape(x, 0), shape(x, 1), shape(x, 2) * 2]))
@@ -235,15 +245,23 @@ def apply_rotary_pos_emb_3dim(x, rope_cos, rope_sin):
         end_dim = shape(x, -1) - shape(rope_cos, -1)
         new_t_unrotated_shape = concat([shape(x, 0), end_dim])  # (2, -1, 960)
         x_unrotated = slice(x, concat([0, rot_dim]), new_t_unrotated_shape, [1, 1])
-        out = concat(
+        out = concat(
+            [x_ * rope_cos + rotate_every_two_3dim(x_) * rope_sin, x_unrotated], dim=-1
+        )
     else:
         rot_dim = shape(rope_cos, 2)  # 64
         new_t_shape = concat([shape(x, 0), shape(x, 1), rot_dim])  # (2, -1, 64)
         x_ = slice(x, [0, 0, 0], new_t_shape, [1, 1, 1])
         end_dim = shape(x, 2) - shape(rope_cos, 2)
-        new_t_unrotated_shape = concat(
+        new_t_unrotated_shape = concat(
+            [shape(x, 0), shape(x, 1), end_dim]
+        )  # (2, -1, 960)
+        x_unrotated = slice(
+            x, concat([0, 0, rot_dim]), new_t_unrotated_shape, [1, 1, 1]
+        )
+        out = concat(
+            [x_ * rope_cos + rotate_every_two_3dim(x_) * rope_sin, x_unrotated], dim=-1
+        )
     return out

@@ -279,8 +297,12 @@ class AttnProcessor:
         seq_len_2d = concat([1, N])
         max_position_embeddings = 4096
         # create position ids
-        position_ids_buffer = constant(
+        position_ids_buffer = constant(
+            np.expand_dims(np.arange(max_position_embeddings).astype(np.int32), 0)
+        )
+        tmp_position_ids = slice(
+            position_ids_buffer, starts=[0, 0], sizes=seq_len_2d
+        )
         tmp_position_ids = expand(tmp_position_ids, concat([B, N]))  # BxL
         tmp_input_lengths = unsqueeze(input_lengths, 1)  # Bx1
         tmp_input_lengths = expand(tmp_input_lengths, concat([B, N]))  # BxL
@@ -315,14 +337,28 @@ class AttnProcessor:
             assert not default_net().plugin_config.remove_input_padding

             def transpose_for_scores(x):
-                new_x_shape = concat(
+                new_x_shape = concat(
+                    [
+                        shape(x, 0),
+                        shape(x, 1),
+                        attn.num_attention_heads,
+                        attn.attention_head_size,
+                    ]
+                )

                 y = x.view(new_x_shape)
                 y = y.transpose(1, 2)
                 return y

             def transpose_for_scores_k(x):
-                new_x_shape = concat(
+                new_x_shape = concat(
+                    [
+                        shape(x, 0),
+                        shape(x, 1),
+                        attn.num_attention_heads,
+                        attn.attention_head_size,
+                    ]
+                )

                 y = x.view(new_x_shape)
                 y = y.permute([0, 2, 3, 1])
@@ -342,7 +378,11 @@ class AttnProcessor:
             attention_probs = softmax(attention_scores, dim=-1)

             context = matmul(attention_probs, value, use_fp32_acc=False).transpose(1, 2)
-            context = context.view(
+            context = context.view(
+                concat(
+                    [shape(context, 0), shape(context, 1), attn.attention_hidden_size]
+                )
+            )
             context = attn.to_out(context)
             if mask is not None:
                 mask = mask.view(concat([shape(mask, 0), shape(mask, 1), 1]))
@@ -370,13 +410,26 @@ class DiTBlock(Module):
         self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout)

     def forward(
-        self,
+        self,
+        x,
+        t,
+        rope_cos,
+        rope_sin,
+        input_lengths,
+        scale=1.0,
+        rope=ModuleNotFoundError,
     ):  # x: noised input, t: time embedding
         # pre-norm & modulation for attention input
         norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)
         # attention
         # norm ----> (2,1226,1024)
-        attn_output = self.attn(
+        attn_output = self.attn(
+            x=norm,
+            rope_cos=rope_cos,
+            rope_sin=rope_sin,
+            input_lengths=input_lengths,
+            scale=scale,
+        )

         # process attention output for input x
         if default_net().plugin_config.remove_input_padding:
@@ -387,7 +440,9 @@ class DiTBlock(Module):
         if default_net().plugin_config.remove_input_padding:
             norm = self.ff_norm(x) * (ones + scale_mlp) + shift_mlp
         else:
-            norm = self.ff_norm(x) * (ones + unsqueeze(scale_mlp, 1)) + unsqueeze(
+            norm = self.ff_norm(x) * (ones + unsqueeze(scale_mlp, 1)) + unsqueeze(
+                shift_mlp, 1
+            )
         # norm = self.ff_norm(x) * (ones + scale_mlp) + shift_mlp
         ff_output = self.ff(norm)
         if default_net().plugin_config.remove_input_padding:
f5_tts/runtime/triton_trtllm/scripts/conv_stft.py
CHANGED
@@ -40,7 +40,6 @@ import torch as th
 import torch.nn.functional as F
 from scipy.signal import check_COLA, get_window

-
 support_clp_op = None
 if th.__version__ >= "1.7.0":
     from torch.fft import rfft as fft
@@ -124,7 +123,9 @@ class STFT(th.nn.Module):
         ifft_kernel = th.pinverse(fft_kernel)[:, None, :]
         window = get_window(self.win_type, self.win_len)

-        self.perfect_reconstruct = check_COLA(
+        self.perfect_reconstruct = check_COLA(
+            window, self.win_len, self.win_len - self.win_hop
+        )
         window = th.FloatTensor(window)
         if self.mode == "continue":
             left_pad = (self.fft_len - self.win_len) // 2
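check_COLA comes from scipy.signal and verifies that the analysis window and hop satisfy the constant-overlap-add condition needed for perfect reconstruction in an STFT/iSTFT pair. A quick standalone check, with window length and hop chosen here purely for illustration:

from scipy.signal import check_COLA, get_window

win_len, win_hop = 1024, 256                           # assumed parameters, not taken from the diff
window = get_window("hann", win_len)
print(check_COLA(window, win_len, win_len - win_hop))  # True: a Hann window at 75% overlap satisfies COLA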
f5_tts/runtime/triton_trtllm/scripts/convert_checkpoint.py
CHANGED
@@ -179,19 +179,47 @@ def parse_arguments():
     )  # TODO: support F5TTS_v1_Base
     parser.add_argument("--timm_ckpt", type=str, default="./ckpts/model_1200000.pt")
     parser.add_argument(
-        "--output_dir",
+        "--output_dir",
+        type=str,
+        default="./tllm_checkpoint",
+        help="The path to save the TensorRT-LLM checkpoint",
+    )
-    parser.add_argument("--hidden_size", type=int, default=1024, help="The hidden size of DiT")
-    parser.add_argument("--depth", type=int, default=22, help="The number of DiTBlock layers")
-    parser.add_argument("--num_heads", type=int, default=16, help="The number of heads of attention module")
+    parser.add_argument(
+        "--hidden_size", type=int, default=1024, help="The hidden size of DiT"
+    )
+    parser.add_argument(
+        "--depth", type=int, default=22, help="The number of DiTBlock layers"
+    )
+    parser.add_argument(
+        "--num_heads",
+        type=int,
+        default=16,
+        help="The number of heads of attention module",
     )
     parser.add_argument("--cfg_scale", type=float, default=4.0)
-    parser.add_argument("--tp_size", type=int, default=1, help="N-way tensor parallelism size")
-    parser.add_argument("--cp_size", type=int, default=1, help="Context parallelism size")
-    parser.add_argument("--pp_size", type=int, default=1, help="N-way pipeline parallelism size")
-    parser.add_argument("--dtype", type=str, default="float16", choices=["float32", "bfloat16", "float16"])
-    parser.add_argument("--fp8_linear", action="store_true", help="Whether use FP8 for linear layers")
     parser.add_argument(
-        "--
+        "--tp_size", type=int, default=1, help="N-way tensor parallelism size"
+    )
+    parser.add_argument(
+        "--cp_size", type=int, default=1, help="Context parallelism size"
+    )
+    parser.add_argument(
+        "--pp_size", type=int, default=1, help="N-way pipeline parallelism size"
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="float16",
+        choices=["float32", "bfloat16", "float16"],
+    )
+    parser.add_argument(
+        "--fp8_linear", action="store_true", help="Whether use FP8 for linear layers"
+    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=1,
+        help="The number of workers for converting checkpoint in parallel",
     )
     args = parser.parse_args()
     return args
@@ -205,10 +233,15 @@ def convert_timm_dit(args, mapping, dtype="float32"):

     model_params = dict(torch.load(args.timm_ckpt))
     model_params = {
-        k: v
+        k: v
+        for k, v in model_params["ema_model_state_dict"].items()
+        if k.startswith("ema_model.transformer")
     }
     prefix = "ema_model.transformer."
-    model_params = {
+    model_params = {
+        key[len(prefix) :] if key.startswith(prefix) else key: value
+        for key, value in model_params.items()
+    }

     timm_to_trtllm_name = FACEBOOK_DIT_NAME_MAPPING

@@ -223,8 +256,13 @@ def convert_timm_dit(args, mapping, dtype="float32"):

     weights = dict()
     for name, param in model_params.items():
-        if
+        if (
+            name == "input_embed.conv_pos_embed.conv1d.0.weight"
+            or name == "input_embed.conv_pos_embed.conv1d.2.weight"
+        ):
+            weights[get_trtllm_name(name)] = (
+                param.contiguous().to(torch_dtype).unsqueeze(-1)
+            )
         else:
             weights[get_trtllm_name(name)] = param.contiguous().to(torch_dtype)

@@ -239,25 +277,37 @@ def convert_timm_dit(args, mapping, dtype="float32"):
     for k, v in weights.items():
         if re.match("^transformer_blocks.*.attn.to_k.weight$", k):
             weights[k] *= scale_factor
-            weights[k] = split_q_tp(
+            weights[k] = split_q_tp(
+                v, args.num_heads, args.hidden_size, tensor_parallel, mapping.tp_rank
+            )

         elif re.match("^transformer_blocks.*.attn.to_k.bias$", k):
             weights[k] *= scale_factor
-            weights[k] = split_q_bias_tp(
+            weights[k] = split_q_bias_tp(
+                v, args.num_heads, args.hidden_size, tensor_parallel, mapping.tp_rank
+            )

         elif re.match("^transformer_blocks.*.attn.to_q.weight$", k):
-            weights[k] = split_q_tp(
+            weights[k] = split_q_tp(
+                v, args.num_heads, args.hidden_size, tensor_parallel, mapping.tp_rank
+            )
             weights[k] *= scale_factor

         elif re.match("^transformer_blocks.*.attn.to_q.bias$", k):
-            weights[k] = split_q_bias_tp(
+            weights[k] = split_q_bias_tp(
+                v, args.num_heads, args.hidden_size, tensor_parallel, mapping.tp_rank
+            )
             weights[k] *= scale_factor

         elif re.match("^transformer_blocks.*.attn.to_v.weight$", k):
-            weights[k] = split_q_tp(
+            weights[k] = split_q_tp(
+                v, args.num_heads, args.hidden_size, tensor_parallel, mapping.tp_rank
+            )

         elif re.match("^transformer_blocks.*.attn.to_v.bias$", k):
-            weights[k] = split_q_bias_tp(
+            weights[k] = split_q_bias_tp(
+                v, args.num_heads, args.hidden_size, tensor_parallel, mapping.tp_rank
+            )

         elif re.match("^transformer_blocks.*.attn.to_out.weight$", k):
             weights[k] = split_matrix_tp(v, tensor_parallel, mapping.tp_rank, dim=1)
@@ -317,7 +367,9 @@ def covert_and_save(args, rank):

     weights = convert_timm_dit(args, mapping, dtype=args.dtype)

-    safetensors.torch.save_file(
+    safetensors.torch.save_file(
+        weights, os.path.join(args.output_dir, f"rank{rank}.safetensors")
+    )


 def execute(workers, func, args):
@@ -334,7 +386,9 @@ def execute(workers, func, args):
         except Exception as e:
             traceback.print_exc()
             exceptions.append(e)
-    assert
+    assert (
+        len(exceptions) == 0
+    ), "Checkpoint conversion failed, please check error log."


 def main():
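The state-dict handling in convert_timm_dit boils down to keeping only the EMA transformer weights and stripping their prefix before the name mapping is applied. A sketch with a toy dictionary (the key names below are made up for illustration, not taken from a real checkpoint):

prefix = "ema_model.transformer."
state = {
    "ema_model.transformer.time_embed.mlp.0.weight": 1,
    "ema_model.mel_spec.window": 2,  # dropped: not a transformer weight
}

# keep only transformer entries, then strip the prefix
model_params = {k: v for k, v in state.items() if k.startswith("ema_model.transformer")}
model_params = {
    (key[len(prefix):] if key.startswith(prefix) else key): value
    for key, value in model_params.items()
}
print(model_params)  # {'time_embed.mlp.0.weight': 1}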
f5_tts/runtime/triton_trtllm/scripts/export_vocoder_to_onnx.py
CHANGED
@@ -20,12 +20,13 @@ from conv_stft import STFT
 from huggingface_hub import hf_hub_download
 from vocos import Vocos

-
 opset_version = 17


 def get_args():
-    parser = argparse.ArgumentParser(
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
     parser.add_argument(
         "--vocoder",
         type=str,
@@ -108,7 +109,9 @@ def export_VocosVocoder(vocos_vocoder, output_path, verbose):
     print("Exported to {}".format(output_path))


-def load_vocoder(
+def load_vocoder(
+    vocoder_name="vocos", is_local=False, local_path="", device="cpu", hf_cache_dir=None
+):
     if vocoder_name == "vocos":
         # vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
         if is_local:
@@ -118,8 +121,12 @@ def load_vocoder(vocoder_name="vocos", is_local=False, local_path="", device="cp
         else:
             print("Download Vocos from huggingface charactr/vocos-mel-24khz")
             repo_id = "charactr/vocos-mel-24khz"
-            config_path = hf_hub_download(
+            config_path = hf_hub_download(
+                repo_id=repo_id, cache_dir=hf_cache_dir, filename="config.yaml"
+            )
+            model_path = hf_hub_download(
+                repo_id=repo_id, cache_dir=hf_cache_dir, filename="pytorch_model.bin"
+            )
         vocoder = Vocos.from_hparams(config_path)
         state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
         vocoder.load_state_dict(state_dict)
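For reference, the non-local branch of load_vocoder is equivalent to the one-liner mentioned in its commented-out line. A hedged usage sketch (requires network access; the mel tensor below is a dummy input and its layout is an assumption):

import torch
from vocos import Vocos

vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz")
mel = torch.randn(1, 100, 256)   # (batch, n_mels, frames) -- dummy features
with torch.no_grad():
    audio = vocoder.decode(mel)
print(audio.shape)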
f5_tts/runtime/triton_trtllm/scripts/fill_template.py
CHANGED
@@ -29,8 +29,12 @@ if __name__ == "__main__":
         "substitutions",
         help="substitutions to perform, in the format variable_name_1:value_1,variable_name_2:value_2...",
     )
-    parser.add_argument(
+    parser.add_argument(
+        "--in_place", "-i", action="store_true", help="do the operation in-place"
+    )
+    parser.add_argument(
+        "--participant_ids", help="Participant IDs for the model", default=""
+    )
     args = parser.parse_args()

     main(**vars(args))
f5_tts/scripts/count_max_epoch.py
CHANGED
@@ -24,10 +24,14 @@ updates_per_epoch = total_hours / mini_batch_hours

 # result
 epochs = wanted_max_updates / updates_per_epoch
-print(
+print(
+    f"epochs should be set to: {epochs:.0f} ({epochs / grad_accum:.1f} x gd_acum {grad_accum})"
+)
 print(f"progress_bar should show approx. 0/{updates_per_epoch:.0f} updates")
 # print(f" or approx. 0/{steps_per_epoch:.0f} steps")

 # others
 print(f"total {total_hours:.0f} hours")
-print(
+print(
+    f"mini-batch of {mini_batch_frames:.0f} frames, {mini_batch_hours:.2f} hours per mini-batch"
+)