Spaces:

mrfakename
/

DMOSpeech2

Running on Zero

File size: 12,519 Bytes

39d2f14
 
 
 
 
597cecf
 
 
 
 
 
39d2f14
 
597cecf
 
 
39d2f14
597cecf
39d2f14
 
597cecf
 
 
 
 
 
 
 
 
 
 
 
 
 
39d2f14
597cecf
39d2f14
 
 
 
 
 
 
 
597cecf
 
 
 
 
39d2f14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597cecf
39d2f14
 
597cecf
39d2f14
597cecf
39d2f14
 
 
 
 
 
 
 
 
 
597cecf
 
39d2f14
 
 
 
 
597cecf
 
 
 
 
 
 
 
 
 
 
39d2f14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597cecf
39d2f14
 
 
 
 
 
 
597cecf
39d2f14
 
 
 
 
 
 
 
 
 
 
 
 
 
597cecf
 
 
39d2f14
 
 
 
 
 
 
597cecf
 
 
 
 
39d2f14
597cecf
39d2f14
 
 
 
 
 
 
 
 
 
597cecf
39d2f14
597cecf
39d2f14
597cecf
 
 
 
 
 
39d2f14
597cecf
39d2f14
597cecf
39d2f14
 
 
 
 
 
597cecf
39d2f14
 
597cecf
39d2f14
597cecf
 
 
 
39d2f14
597cecf
39d2f14
 
 
597cecf
39d2f14
 
 
 
 
 
 
 
 
 
597cecf
 
39d2f14
597cecf
39d2f14
 
 
 
 
 
 
597cecf
39d2f14
597cecf
 
 
 
 
 
 
39d2f14
 
 
 
 
 
 
 
 
 
597cecf
 
39d2f14
597cecf
39d2f14
 
 
 
 
 
 
597cecf
39d2f14
 
597cecf
39d2f14
597cecf
39d2f14
597cecf
39d2f14
 
597cecf
39d2f14
597cecf
 
 
 
39d2f14
 
597cecf
39d2f14
 
 
 
 
 
 
 
 
597cecf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39d2f14
 
 
597cecf
 
 
 
 
39d2f14
 
 
 
 
 
 
597cecf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39d2f14
 
 
597cecf
39d2f14
 
 
 
 
 
 
 
 
 
 
 
597cecf

from __future__ import annotations

import contextlib
import copy
import os
from pathlib import Path
from random import random
from typing import Callable

import torch
from torch import nn

from f5_tts.model import DiT, UNetT
from f5_tts.model.utils import (default, exists, lens_to_mask, list_str_to_idx,
                                list_str_to_tensor, mask_from_frac_lengths,
                                sample_consecutive_steps, sample_from_list)
from guidance_model import Guidance


class UniModel(nn.Module):
    def __init__(
        self,
        model: DiT,  # teacher model (dit model)
        checkpoint_path: str = "",
        second_time: bool = True,
        use_fp16: bool = True,
        real_guidance_scale: float = 2.0,
        fake_guidance_scale: float = 0.0,
        gen_cls_loss: bool = False,
        sway_coeff: float = -1.0,
        vocab_char_map: dict[str, int] | None = None,
        frac_lengths_mask: tuple[float, float] = (0.7, 1.0),
    ):

        super().__init__()

        if checkpoint_path != "":
            if "model_last.pt" in os.listdir(checkpoint_path):
                latest_checkpoint = "model_last.pt"
            else:
                latest_checkpoint = sorted(
                    [f for f in os.listdir(checkpoint_path) if f.endswith(".pt")],
                    key=lambda x: int("".join(filter(str.isdigit, x))),
                )[-1]
            checkpoint = torch.load(
                f"{checkpoint_path}/{latest_checkpoint}",
                weights_only=True,
                map_location="cpu",
            )

            if "scale" in checkpoint:
                self.scale = checkpoint["scale"]
            else:
                self.scale = 1.0
            print(f"Loaded teacher model with scale: {self.scale}")

            if "step" in checkpoint:
                state = checkpoint["model_state_dict"]
            else:
                checkpoint["model_state_dict"] = {
                    k.replace("ema_model.", ""): v
                    for k, v in checkpoint["ema_model_state_dict"].items()
                    if k not in ["initted", "step"]
                }
                state = checkpoint["model_state_dict"]

            # only load the DiT module
            filtered_state_dict = {
                k.replace("transformer.", ""): v
                for k, v in state.items()
                if k.startswith("transformer.")
            }

            model.load_state_dict(filtered_state_dict, strict=False)
        else:
            self.scale = 1.0

        real_unet = copy.deepcopy(model)
        real_unet.time_embed2 = None

        fake_unet = copy.deepcopy(model)

        # Instantiate Guidance, which internally uses real_unet and fake_unet initialized from the teacher
        self.guidance_model = Guidance(
            real_unet=real_unet,
            fake_unet=fake_unet,
            use_fp16=use_fp16,
            real_guidance_scale=real_guidance_scale,
            fake_guidance_scale=fake_guidance_scale,
            gen_cls_loss=gen_cls_loss,
            sway_coeff=sway_coeff,
        )

        self.feedforward_model = copy.deepcopy(model)  # initialize the student model
        self.feedforward_model.requires_grad_(True)
        self.feedforward_model.time_embed2 = None

        self.vocab_char_map = vocab_char_map
        self.frac_lengths_mask = frac_lengths_mask

        self.second_time = second_time  # fake_unet.time_embed2 is not None

    def forward(
        self,
        inp: float["b n d"],  # mel
        text: int["b nt"] | list[str],
        *,
        lens: int["b"] | None = None,
        student_steps: list[int] = [0, 0.25, 0.5, 0.75],
        update_generator: bool = False,
    ):
        """
        Forward pass that routes to either generator_forward or guidance_forward
        in the Guidance class, depending on the arguments.

        Parameters:
        -----------
        generator_turn: bool
            If True, run the generator forward pass (distribution matching loss, etc.)
        guidance_turn: bool
            If True, run the guidance forward pass (fake loss, cls loss, etc.)
        data_dict: dict
            Input dictionary containing the necessary keys for the forward passes.
            Expected keys may include:
                "inp": Tensor (B, N, D) - input mel or latent
                "text": Tensor or list[str] - text input
                "rand_span_mask": Tensor (B, N) - boolean mask
                "real_data": dict with keys like:
                     "inp", "text", "rand_span_mask"

        Returns:
        --------
        loss_dict: dict[str, Tensor]
            Dictionary of losses.
        log_dict: dict[str, Tensor or float]
            Dictionary of logging tensors or values.
        """

        batch, seq_len, dtype, device = *inp.shape[:2], inp.dtype, inp.device

        # handle text as string
        if isinstance(text, list):
            if exists(self.vocab_char_map):
                text = list_str_to_idx(text, self.vocab_char_map).to(device)
            else:
                text = list_str_to_tensor(text).to(device)
            assert text.shape[0] == batch

        # lens and mask
        if not exists(lens):
            lens = torch.full((batch,), seq_len, device=device)

        mask = lens_to_mask(
            lens, length=seq_len
        )  # useless here, as collate_fn will pad to max length in batch

        # sample from the list of student steps
        time = sample_from_list(student_steps, batch).to(device)
        c_time, p_time = sample_consecutive_steps(student_steps)
        time = torch.ones_like(time) * c_time
        p_time = torch.ones_like(time) * p_time

        frac_lengths = (
            torch.zeros((batch,), device=device)
            .float()
            .uniform_(*self.frac_lengths_mask)
        )
        rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)

        if exists(mask):
            rand_span_mask &= mask

        # # use generated output from previous step as input
        with torch.no_grad():
            x1 = inp
            x0 = torch.randn_like(x1)
            t = p_time.unsqueeze(-1).unsqueeze(-1)
            phi = (1 - t) * x0 + t * x1
            cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1)

            pred = self.feedforward_model(
                x=phi,
                cond=cond,
                text=text,
                time=p_time,
                drop_audio_cond=False,
                drop_text=False,  # make sure the cfg=1
            )  # flow prediction

            # predicted mel spectrogram
            output = phi + (1 - t) * pred
            output[~rand_span_mask] = inp[~rand_span_mask]

        # forward diffusion
        x1 = output
        x0 = torch.randn_like(x1)
        t = time.unsqueeze(-1).unsqueeze(-1)
        phi = (1 - t) * x0 + t * x1
        cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1)

        with torch.no_grad() if not update_generator else contextlib.nullcontext():
            pred = self.feedforward_model(
                x=phi,
                cond=cond,
                text=text,
                time=time,
                drop_audio_cond=False,
                drop_text=False,  # make sure no cfg is used
            )

            # predicted mel spectrogram
            output = phi + (1 - t) * pred
            output[~rand_span_mask] = inp[~rand_span_mask]

        if update_generator:
            generator_data_dict = {
                "inp": output,
                "text": text,
                "rand_span_mask": rand_span_mask,
                "second_time": time if self.second_time else None,
                "mse_loss": time.mean() == student_steps[-1].mean(),
                "real_data": {
                    "inp": inp,
                    "text": text,
                    "rand_span_mask": rand_span_mask,
                },
            }

            # avoid any side effects of gradient accumulation
            # self.guidance_model.requires_grad_(False)
            # self.feedforward_model.requires_grad_(True)
            generator_loss_dict, generator_log_dict = self.guidance_model(
                generator_turn=True,
                guidance_turn=False,
                generator_data_dict=generator_data_dict,
                guidance_data_dict=None,
            )

            generator_log_dict["ground_truth"] = x1
            generator_log_dict["generator_input"] = phi
            generator_log_dict["generator_output"] = output
            generator_log_dict["generator_cond"] = cond
            generator_log_dict["time"] = time

            return generator_loss_dict, generator_log_dict
        else:
            guidance_data_dict = {
                "inp": output.detach(),
                "text": text,
                "rand_span_mask": rand_span_mask,
                "second_time": time if self.second_time else None,
                "real_data": {
                    "inp": inp,
                    "text": text,
                    "rand_span_mask": rand_span_mask,
                },
            }

            # avoid any side effects of gradient accumulation
            # self.feedforward_model.requires_grad_(False)
            # self.guidance_model.requires_grad_(True)
            guidance_loss_dict, guidance_log_dict = self.guidance_model(
                generator_turn=False,
                guidance_turn=True,
                generator_data_dict=None,
                guidance_data_dict=guidance_data_dict,
            )
            # self.feedforward_model.requires_grad_(True)

            return guidance_loss_dict, guidance_log_dict

        # return guidance_loss_dict, guidance_log_dict, generator_loss_dict, generator_log_dict


if __name__ == "__main__":

    from torch.utils.data import DataLoader, Dataset, SequentialSampler

    from f5_tts.model.dataset import (DynamicBatchSampler, collate_fn,
                                      load_dataset)
    from f5_tts.model.utils import get_tokenizer

    bsz = 16

    tokenizer = "pinyin"  # 'pinyin', 'char', or 'custom'
    tokenizer_path = None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
    dataset_name = "Emilia_ZH_EN"
    if tokenizer == "custom":
        tokenizer_path = tokenizer_path
    else:
        tokenizer_path = dataset_name
    vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)

    dit = DiT(
        dim=1024,
        depth=22,
        heads=16,
        ff_mult=2,
        text_dim=512,
        conv_layers=4,
        text_num_embeds=vocab_size,
        mel_dim=100,
    )

    model = UniModel(
        dit,
        checkpoint_path="/data4/F5TTS/ckpts/F5TTS_Base_norm_flow_8GPU_vocos_pinyin_Emilia_ZH_EN",
        gen_cls_loss=True,
        vocab_char_map=vocab_char_map,
        frac_lengths_mask=(0.7, 1.0),
    ).cuda()

    # batch = next(iter(train_dataloader))
    # torch.save(batch, "batch.pt")
    batch = torch.load("batch.pt")
    inp, text, lens = (
        batch["mel"].permute(0, 2, 1).cuda(),
        batch["text"],
        batch["mel_lengths"].cuda(),
    )

    # text = ["hello world"] * bsz
    # lens = torch.randint(1, 1000, (bsz,)).cuda()
    # inp = torch.randn(bsz, lens.max(), 100).cuda()
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        num_student_step = 4

        guidance_loss_dict, guidance_log_dict = model(
            inp,
            text,
            lens=lens,
            update_generator=False,
            student_steps=(torch.linspace(0.0, 1.0, num_student_step + 1)[:-1]),
        )

        generator_loss_dict, generator_log_dict = model(
            inp,
            text,
            lens=lens,
            update_generator=True,
            student_steps=(torch.linspace(0.0, 1.0, num_student_step + 1)[:-1]),
        )

        print(guidance_loss_dict)
        print(generator_loss_dict)

        guidance_loss = 0
        guidance_loss += guidance_loss_dict["loss_fake_mean"]
        guidance_loss += guidance_loss_dict["guidance_cls_loss"]

        generator_loss = 0
        generator_loss += generator_loss_dict["loss_dm"]
        generator_loss += generator_loss_dict["loss_ctc"]
        generator_loss += generator_loss_dict["loss_sim"]
        generator_loss += generator_loss_dict["gen_cls_loss"]
        generator_loss += generator_loss_dict["loss_mse"]

        guidance_loss.backward()
        generator_loss.backward()