alex committed · Commit c20d00d · 1 Parent(s): a8a56b0

do faster CPU based tts

Files changed (36)
  1. app.py +32 -50
  2. higgs_audio/__init__.py +0 -1
  3. higgs_audio/audio_processing/LICENSE +0 -51
  4. higgs_audio/audio_processing/descriptaudiocodec/__init__.py +0 -0
  5. higgs_audio/audio_processing/descriptaudiocodec/dac/model/base.py +0 -286
  6. higgs_audio/audio_processing/descriptaudiocodec/dac/model/dac.py +0 -365
  7. higgs_audio/audio_processing/descriptaudiocodec/dac/nn/layers.py +0 -33
  8. higgs_audio/audio_processing/descriptaudiocodec/dac/nn/quantize.py +0 -251
  9. higgs_audio/audio_processing/higgs_audio_tokenizer.py +0 -341
  10. higgs_audio/audio_processing/quantization/__init__.py +0 -8
  11. higgs_audio/audio_processing/quantization/ac.py +0 -301
  12. higgs_audio/audio_processing/quantization/core_vq.py +0 -360
  13. higgs_audio/audio_processing/quantization/core_vq_lsx_version.py +0 -431
  14. higgs_audio/audio_processing/quantization/ddp_utils.py +0 -197
  15. higgs_audio/audio_processing/quantization/distrib.py +0 -123
  16. higgs_audio/audio_processing/quantization/vq.py +0 -116
  17. higgs_audio/audio_processing/semantic_module.py +0 -310
  18. higgs_audio/constants.py +0 -3
  19. higgs_audio/data_collator/__init__.py +0 -0
  20. higgs_audio/data_collator/higgs_audio_collator.py +0 -583
  21. higgs_audio/data_types.py +0 -38
  22. higgs_audio/dataset/__init__.py +0 -0
  23. higgs_audio/dataset/chatml_dataset.py +0 -554
  24. higgs_audio/model/__init__.py +0 -9
  25. higgs_audio/model/audio_head.py +0 -139
  26. higgs_audio/model/common.py +0 -27
  27. higgs_audio/model/configuration_higgs_audio.py +0 -235
  28. higgs_audio/model/cuda_graph_runner.py +0 -129
  29. higgs_audio/model/custom_modules.py +0 -155
  30. higgs_audio/model/modeling_higgs_audio.py +0 -0
  31. higgs_audio/model/utils.py +0 -778
  32. higgs_audio/serve/serve_engine.py +0 -474
  33. higgs_audio/serve/utils.py +0 -254
  34. higgs_audio_utils.py +0 -280
  35. requirements.txt +2 -13
  36. supertonic.py +364 -0
app.py CHANGED
@@ -16,40 +16,11 @@ from tqdm import tqdm
 import importlib, site, sys
 from huggingface_hub import hf_hub_download, snapshot_download
 
-# Re-discover all .pth/.egg-link files
-for sitedir in site.getsitepackages():
-    site.addsitedir(sitedir)
-
-# Clear caches so importlib will pick up new modules
-importlib.invalidate_caches()
-
 def sh(cmd): subprocess.check_call(cmd, shell=True)
 
-flash_attention_installed = False
-
-try:
-    print("Attempting to download and install FlashAttention wheel...")
-    flash_attention_wheel = hf_hub_download(
-        repo_id="alexnasa/flash-attn-3",
-        repo_type="model",
-        filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl",
-    )
-
-    sh(f"pip install {flash_attention_wheel}")
-
-    # tell Python to re-scan site-packages now that the egg-link exists
-    import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches()
-
-    flash_attention_installed = True
-    print("FlashAttention installed successfully.")
-
-except Exception as e:
-    print(f"⚠️ Could not install FlashAttention: {e}")
-    print("Continuing without FlashAttention...")
-
 import torch
 print(f"Torch version: {torch.__version__}")
-print(f"FlashAttention available: {flash_attention_installed}")
+
 
 
 import torch.nn as nn
@@ -82,41 +53,38 @@ from transformers import Wav2Vec2FeatureExtractor
 import torchvision.transforms as transforms
 import torch.nn.functional as F
 from OmniAvatar.utils.audio_preprocess import add_silence_to_audio_ffmpeg
-from higgs_audio_utils import text_to_speech, initialize_engine
-
 
-DEFAULT_TTS_MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
-DEFAULT_AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer"
-engine = initialize_engine(DEFAULT_TTS_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH)
+from supertonic import generate_speech
 
 os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/proprocess_results"
 
-@spaces.GPU
-def tts_from_text(text):
-    _, output = text_to_speech(engine, text)
+def tts_from_text(text, tts_dir, voice_choice):
+
+    output = generate_speech([text], tts_dir, voice_choice)[0]
     return output
 
 def speak_to_me(session_id, evt: gr.EventData):
     detail = getattr(evt, "data", None) or getattr(evt, "_data", {}) or {}
 
-    current_text = detail.get("text", "")
-
-    output = tts_from_text(current_text)
-
     if session_id is None:
         session_id = uuid.uuid4().hex
 
+    current_text = detail.get("text", "")
+    voice = detail.get("choice")
+    voice_choice = "M1"
+
+    if voice == "Person1":
+        voice_choice = "M1"
+    elif voice == "Person2":
+        voice_choice = "F1"
+
     output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
 
     tts_dir = output_dir + '/tts'
     os.makedirs(tts_dir, exist_ok=True)
-    speech_to_text_path = os.path.join(tts_dir, f"speech_to_text.wav")
 
-    sampling_rate = output[0]
-    audio_data = output[1]
-
-    torchaudio.save(speech_to_text_path, torch.from_numpy(audio_data)[None, :], output[0])
-
+    speech_to_text_path = tts_from_text(current_text, tts_dir, voice_choice)
+
     return speech_to_text_path
 
 def tensor_to_pil(tensor):
@@ -814,6 +782,20 @@ css = """
 .stateful textarea[readonly]{
   color: var(--body-text-color-subdued) !important; /* subdued in both modes */
 }
+
+.button-gradient {
+  background: linear-gradient(45deg, rgb(255, 65, 108), rgb(255, 75, 43), rgb(255, 155, 0), rgb(255, 65, 108)) 0% 0% / 400% 400%;
+  border: none;
+  padding: 14px 28px;
+  font-size: 16px;
+  font-weight: bold;
+  color: white;
+  border-radius: 10px;
+  cursor: pointer;
+  transition: 0.3s ease-in-out;
+  animation: 2s linear 0s infinite normal none running gradientAnimation;
+  box-shadow: rgba(255, 65, 108, 0.6) 0px 4px 10px;
+}
 """
 
 with gr.Blocks(css=css) as demo:
@@ -859,7 +841,7 @@ with gr.Blocks(css=css) as demo:
 with gr.Column():
 
     image_input = extendedimage(label="Reference Image", type="filepath", height=512)
-    audio_input = ExtendedAudio(label="Input Audio", type="filepath", options=["EMPTY"], show_download_button=True)
+    audio_input = ExtendedAudio(label="Input Audio", type="filepath", options=["Person1", "Person2"], show_download_button=True)
    gr.Markdown("*A 5-second limit is applied to audio files to shorten generation time. You can turn this off in Advanced Settings*")
 
 
@@ -869,7 +851,7 @@ with gr.Blocks(css=css) as demo:
     num_steps = gr.Slider(4, 50, value=8, step=1, label="Steps")
 
     time_required = gr.Text(value="⌚ Zero GPU Required: --", show_label=False)
-    infer_btn = gr.Button("🦜 Avatar Me", variant="primary")
+    infer_btn = gr.Button("🦜 Avatar Me", variant='primary', elem_classes="button-gradient")
     with gr.Accordion("Advanced Settings", open=False):
         raw_img_text = gr.Text(show_label=False, label="", value='', visible=False)
         limit_on = gr.Checkbox(label="Limit Audio files to 5 seconds", value=True)
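
For context, the GPU-bound Higgs engine is replaced by a single call into the new supertonic module (added in this commit as supertonic.py, not reproduced in this view). A minimal sketch of the new CPU path, assuming, as the diff implies but supertonic.py does not confirm here, that generate_speech takes a list of texts, an output directory, and a voice id ("M1" or "F1") and returns one generated audio file path per text:

```python
# Sketch only: the generate_speech contract below is inferred from how the new
# app.py uses it (list of texts in, list of audio file paths out).
import os

from supertonic import generate_speech


def synthesize(text: str, out_dir: str, voice: str = "M1") -> str:
    """Run the CPU-based TTS for a single string and return the generated file path."""
    os.makedirs(out_dir, exist_ok=True)
    # generate_speech takes a list of texts; index 0 is the output for our single text.
    return generate_speech([text], out_dir, voice)[0]


if __name__ == "__main__":
    wav_path = synthesize("Hello from the CPU pipeline.", "./tts_out", voice="F1")
    print(wav_path)
```

In the app itself, speak_to_me() maps the UI options "Person1"/"Person2" onto the "M1"/"F1" voice ids before calling tts_from_text().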
higgs_audio/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .model import HiggsAudioConfig, HiggsAudioModel
 
 
higgs_audio/audio_processing/LICENSE DELETED
@@ -1,51 +0,0 @@
1
- Third-Party License Attribution for Audio Processing Module
2
- ===========================================================
3
-
4
- This directory contains code derived from multiple open-source projects.
5
- The following sections detail the licenses and attributions for third-party code.
6
-
7
- ## XCodec Repository
8
- The code in this directory is derived from:
9
- https://github.com/zhenye234/xcodec
10
-
11
- ## Individual File Attributions
12
-
13
- ### Quantization Module (quantization/)
14
- - Several files contain code derived from Meta Platforms, Inc. and the vector-quantize-pytorch repository
15
- - Individual files contain their own license headers where applicable
16
- - The vector-quantize-pytorch portions are licensed under the MIT License
17
-
18
- ## License Terms
19
-
20
- ### MIT License (for applicable portions)
21
- Permission is hereby granted, free of charge, to any person obtaining a copy
22
- of this software and associated documentation files (the "Software"), to deal
23
- in the Software without restriction, including without limitation the rights
24
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
25
- copies of the Software, and to permit persons to whom the Software is
26
- furnished to do so, subject to the following conditions:
27
-
28
- The above copyright notice and this permission notice shall be included in all
29
- copies or substantial portions of the Software.
30
-
31
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
34
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
36
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37
- SOFTWARE.
38
-
39
- ## Attribution Requirements
40
- When using this code, please ensure proper attribution to:
41
- 1. The original xcodec repository: https://github.com/zhenye234/xcodec
42
- 2. Any other repositories mentioned in individual file headers
43
- 3. This derivative work and its modifications
44
-
45
- ## Disclaimer
46
- This directory contains modified versions of the original code. Please refer to
47
- the original repositories for the canonical implementations and their specific
48
- license terms.
49
-
50
- For any questions about licensing or attribution, please check the individual
51
- file headers and the original source repositories.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
higgs_audio/audio_processing/descriptaudiocodec/__init__.py DELETED
File without changes
higgs_audio/audio_processing/descriptaudiocodec/dac/model/base.py DELETED
@@ -1,286 +0,0 @@
1
- import math
2
- from dataclasses import dataclass
3
- from pathlib import Path
4
- from typing import Union
5
-
6
- import numpy as np
7
- import torch
8
- import tqdm
9
- from audiotools import AudioSignal
10
- from torch import nn
11
-
12
- SUPPORTED_VERSIONS = ["1.0.0"]
13
-
14
-
15
- @dataclass
16
- class DACFile:
17
- codes: torch.Tensor
18
-
19
- # Metadata
20
- chunk_length: int
21
- original_length: int
22
- input_db: float
23
- channels: int
24
- sample_rate: int
25
- padding: bool
26
- dac_version: str
27
-
28
- def save(self, path):
29
- artifacts = {
30
- "codes": self.codes.numpy().astype(np.uint16),
31
- "metadata": {
32
- "input_db": self.input_db.numpy().astype(np.float32),
33
- "original_length": self.original_length,
34
- "sample_rate": self.sample_rate,
35
- "chunk_length": self.chunk_length,
36
- "channels": self.channels,
37
- "padding": self.padding,
38
- "dac_version": SUPPORTED_VERSIONS[-1],
39
- },
40
- }
41
- path = Path(path).with_suffix(".dac")
42
- with open(path, "wb") as f:
43
- np.save(f, artifacts)
44
- return path
45
-
46
- @classmethod
47
- def load(cls, path):
48
- artifacts = np.load(path, allow_pickle=True)[()]
49
- codes = torch.from_numpy(artifacts["codes"].astype(int))
50
- if artifacts["metadata"].get("dac_version", None) not in SUPPORTED_VERSIONS:
51
- raise RuntimeError(f"Given file {path} can't be loaded with this version of descript-audio-codec.")
52
- return cls(codes=codes, **artifacts["metadata"])
53
-
54
-
55
- class CodecMixin:
56
- @property
57
- def padding(self):
58
- if not hasattr(self, "_padding"):
59
- self._padding = True
60
- return self._padding
61
-
62
- @padding.setter
63
- def padding(self, value):
64
- assert isinstance(value, bool)
65
-
66
- layers = [l for l in self.modules() if isinstance(l, (nn.Conv1d, nn.ConvTranspose1d))]
67
-
68
- for layer in layers:
69
- if value:
70
- if hasattr(layer, "original_padding"):
71
- layer.padding = layer.original_padding
72
- else:
73
- layer.original_padding = layer.padding
74
- layer.padding = tuple(0 for _ in range(len(layer.padding)))
75
-
76
- self._padding = value
77
-
78
- def get_delay(self):
79
- # Any number works here, delay is invariant to input length
80
- l_out = self.get_output_length(0)
81
- L = l_out
82
-
83
- layers = []
84
- for layer in self.modules():
85
- if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)):
86
- layers.append(layer)
87
-
88
- for layer in reversed(layers):
89
- d = layer.dilation[0]
90
- k = layer.kernel_size[0]
91
- s = layer.stride[0]
92
-
93
- if isinstance(layer, nn.ConvTranspose1d):
94
- L = ((L - d * (k - 1) - 1) / s) + 1
95
- elif isinstance(layer, nn.Conv1d):
96
- L = (L - 1) * s + d * (k - 1) + 1
97
-
98
- L = math.ceil(L)
99
-
100
- l_in = L
101
-
102
- return (l_in - l_out) // 2
103
-
104
- def get_output_length(self, input_length):
105
- L = input_length
106
- # Calculate output length
107
- for layer in self.modules():
108
- if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)):
109
- d = layer.dilation[0]
110
- k = layer.kernel_size[0]
111
- s = layer.stride[0]
112
-
113
- if isinstance(layer, nn.Conv1d):
114
- L = ((L - d * (k - 1) - 1) / s) + 1
115
- elif isinstance(layer, nn.ConvTranspose1d):
116
- L = (L - 1) * s + d * (k - 1) + 1
117
-
118
- L = math.floor(L)
119
- return L
120
-
121
- @torch.no_grad()
122
- def compress(
123
- self,
124
- audio_path_or_signal: Union[str, Path, AudioSignal],
125
- win_duration: float = 1.0,
126
- verbose: bool = False,
127
- normalize_db: float = -16,
128
- n_quantizers: int = None,
129
- ) -> DACFile:
130
- """Processes an audio signal from a file or AudioSignal object into
131
- discrete codes. This function processes the signal in short windows,
132
- using constant GPU memory.
133
-
134
- Parameters
135
- ----------
136
- audio_path_or_signal : Union[str, Path, AudioSignal]
137
- audio signal to reconstruct
138
- win_duration : float, optional
139
- window duration in seconds, by default 5.0
140
- verbose : bool, optional
141
- by default False
142
- normalize_db : float, optional
143
- normalize db, by default -16
144
-
145
- Returns
146
- -------
147
- DACFile
148
- Object containing compressed codes and metadata
149
- required for decompression
150
- """
151
- audio_signal = audio_path_or_signal
152
- if isinstance(audio_signal, (str, Path)):
153
- audio_signal = AudioSignal.load_from_file_with_ffmpeg(str(audio_signal))
154
-
155
- self.eval()
156
- original_padding = self.padding
157
- original_device = audio_signal.device
158
-
159
- audio_signal = audio_signal.clone()
160
- original_sr = audio_signal.sample_rate
161
-
162
- resample_fn = audio_signal.resample
163
- loudness_fn = audio_signal.loudness
164
-
165
- # If audio is > 10 minutes long, use the ffmpeg versions
166
- if audio_signal.signal_duration >= 10 * 60 * 60:
167
- resample_fn = audio_signal.ffmpeg_resample
168
- loudness_fn = audio_signal.ffmpeg_loudness
169
-
170
- original_length = audio_signal.signal_length
171
- resample_fn(self.sample_rate)
172
- input_db = loudness_fn()
173
-
174
- if normalize_db is not None:
175
- audio_signal.normalize(normalize_db)
176
- audio_signal.ensure_max_of_audio()
177
-
178
- nb, nac, nt = audio_signal.audio_data.shape
179
- audio_signal.audio_data = audio_signal.audio_data.reshape(nb * nac, 1, nt)
180
- win_duration = audio_signal.signal_duration if win_duration is None else win_duration
181
-
182
- if audio_signal.signal_duration <= win_duration:
183
- # Unchunked compression (used if signal length < win duration)
184
- self.padding = True
185
- n_samples = nt
186
- hop = nt
187
- else:
188
- # Chunked inference
189
- self.padding = False
190
- # Zero-pad signal on either side by the delay
191
- audio_signal.zero_pad(self.delay, self.delay)
192
- n_samples = int(win_duration * self.sample_rate)
193
- # Round n_samples to nearest hop length multiple
194
- n_samples = int(math.ceil(n_samples / self.hop_length) * self.hop_length)
195
- hop = self.get_output_length(n_samples)
196
-
197
- codes = []
198
- range_fn = range if not verbose else tqdm.trange
199
-
200
- for i in range_fn(0, nt, hop):
201
- x = audio_signal[..., i : i + n_samples]
202
- x = x.zero_pad(0, max(0, n_samples - x.shape[-1]))
203
-
204
- audio_data = x.audio_data.to(self.device)
205
- audio_data = self.preprocess(audio_data, self.sample_rate)
206
- _, c, _, _, _ = self.encode(audio_data, n_quantizers)
207
- codes.append(c.to(original_device))
208
- chunk_length = c.shape[-1]
209
-
210
- codes = torch.cat(codes, dim=-1)
211
-
212
- dac_file = DACFile(
213
- codes=codes,
214
- chunk_length=chunk_length,
215
- original_length=original_length,
216
- input_db=input_db,
217
- channels=nac,
218
- sample_rate=original_sr,
219
- padding=self.padding,
220
- dac_version=SUPPORTED_VERSIONS[-1],
221
- )
222
-
223
- if n_quantizers is not None:
224
- codes = codes[:, :n_quantizers, :]
225
-
226
- self.padding = original_padding
227
- return dac_file
228
-
229
- @torch.no_grad()
230
- def decompress(
231
- self,
232
- obj: Union[str, Path, DACFile],
233
- verbose: bool = False,
234
- ) -> AudioSignal:
235
- """Reconstruct audio from a given .dac file
236
-
237
- Parameters
238
- ----------
239
- obj : Union[str, Path, DACFile]
240
- .dac file location or corresponding DACFile object.
241
- verbose : bool, optional
242
- Prints progress if True, by default False
243
-
244
- Returns
245
- -------
246
- AudioSignal
247
- Object with the reconstructed audio
248
- """
249
- self.eval()
250
- if isinstance(obj, (str, Path)):
251
- obj = DACFile.load(obj)
252
-
253
- original_padding = self.padding
254
- self.padding = obj.padding
255
-
256
- range_fn = range if not verbose else tqdm.trange
257
- codes = obj.codes
258
- original_device = codes.device
259
- chunk_length = obj.chunk_length
260
- recons = []
261
-
262
- for i in range_fn(0, codes.shape[-1], chunk_length):
263
- c = codes[..., i : i + chunk_length].to(self.device)
264
- z = self.quantizer.from_codes(c)[0]
265
- r = self.decode(z)
266
- recons.append(r.to(original_device))
267
-
268
- recons = torch.cat(recons, dim=-1)
269
- recons = AudioSignal(recons, self.sample_rate)
270
-
271
- resample_fn = recons.resample
272
- loudness_fn = recons.loudness
273
-
274
- # If audio is > 10 minutes long, use the ffmpeg versions
275
- if recons.signal_duration >= 10 * 60 * 60:
276
- resample_fn = recons.ffmpeg_resample
277
- loudness_fn = recons.ffmpeg_loudness
278
-
279
- recons.normalize(obj.input_db)
280
- resample_fn(obj.sample_rate)
281
- recons = recons[..., : obj.original_length]
282
- loudness_fn()
283
- recons.audio_data = recons.audio_data.reshape(-1, obj.channels, obj.original_length)
284
-
285
- self.padding = original_padding
286
- return recons
higgs_audio/audio_processing/descriptaudiocodec/dac/model/dac.py DELETED
@@ -1,365 +0,0 @@
1
- import math
2
- from typing import List
3
- from typing import Union
4
-
5
- import numpy as np
6
- import torch
7
- from audiotools import AudioSignal
8
- from audiotools.ml import BaseModel
9
- from torch import nn
10
-
11
- from .base import CodecMixin
12
- from dac.nn.layers import Snake1d
13
- from dac.nn.layers import WNConv1d
14
- from dac.nn.layers import WNConvTranspose1d
15
- from dac.nn.quantize import ResidualVectorQuantize
16
-
17
-
18
- def init_weights(m):
19
- if isinstance(m, nn.Conv1d):
20
- nn.init.trunc_normal_(m.weight, std=0.02)
21
- nn.init.constant_(m.bias, 0)
22
-
23
-
24
- class ResidualUnit(nn.Module):
25
- def __init__(self, dim: int = 16, dilation: int = 1):
26
- super().__init__()
27
- pad = ((7 - 1) * dilation) // 2
28
- self.block = nn.Sequential(
29
- Snake1d(dim),
30
- WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
31
- Snake1d(dim),
32
- WNConv1d(dim, dim, kernel_size=1),
33
- )
34
-
35
- def forward(self, x):
36
- y = self.block(x)
37
- pad = (x.shape[-1] - y.shape[-1]) // 2
38
- if pad > 0:
39
- x = x[..., pad:-pad]
40
- return x + y
41
-
42
-
43
- class EncoderBlock(nn.Module):
44
- def __init__(self, dim: int = 16, stride: int = 1):
45
- super().__init__()
46
- self.block = nn.Sequential(
47
- ResidualUnit(dim // 2, dilation=1),
48
- ResidualUnit(dim // 2, dilation=3),
49
- ResidualUnit(dim // 2, dilation=9),
50
- Snake1d(dim // 2),
51
- WNConv1d(
52
- dim // 2,
53
- dim,
54
- kernel_size=2 * stride,
55
- stride=stride,
56
- padding=math.ceil(stride / 2),
57
- ),
58
- )
59
-
60
- def forward(self, x):
61
- return self.block(x)
62
-
63
-
64
- class Encoder(nn.Module):
65
- def __init__(
66
- self,
67
- d_model: int = 64,
68
- strides: list = [2, 4, 8, 8],
69
- d_latent: int = 256,
70
- ):
71
- super().__init__()
72
- # Create first convolution
73
- self.block = [WNConv1d(1, d_model, kernel_size=7, padding=3)]
74
-
75
- # Create EncoderBlocks that double channels as they downsample by `stride`
76
- for stride in strides:
77
- d_model *= 2
78
- self.block += [EncoderBlock(d_model, stride=stride)]
79
-
80
- # Create last convolution
81
- self.block += [
82
- Snake1d(d_model),
83
- WNConv1d(d_model, d_latent, kernel_size=3, padding=1),
84
- ]
85
-
86
- # Wrap black into nn.Sequential
87
- self.block = nn.Sequential(*self.block)
88
- self.enc_dim = d_model
89
-
90
- def forward(self, x):
91
- return self.block(x)
92
-
93
-
94
- class DecoderBlock(nn.Module):
95
- def __init__(self, input_dim: int = 16, output_dim: int = 8, stride: int = 1, out_pad=0):
96
- super().__init__()
97
- self.block = nn.Sequential(
98
- Snake1d(input_dim),
99
- WNConvTranspose1d(
100
- input_dim,
101
- output_dim,
102
- kernel_size=2 * stride,
103
- stride=stride,
104
- padding=math.ceil(stride / 2),
105
- output_padding=stride % 2, # out_pad,
106
- ),
107
- ResidualUnit(output_dim, dilation=1),
108
- ResidualUnit(output_dim, dilation=3),
109
- ResidualUnit(output_dim, dilation=9),
110
- )
111
-
112
- def forward(self, x):
113
- return self.block(x)
114
-
115
-
116
- class Decoder(nn.Module):
117
- def __init__(
118
- self,
119
- input_channel,
120
- channels,
121
- rates,
122
- d_out: int = 1,
123
- ):
124
- super().__init__()
125
-
126
- # Add first conv layer
127
- layers = [WNConv1d(input_channel, channels, kernel_size=7, padding=3)]
128
-
129
- # Add upsampling + MRF blocks
130
- for i, stride in enumerate(rates):
131
- input_dim = channels // 2**i
132
- output_dim = channels // 2 ** (i + 1)
133
- if i == 1:
134
- out_pad = 1
135
- else:
136
- out_pad = 0
137
- layers += [DecoderBlock(input_dim, output_dim, stride, out_pad)]
138
-
139
- # Add final conv layer
140
- layers += [
141
- Snake1d(output_dim),
142
- WNConv1d(output_dim, d_out, kernel_size=7, padding=3),
143
- # nn.Tanh(),
144
- ]
145
-
146
- self.model = nn.Sequential(*layers)
147
-
148
- def forward(self, x):
149
- return self.model(x)
150
-
151
-
152
- class DAC(BaseModel, CodecMixin):
153
- def __init__(
154
- self,
155
- encoder_dim: int = 64,
156
- encoder_rates: List[int] = [2, 4, 8, 8],
157
- latent_dim: int = None,
158
- decoder_dim: int = 1536,
159
- decoder_rates: List[int] = [8, 8, 4, 2],
160
- n_codebooks: int = 9,
161
- codebook_size: int = 1024,
162
- codebook_dim: Union[int, list] = 8,
163
- quantizer_dropout: bool = False,
164
- sample_rate: int = 44100,
165
- ):
166
- super().__init__()
167
-
168
- self.encoder_dim = encoder_dim
169
- self.encoder_rates = encoder_rates
170
- self.decoder_dim = decoder_dim
171
- self.decoder_rates = decoder_rates
172
- self.sample_rate = sample_rate
173
-
174
- if latent_dim is None:
175
- latent_dim = encoder_dim * (2 ** len(encoder_rates))
176
-
177
- self.latent_dim = latent_dim
178
-
179
- self.hop_length = np.prod(encoder_rates)
180
- self.encoder = Encoder(encoder_dim, encoder_rates, latent_dim)
181
-
182
- self.n_codebooks = n_codebooks
183
- self.codebook_size = codebook_size
184
- self.codebook_dim = codebook_dim
185
- self.quantizer = ResidualVectorQuantize(
186
- input_dim=latent_dim,
187
- n_codebooks=n_codebooks,
188
- codebook_size=codebook_size,
189
- codebook_dim=codebook_dim,
190
- quantizer_dropout=quantizer_dropout,
191
- )
192
-
193
- self.decoder = Decoder(
194
- latent_dim,
195
- decoder_dim,
196
- decoder_rates,
197
- )
198
- self.sample_rate = sample_rate
199
- self.apply(init_weights)
200
-
201
- self.delay = self.get_delay()
202
-
203
- def preprocess(self, audio_data, sample_rate):
204
- if sample_rate is None:
205
- sample_rate = self.sample_rate
206
- assert sample_rate == self.sample_rate
207
-
208
- length = audio_data.shape[-1]
209
- right_pad = math.ceil(length / self.hop_length) * self.hop_length - length
210
- audio_data = nn.functional.pad(audio_data, (0, right_pad))
211
-
212
- return audio_data
213
-
214
- def encode(
215
- self,
216
- audio_data: torch.Tensor,
217
- n_quantizers: int = None,
218
- ):
219
- """Encode given audio data and return quantized latent codes
220
-
221
- Parameters
222
- ----------
223
- audio_data : Tensor[B x 1 x T]
224
- Audio data to encode
225
- n_quantizers : int, optional
226
- Number of quantizers to use, by default None
227
- If None, all quantizers are used.
228
-
229
- Returns
230
- -------
231
- dict
232
- A dictionary with the following keys:
233
- "z" : Tensor[B x D x T]
234
- Quantized continuous representation of input
235
- "codes" : Tensor[B x N x T]
236
- Codebook indices for each codebook
237
- (quantized discrete representation of input)
238
- "latents" : Tensor[B x N*D x T]
239
- Projected latents (continuous representation of input before quantization)
240
- "vq/commitment_loss" : Tensor[1]
241
- Commitment loss to train encoder to predict vectors closer to codebook
242
- entries
243
- "vq/codebook_loss" : Tensor[1]
244
- Codebook loss to update the codebook
245
- "length" : int
246
- Number of samples in input audio
247
- """
248
- z = self.encoder(audio_data)
249
- z, codes, latents, commitment_loss, codebook_loss = self.quantizer(z, n_quantizers)
250
- return z, codes, latents, commitment_loss, codebook_loss
251
-
252
- def decode(self, z: torch.Tensor):
253
- """Decode given latent codes and return audio data
254
-
255
- Parameters
256
- ----------
257
- z : Tensor[B x D x T]
258
- Quantized continuous representation of input
259
- length : int, optional
260
- Number of samples in output audio, by default None
261
-
262
- Returns
263
- -------
264
- dict
265
- A dictionary with the following keys:
266
- "audio" : Tensor[B x 1 x length]
267
- Decoded audio data.
268
- """
269
- return self.decoder(z)
270
-
271
- def forward(
272
- self,
273
- audio_data: torch.Tensor,
274
- sample_rate: int = None,
275
- n_quantizers: int = None,
276
- ):
277
- """Model forward pass
278
-
279
- Parameters
280
- ----------
281
- audio_data : Tensor[B x 1 x T]
282
- Audio data to encode
283
- sample_rate : int, optional
284
- Sample rate of audio data in Hz, by default None
285
- If None, defaults to `self.sample_rate`
286
- n_quantizers : int, optional
287
- Number of quantizers to use, by default None.
288
- If None, all quantizers are used.
289
-
290
- Returns
291
- -------
292
- dict
293
- A dictionary with the following keys:
294
- "z" : Tensor[B x D x T]
295
- Quantized continuous representation of input
296
- "codes" : Tensor[B x N x T]
297
- Codebook indices for each codebook
298
- (quantized discrete representation of input)
299
- "latents" : Tensor[B x N*D x T]
300
- Projected latents (continuous representation of input before quantization)
301
- "vq/commitment_loss" : Tensor[1]
302
- Commitment loss to train encoder to predict vectors closer to codebook
303
- entries
304
- "vq/codebook_loss" : Tensor[1]
305
- Codebook loss to update the codebook
306
- "length" : int
307
- Number of samples in input audio
308
- "audio" : Tensor[B x 1 x length]
309
- Decoded audio data.
310
- """
311
- length = audio_data.shape[-1]
312
- audio_data = self.preprocess(audio_data, sample_rate)
313
- z, codes, latents, commitment_loss, codebook_loss = self.encode(audio_data, n_quantizers)
314
-
315
- x = self.decode(z)
316
- return {
317
- "audio": x[..., :length],
318
- "z": z,
319
- "codes": codes,
320
- "latents": latents,
321
- "vq/commitment_loss": commitment_loss,
322
- "vq/codebook_loss": codebook_loss,
323
- }
324
-
325
-
326
- if __name__ == "__main__":
327
- import numpy as np
328
- from functools import partial
329
-
330
- model = DAC().to("cpu")
331
-
332
- for n, m in model.named_modules():
333
- o = m.extra_repr()
334
- p = sum([np.prod(p.size()) for p in m.parameters()])
335
- fn = lambda o, p: o + f" {p / 1e6:<.3f}M params."
336
- setattr(m, "extra_repr", partial(fn, o=o, p=p))
337
- print(model)
338
- print("Total # of params: ", sum([np.prod(p.size()) for p in model.parameters()]))
339
-
340
- length = 88200 * 2
341
- x = torch.randn(1, 1, length).to(model.device)
342
- x.requires_grad_(True)
343
- x.retain_grad()
344
-
345
- # Make a forward pass
346
- out = model(x)["audio"]
347
- print("Input shape:", x.shape)
348
- print("Output shape:", out.shape)
349
-
350
- # Create gradient variable
351
- grad = torch.zeros_like(out)
352
- grad[:, :, grad.shape[-1] // 2] = 1
353
-
354
- # Make a backward pass
355
- out.backward(grad)
356
-
357
- # Check non-zero values
358
- gradmap = x.grad.squeeze(0)
359
- gradmap = (gradmap != 0).sum(0) # sum across features
360
- rf = (gradmap != 0).sum()
361
-
362
- print(f"Receptive field: {rf.item()}")
363
-
364
- x = AudioSignal(torch.randn(1, 1, 44100 * 60), 44100)
365
- model.decompress(model.compress(x, verbose=True), verbose=True)
higgs_audio/audio_processing/descriptaudiocodec/dac/nn/layers.py DELETED
@@ -1,33 +0,0 @@
1
- import numpy as np
2
- import torch
3
- import torch.nn as nn
4
- import torch.nn.functional as F
5
- from einops import rearrange
6
- from torch.nn.utils import weight_norm
7
-
8
-
9
- def WNConv1d(*args, **kwargs):
10
- return weight_norm(nn.Conv1d(*args, **kwargs))
11
-
12
-
13
- def WNConvTranspose1d(*args, **kwargs):
14
- return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
15
-
16
-
17
- # Scripting this brings model speed up 1.4x
18
- @torch.jit.script
19
- def snake(x, alpha):
20
- shape = x.shape
21
- x = x.reshape(shape[0], shape[1], -1)
22
- x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
23
- x = x.reshape(shape)
24
- return x
25
-
26
-
27
- class Snake1d(nn.Module):
28
- def __init__(self, channels):
29
- super().__init__()
30
- self.alpha = nn.Parameter(torch.ones(1, channels, 1))
31
-
32
- def forward(self, x):
33
- return snake(x, self.alpha)
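
For reference, the Snake activation deleted above computes, elementwise with a learnable per-channel parameter $\alpha$ (the 1e-9 term in the code is only a guard against division by zero):

$$\operatorname{snake}(x) = x + \frac{1}{\alpha}\,\sin^{2}(\alpha x)$$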
higgs_audio/audio_processing/descriptaudiocodec/dac/nn/quantize.py DELETED
@@ -1,251 +0,0 @@
1
- from typing import Union
2
-
3
- import numpy as np
4
- import torch
5
- import torch.nn as nn
6
- import torch.nn.functional as F
7
- from einops import rearrange
8
- from torch.nn.utils import weight_norm
9
-
10
- from dac.nn.layers import WNConv1d
11
-
12
-
13
- class VectorQuantize(nn.Module):
14
- """
15
- Implementation of VQ similar to Karpathy's repo:
16
- https://github.com/karpathy/deep-vector-quantization
17
- Additionally uses following tricks from Improved VQGAN
18
- (https://arxiv.org/pdf/2110.04627.pdf):
19
- 1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
20
- for improved codebook usage
21
- 2. l2-normalized codes: Converts euclidean distance to cosine similarity which
22
- improves training stability
23
- """
24
-
25
- def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int):
26
- super().__init__()
27
- self.codebook_size = codebook_size
28
- self.codebook_dim = codebook_dim
29
-
30
- self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1)
31
- self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1)
32
- self.codebook = nn.Embedding(codebook_size, codebook_dim)
33
-
34
- def forward(self, z):
35
- """Quantized the input tensor using a fixed codebook and returns
36
- the corresponding codebook vectors
37
-
38
- Parameters
39
- ----------
40
- z : Tensor[B x D x T]
41
-
42
- Returns
43
- -------
44
- Tensor[B x D x T]
45
- Quantized continuous representation of input
46
- Tensor[1]
47
- Commitment loss to train encoder to predict vectors closer to codebook
48
- entries
49
- Tensor[1]
50
- Codebook loss to update the codebook
51
- Tensor[B x T]
52
- Codebook indices (quantized discrete representation of input)
53
- Tensor[B x D x T]
54
- Projected latents (continuous representation of input before quantization)
55
- """
56
-
57
- # Factorized codes (ViT-VQGAN) Project input into low-dimensional space
58
- z_e = self.in_proj(z) # z_e : (B x D x T)
59
- z_q, indices = self.decode_latents(z_e)
60
-
61
- commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
62
- codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])
63
-
64
- z_q = z_e + (z_q - z_e).detach() # noop in forward pass, straight-through gradient estimator in backward pass
65
-
66
- z_q = self.out_proj(z_q)
67
-
68
- return z_q, commitment_loss, codebook_loss, indices, z_e
69
-
70
- def embed_code(self, embed_id):
71
- return F.embedding(embed_id, self.codebook.weight)
72
-
73
- def decode_code(self, embed_id):
74
- return self.embed_code(embed_id).transpose(1, 2)
75
-
76
- def decode_latents(self, latents):
77
- encodings = rearrange(latents, "b d t -> (b t) d")
78
- codebook = self.codebook.weight # codebook: (N x D)
79
-
80
- # L2 normalize encodings and codebook (ViT-VQGAN)
81
- encodings = F.normalize(encodings)
82
- codebook = F.normalize(codebook)
83
-
84
- # Compute euclidean distance with codebook
85
- dist = (
86
- encodings.pow(2).sum(1, keepdim=True)
87
- - 2 * encodings @ codebook.t()
88
- + codebook.pow(2).sum(1, keepdim=True).t()
89
- )
90
- indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
91
- z_q = self.decode_code(indices)
92
- return z_q, indices
93
-
94
-
95
- class ResidualVectorQuantize(nn.Module):
96
- """
97
- Introduced in SoundStream: An end2end neural audio codec
98
- https://arxiv.org/abs/2107.03312
99
- """
100
-
101
- def __init__(
102
- self,
103
- input_dim: int = 512,
104
- n_codebooks: int = 9,
105
- codebook_size: int = 1024,
106
- codebook_dim: Union[int, list] = 8,
107
- quantizer_dropout: float = 0.0,
108
- ):
109
- super().__init__()
110
- if isinstance(codebook_dim, int):
111
- codebook_dim = [codebook_dim for _ in range(n_codebooks)]
112
-
113
- self.n_codebooks = n_codebooks
114
- self.codebook_dim = codebook_dim
115
- self.codebook_size = codebook_size
116
-
117
- self.quantizers = nn.ModuleList(
118
- [VectorQuantize(input_dim, codebook_size, codebook_dim[i]) for i in range(n_codebooks)]
119
- )
120
- self.quantizer_dropout = quantizer_dropout
121
-
122
- def forward(self, z, n_quantizers: int = None):
123
- """Quantized the input tensor using a fixed set of `n` codebooks and returns
124
- the corresponding codebook vectors
125
- Parameters
126
- ----------
127
- z : Tensor[B x D x T]
128
- n_quantizers : int, optional
129
- No. of quantizers to use
130
- (n_quantizers < self.n_codebooks ex: for quantizer dropout)
131
- Note: if `self.quantizer_dropout` is True, this argument is ignored
132
- when in training mode, and a random number of quantizers is used.
133
- Returns
134
- -------
135
- dict
136
- A dictionary with the following keys:
137
-
138
- "z" : Tensor[B x D x T]
139
- Quantized continuous representation of input
140
- "codes" : Tensor[B x N x T]
141
- Codebook indices for each codebook
142
- (quantized discrete representation of input)
143
- "latents" : Tensor[B x N*D x T]
144
- Projected latents (continuous representation of input before quantization)
145
- "vq/commitment_loss" : Tensor[1]
146
- Commitment loss to train encoder to predict vectors closer to codebook
147
- entries
148
- "vq/codebook_loss" : Tensor[1]
149
- Codebook loss to update the codebook
150
- """
151
- z_q = 0
152
- residual = z
153
- commitment_loss = 0
154
- codebook_loss = 0
155
-
156
- codebook_indices = []
157
- latents = []
158
-
159
- if n_quantizers is None:
160
- n_quantizers = self.n_codebooks
161
- if self.training:
162
- n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1
163
- dropout = torch.randint(1, self.n_codebooks + 1, (z.shape[0],))
164
- n_dropout = int(z.shape[0] * self.quantizer_dropout)
165
- n_quantizers[:n_dropout] = dropout[:n_dropout]
166
- n_quantizers = n_quantizers.to(z.device)
167
-
168
- for i, quantizer in enumerate(self.quantizers):
169
- if self.training is False and i >= n_quantizers:
170
- break
171
-
172
- z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(residual)
173
-
174
- # Create mask to apply quantizer dropout
175
- mask = torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
176
- z_q = z_q + z_q_i * mask[:, None, None]
177
- residual = residual - z_q_i
178
-
179
- # Sum losses
180
- commitment_loss += (commitment_loss_i * mask).mean()
181
- codebook_loss += (codebook_loss_i * mask).mean()
182
-
183
- codebook_indices.append(indices_i)
184
- latents.append(z_e_i)
185
-
186
- codes = torch.stack(codebook_indices, dim=1)
187
- latents = torch.cat(latents, dim=1)
188
-
189
- return z_q, codes, latents, commitment_loss, codebook_loss
190
-
191
- def from_codes(self, codes: torch.Tensor):
192
- """Given the quantized codes, reconstruct the continuous representation
193
- Parameters
194
- ----------
195
- codes : Tensor[B x N x T]
196
- Quantized discrete representation of input
197
- Returns
198
- -------
199
- Tensor[B x D x T]
200
- Quantized continuous representation of input
201
- """
202
- z_q = 0.0
203
- z_p = []
204
- n_codebooks = codes.shape[1]
205
- for i in range(n_codebooks):
206
- z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
207
- z_p.append(z_p_i)
208
-
209
- z_q_i = self.quantizers[i].out_proj(z_p_i)
210
- z_q = z_q + z_q_i
211
- return z_q, torch.cat(z_p, dim=1), codes
212
-
213
- def from_latents(self, latents: torch.Tensor):
214
- """Given the unquantized latents, reconstruct the
215
- continuous representation after quantization.
216
-
217
- Parameters
218
- ----------
219
- latents : Tensor[B x N x T]
220
- Continuous representation of input after projection
221
-
222
- Returns
223
- -------
224
- Tensor[B x D x T]
225
- Quantized representation of full-projected space
226
- Tensor[B x D x T]
227
- Quantized representation of latent space
228
- """
229
- z_q = 0
230
- z_p = []
231
- codes = []
232
- dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])
233
-
234
- n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[0]
235
- for i in range(n_codebooks):
236
- j, k = dims[i], dims[i + 1]
237
- z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :])
238
- z_p.append(z_p_i)
239
- codes.append(codes_i)
240
-
241
- z_q_i = self.quantizers[i].out_proj(z_p_i)
242
- z_q = z_q + z_q_i
243
-
244
- return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)
245
-
246
-
247
- if __name__ == "__main__":
248
- rvq = ResidualVectorQuantize(quantizer_dropout=True)
249
- x = torch.randn(16, 512, 80)
250
- y = rvq(x)
251
- print(y["latents"].shape)
higgs_audio/audio_processing/higgs_audio_tokenizer.py DELETED
@@ -1,341 +0,0 @@
1
- # Based on code from: https://github.com/zhenye234/xcodec
2
- # Licensed under MIT License
3
- # Modifications by BosonAI
4
-
5
- import math
6
- import os
7
- import torch
8
- import torch.nn as nn
9
- import torch.nn.functional as F
10
- from typing import Optional, Union, Sequence
11
- import numpy as np
12
- from transformers import AutoModel
13
- import torchaudio
14
- import json
15
- import librosa
16
- from huggingface_hub import snapshot_download
17
-
18
- from vector_quantize_pytorch import ResidualFSQ
19
- from .descriptaudiocodec.dac.model import dac as dac2
20
- from .quantization.vq import ResidualVectorQuantizer
21
- from .semantic_module import Encoder, Decoder
22
-
23
-
24
- class EncodedResult:
25
- def __init__(self, audio_codes):
26
- self.audio_codes = audio_codes
27
-
28
-
29
- class HiggsAudioFeatureExtractor(nn.Module):
30
- def __init__(self, sampling_rate=16000):
31
- super().__init__()
32
- self.sampling_rate = sampling_rate
33
-
34
- def forward(self, raw_audio, sampling_rate=16000, return_tensors="pt"):
35
- # Convert from librosa to torch
36
- audio_signal = torch.tensor(raw_audio)
37
- audio_signal = audio_signal.unsqueeze(0)
38
- if len(audio_signal.shape) < 3:
39
- audio_signal = audio_signal.unsqueeze(0)
40
- return {"input_values": audio_signal}
41
-
42
-
43
- class HiggsAudioTokenizer(nn.Module):
44
- def __init__(
45
- self,
46
- n_filters: int = 32,
47
- D: int = 128,
48
- target_bandwidths: Sequence[Union[int, float]] = [1, 1.5, 2, 4, 6],
49
- ratios: Sequence[int] = [8, 5, 4, 2], # downsampling by 320
50
- sample_rate: int = 16000,
51
- bins: int = 1024,
52
- n_q: int = 8,
53
- codebook_dim: int = None,
54
- normalize: bool = False,
55
- causal: bool = False,
56
- semantic_techer: str = "hubert_base_general",
57
- last_layer_semantic: bool = True,
58
- merge_mode: str = "concat",
59
- downsample_mode: str = "step_down",
60
- semantic_mode: str = "classic",
61
- vq_scale: int = 1,
62
- semantic_sample_rate: int = None,
63
- device: str = "cuda",
64
- ):
65
- super().__init__()
66
- self.hop_length = np.prod(ratios)
67
- self.semantic_techer = semantic_techer
68
-
69
- self.frame_rate = math.ceil(sample_rate / np.prod(ratios)) # 50 Hz
70
-
71
- self.target_bandwidths = target_bandwidths
72
- self.n_q = n_q
73
- self.sample_rate = sample_rate
74
- self.encoder = dac2.Encoder(64, ratios, D)
75
-
76
- self.decoder_2 = dac2.Decoder(D, 1024, ratios)
77
- self.last_layer_semantic = last_layer_semantic
78
- self.device = device
79
- if semantic_techer == "hubert_base":
80
- self.semantic_model = AutoModel.from_pretrained("facebook/hubert-base-ls960")
81
- self.semantic_sample_rate = 16000
82
- self.semantic_dim = 768
83
- self.encoder_semantic_dim = 768
84
-
85
- elif semantic_techer == "wavlm_base_plus":
86
- self.semantic_model = AutoModel.from_pretrained("microsoft/wavlm-base-plus")
87
- self.semantic_sample_rate = 16000
88
- self.semantic_dim = 768
89
- self.encoder_semantic_dim = 768
90
-
91
- elif semantic_techer == "hubert_base_general":
92
- self.semantic_model = AutoModel.from_pretrained("ZhenYe234/hubert_base_general_audio")
93
- self.semantic_sample_rate = 16000
94
- self.semantic_dim = 768
95
- self.encoder_semantic_dim = 768
96
-
97
- # Overwrite semantic model sr to ensure semantic_downsample_factor is an integer
98
- if semantic_sample_rate is not None:
99
- self.semantic_sample_rate = semantic_sample_rate
100
-
101
- self.semantic_model.eval()
102
-
103
- # make the semantic model parameters do not need gradient
104
- for param in self.semantic_model.parameters():
105
- param.requires_grad = False
106
-
107
- self.semantic_downsample_factor = int(self.hop_length / (self.sample_rate / self.semantic_sample_rate) / 320)
108
-
109
- self.quantizer_dim = int((D + self.encoder_semantic_dim) // vq_scale)
110
- self.encoder_semantic = Encoder(input_channels=self.semantic_dim, encode_channels=self.encoder_semantic_dim)
111
- self.decoder_semantic = Decoder(
112
- code_dim=self.encoder_semantic_dim,
113
- output_channels=self.semantic_dim,
114
- decode_channels=self.semantic_dim,
115
- )
116
-
117
- # out_D=D+768
118
- if isinstance(bins, int): # RVQ
119
- self.quantizer = ResidualVectorQuantizer(
120
- dimension=self.quantizer_dim,
121
- codebook_dim=codebook_dim,
122
- n_q=n_q,
123
- bins=bins,
124
- )
125
- self.quantizer_type = "RVQ"
126
- else: # RFSQ
127
- self.quantizer = ResidualFSQ(dim=self.quantizer_dim, levels=bins, num_quantizers=n_q)
128
- self.quantizer_type = "RFSQ"
129
-
130
- self.fc_prior = nn.Linear(D + self.encoder_semantic_dim, self.quantizer_dim)
131
- self.fc_post1 = nn.Linear(self.quantizer_dim, self.encoder_semantic_dim)
132
- self.fc_post2 = nn.Linear(self.quantizer_dim, D)
133
-
134
- self.downsample_mode = downsample_mode
135
- if downsample_mode == "avg":
136
- self.semantic_pooling = nn.AvgPool1d(
137
- kernel_size=self.semantic_downsample_factor,
138
- stride=self.semantic_downsample_factor,
139
- )
140
-
141
- self.audio_tokenizer_feature_extractor = HiggsAudioFeatureExtractor(sampling_rate=self.sample_rate)
142
-
143
- @property
144
- def tps(self):
145
- return self.frame_rate
146
-
147
- @property
148
- def sampling_rate(self):
149
- return self.sample_rate
150
-
151
- @property
152
- def num_codebooks(self):
153
- return self.n_q
154
-
155
- @property
156
- def codebook_size(self):
157
- return self.quantizer_dim
158
-
159
- def get_last_layer(self):
160
- return self.decoder.layers[-1].weight
161
-
162
- def calculate_rec_loss(self, rec, target):
163
- target = target / target.norm(dim=-1, keepdim=True)
164
- rec = rec / rec.norm(dim=-1, keepdim=True)
165
- rec_loss = (1 - (target * rec).sum(-1)).mean()
166
-
167
- return rec_loss
168
-
169
- @torch.no_grad()
170
- def get_regress_target(self, x):
171
- x = torchaudio.functional.resample(x, self.sample_rate, self.semantic_sample_rate)
172
-
173
- if (
174
- self.semantic_techer == "hubert_base"
175
- or self.semantic_techer == "hubert_base_general"
176
- or self.semantic_techer == "wavlm_base_plus"
177
- ):
178
- x = x[:, 0, :]
179
- x = F.pad(x, (160, 160))
180
- target = self.semantic_model(x, output_hidden_states=True).hidden_states
181
- target = torch.stack(target, dim=1) # .transpose(-1, -2)#.flatten(start_dim=1, end_dim=2)
182
-
183
- # average for all layers
184
- target = target.mean(1)
185
- # target = target[9]
186
- # if self.hop_length > 320:
187
- # target = self.semantic_pooling(target.transpose(1, 2)).transpose(1, 2)
188
-
189
- elif self.semantic_techer == "w2v_bert2":
190
- target = self.semantic_model(x)
191
-
192
- elif self.semantic_techer.startswith("whisper"):
193
- if self.last_layer_semantic:
194
- target = self.semantic_model(x, avg_layers=False)
195
- else:
196
- target = self.semantic_model(x, avg_layers=True)
197
-
198
- elif self.semantic_techer.startswith("mert_music"):
199
- if self.last_layer_semantic:
200
- target = self.semantic_model(x, avg_layers=False)
201
- else:
202
- target = self.semantic_model(x, avg_layers=True)
203
-
204
- elif self.semantic_techer.startswith("qwen_audio_omni"):
205
- target = self.semantic_model(x)
206
-
207
- if self.downsample_mode == "step_down":
208
- if self.semantic_downsample_factor > 1:
209
- target = target[:, :: self.semantic_downsample_factor, :]
210
-
211
- elif self.downsample_mode == "avg":
212
- target = self.semantic_pooling(target.transpose(1, 2)).transpose(1, 2)
213
- return target
214
-
215
- def forward(self, x: torch.Tensor, bw: int):
216
- e_semantic_input = self.get_regress_target(x).detach()
217
-
218
- e_semantic = self.encoder_semantic(e_semantic_input.transpose(1, 2))
219
- e_acoustic = self.encoder(x)
220
-
221
- e = torch.cat([e_acoustic, e_semantic], dim=1)
222
-
223
- e = self.fc_prior(e.transpose(1, 2))
224
-
225
- if self.quantizer_type == "RVQ":
226
- e = e.transpose(1, 2)
227
- quantized, codes, bandwidth, commit_loss = self.quantizer(e, self.frame_rate, bw)
228
- quantized = quantized.transpose(1, 2)
229
- else:
230
- quantized, codes = self.quantizer(e)
231
- commit_loss = torch.tensor(0.0)
232
-
233
- quantized_semantic = self.fc_post1(quantized).transpose(1, 2)
234
- quantized_acoustic = self.fc_post2(quantized).transpose(1, 2)
235
-
236
- o = self.decoder_2(quantized_acoustic)
237
-
238
- o_semantic = self.decoder_semantic(quantized_semantic)
239
- semantic_recon_loss = F.mse_loss(e_semantic_input.transpose(1, 2).detach(), o_semantic)
240
-
241
- return o, commit_loss, semantic_recon_loss, None
242
-
243
- def encode(
244
- self,
245
- audio_path_or_wv,
246
- sr=None,
247
- loudness_normalize=False,
248
- loudness_threshold=-23.0,
249
- ):
250
- if isinstance(audio_path_or_wv, str):
251
- wv, sr = librosa.load(audio_path_or_wv, mono=True, sr=None)
252
- else:
253
- wv = audio_path_or_wv
254
- assert sr is not None
255
- if loudness_normalize:
256
- import pyloudnorm as pyln
257
-
258
- meter = pyln.Meter(sr)
259
- l = meter.integrated_loudness(wv)
260
- wv = pyln.normalize.loudness(wv, l, loudness_threshold)
261
- if sr != self.sampling_rate:
262
- wv = librosa.resample(wv, orig_sr=sr, target_sr=self.sampling_rate)
263
- if self.audio_tokenizer_feature_extractor is not None:
264
- inputs = self.audio_tokenizer_feature_extractor(
265
- raw_audio=wv,
266
- sampling_rate=self.audio_tokenizer_feature_extractor.sampling_rate,
267
- return_tensors="pt",
268
- )
269
- input_values = inputs["input_values"].to(self.device)
270
- else:
271
- input_values = torch.from_numpy(wv).float().unsqueeze(0)
272
- with torch.no_grad():
273
- encoder_outputs = self._xcodec_encode(input_values)
274
- vq_code = encoder_outputs.audio_codes[0]
275
- return vq_code
276
-
277
- def _xcodec_encode(self, x: torch.Tensor, target_bw: Optional[int] = None) -> torch.Tensor:
278
- bw = target_bw
279
-
280
- e_semantic_input = self.get_regress_target(x).detach()
281
-
282
- e_semantic = self.encoder_semantic(e_semantic_input.transpose(1, 2))
283
- e_acoustic = self.encoder(x)
284
-
285
- if e_acoustic.shape[2] != e_semantic.shape[2]:
286
- pad_size = 160 * self.semantic_downsample_factor
287
- e_acoustic = self.encoder(F.pad(x[:, 0, :], (pad_size, pad_size)).unsqueeze(0))
288
-
289
- if e_acoustic.shape[2] != e_semantic.shape[2]:
290
- if e_acoustic.shape[2] > e_semantic.shape[2]:
291
- e_acoustic = e_acoustic[:, :, : e_semantic.shape[2]]
292
- else:
293
- e_semantic = e_semantic[:, :, : e_acoustic.shape[2]]
294
-
295
- e = torch.cat([e_acoustic, e_semantic], dim=1)
296
-
297
- e = self.fc_prior(e.transpose(1, 2))
298
-
299
- if self.quantizer_type == "RVQ":
300
- e = e.transpose(1, 2)
301
- quantized, codes, bandwidth, commit_loss = self.quantizer(e, self.frame_rate, bw)
302
- codes = codes.permute(1, 0, 2)
303
- else:
304
- quantized, codes = self.quantizer(e)
305
- codes = codes.permute(0, 2, 1)
306
-
307
- # return codes
308
- return EncodedResult(codes)
309
-
310
- def decode(self, vq_code: torch.Tensor) -> torch.Tensor:
311
- if self.quantizer_type == "RVQ":
312
- vq_code = vq_code.permute(1, 0, 2)
313
- quantized = self.quantizer.decode(vq_code)
314
- quantized = quantized.transpose(1, 2)
315
- else:
316
- vq_code = vq_code.permute(0, 2, 1)
317
- quantized = self.quantizer.get_output_from_indices(vq_code)
318
- quantized_acoustic = self.fc_post2(quantized).transpose(1, 2)
319
-
320
- o = self.decoder_2(quantized_acoustic)
321
- return o.cpu().numpy()
322
-
323
-
324
- def load_higgs_audio_tokenizer(tokenizer_name_or_path, device="cuda"):
325
- is_local = os.path.exists(tokenizer_name_or_path)
326
- if not is_local:
327
- tokenizer_path = snapshot_download(tokenizer_name_or_path)
328
- else:
329
- tokenizer_path = tokenizer_name_or_path
330
- config_path = os.path.join(tokenizer_path, "config.json")
331
- model_path = os.path.join(tokenizer_path, "model.pth")
332
- config = json.load(open(config_path))
333
- model = HiggsAudioTokenizer(
334
- **config,
335
- device=device,
336
- )
337
- parameter_dict = torch.load(model_path, map_location=device)
338
- model.load_state_dict(parameter_dict, strict=False)
339
- model.to(device)
340
- model.eval()
341
- return model
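
For reference, the tokenizer removed here exposed a load/encode/decode API. A rough sketch of how it was driven, inferred only from the deleted code above (the repo id comes from the old app.py defaults; the device, file name, and added batch dimension are assumptions):

```python
# Sketch of the removed API, based on the deleted higgs_audio_tokenizer.py above.
from higgs_audio.audio_processing.higgs_audio_tokenizer import load_higgs_audio_tokenizer

tokenizer = load_higgs_audio_tokenizer("bosonai/higgs-audio-v2-tokenizer", device="cuda")
vq_code = tokenizer.encode("reference.wav")     # discrete codes for the clip
audio = tokenizer.decode(vq_code.unsqueeze(0))  # back to a waveform (numpy array); batch dim assumed
```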
higgs_audio/audio_processing/quantization/__init__.py DELETED
@@ -1,8 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- # flake8: noqa
8
- from .vq import QuantizedResult, ResidualVectorQuantizer
higgs_audio/audio_processing/quantization/ac.py DELETED
@@ -1,301 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """Arithmetic coder."""
8
-
9
- import io
10
- import math
11
- import random
12
- import typing as tp
13
- import torch
14
-
15
- from ..binary import BitPacker, BitUnpacker
16
-
17
-
18
- def build_stable_quantized_cdf(
19
- pdf: torch.Tensor,
20
- total_range_bits: int,
21
- roundoff: float = 1e-8,
22
- min_range: int = 2,
23
- check: bool = True,
24
- ) -> torch.Tensor:
25
- """Turn the given PDF into a quantized CDF that splits
26
- [0, 2 ** total_range_bits - 1] into chunks of size roughly proportional
27
- to the PDF.
28
-
29
- Args:
30
- pdf (torch.Tensor): probability distribution, shape should be `[N]`.
31
- total_range_bits (int): see `ArithmeticCoder`, the typical range we expect
32
- during the coding process is `[0, 2 ** total_range_bits - 1]`.
33
- roundoff (float): will round the pdf up to that level to remove difference coming
34
- from e.g. evaluating the Language Model on different architectures.
35
- min_range (int): minimum range width. Should always be at least 2 for numerical
36
- stability. Use this to avoid pathological behavior if a value
37
- that is expected to be rare actually happens in real life.
38
- check (bool): if True, checks that nothing bad happened, can be deactivated for speed.
39
- """
40
- pdf = pdf.detach()
41
- if roundoff:
42
- pdf = (pdf / roundoff).floor() * roundoff
43
- # interpolate with uniform distribution to achieve desired minimum probability.
44
- total_range = 2**total_range_bits
45
- cardinality = len(pdf)
46
- alpha = min_range * cardinality / total_range
47
- assert alpha <= 1, "you must reduce min_range"
48
- ranges = (((1 - alpha) * total_range) * pdf).floor().long()
49
- ranges += min_range
50
- quantized_cdf = torch.cumsum(ranges, dim=-1)
51
- if min_range < 2:
52
- raise ValueError("min_range must be at least 2.")
53
- if check:
54
- assert quantized_cdf[-1] <= 2**total_range_bits, quantized_cdf[-1]
55
- if ((quantized_cdf[1:] - quantized_cdf[:-1]) < min_range).any() or quantized_cdf[0] < min_range:
56
- raise ValueError("You must increase your total_range_bits.")
57
- return quantized_cdf
58
-
59
-
60
- class ArithmeticCoder:
61
- """ArithmeticCoder,
62
- Let us take a distribution `p` over `N` symbols, and assume we have a stream
63
- of random variables `s_t` sampled from `p`. Let us assume that we have a budget
64
- of `B` bits that we can afford to write on device. There are `2**B` possible numbers,
65
- corresponding to the range `[0, 2 ** B - 1]`. We can map each of those number to a single
66
- sequence `(s_t)` by doing the following:
67
-
68
- 1) Initialize the current range to `[0, 2 ** B - 1]`.
69
- 2) For each time step t, split the current range into contiguous chunks,
70
- one for each possible outcome, with size roughly proportional to `p`.
71
- For instance, if `p = [0.75, 0.25]`, and the range is `[0, 3]`, the chunks
72
- would be `{[0, 2], [3, 3]}`.
73
- 3) Select the chunk corresponding to `s_t`, and replace the current range with this.
74
- 4) When done encoding all the values, just select any value remaining in the range.
75
-
76
- You will notice that this procedure can fail: for instance if at any point in time
77
- the range is smaller than `N`, then we can no longer assign a non-empty chunk to each
78
- possible outcome. Intuitively, the more likely a value is, the less the range width
79
- will reduce, and the longer we can go on encoding values. This makes sense: for any efficient
80
- coding scheme, likely outcomes would take less bits, and more of them can be coded
81
- with a fixed budget.
82
-
83
- In practice, we do not know `B` ahead of time, but we have a way to inject new bits
84
- when the current range decreases below a given limit (given by `total_range_bits`), without
85
- having to redo all the computations. If we encode mostly likely values, we will seldom
86
- need to inject new bits, but a single rare value can deplete our stock of entropy!
87
-
88
- In this explanation, we assumed that the distribution `p` was constant. In fact, the present
89
- code works for any sequence `(p_t)` possibly different for each timestep.
90
- We also assume that `s_t ~ p_t`, but that doesn't need to be true, although the smaller
91
- the KL between the true distribution and `p_t`, the more efficient the coding will be.
92
-
93
- Args:
94
- fo (IO[bytes]): file-like object to which the bytes will be written to.
95
- total_range_bits (int): the range `M` described above is `2 ** total_range_bits.
96
- Any time the current range width fall under this limit, new bits will
97
- be injected to rescale the initial range.
98
- """
99
-
100
- def __init__(self, fo: tp.IO[bytes], total_range_bits: int = 24):
101
- assert total_range_bits <= 30
102
- self.total_range_bits = total_range_bits
103
- self.packer = BitPacker(bits=1, fo=fo) # we push single bits at a time.
104
- self.low: int = 0
105
- self.high: int = 0
106
- self.max_bit: int = -1
107
- self._dbg: tp.List[tp.Any] = []
108
- self._dbg2: tp.List[tp.Any] = []
109
-
110
- @property
111
- def delta(self) -> int:
112
- """Return the current range width."""
113
- return self.high - self.low + 1
114
-
115
- def _flush_common_prefix(self):
116
- # If self.low and self.high start with the same bits,
117
- # those won't change anymore as we always just increase the range
118
- # by powers of 2, and we can flush them out to the bit stream.
119
- assert self.high >= self.low, (self.low, self.high)
120
- assert self.high < 2 ** (self.max_bit + 1)
121
- while self.max_bit >= 0:
122
- b1 = self.low >> self.max_bit
123
- b2 = self.high >> self.max_bit
124
- if b1 == b2:
125
- self.low -= b1 << self.max_bit
126
- self.high -= b1 << self.max_bit
127
- assert self.high >= self.low, (self.high, self.low, self.max_bit)
128
- assert self.low >= 0
129
- self.max_bit -= 1
130
- self.packer.push(b1)
131
- else:
132
- break
133
-
134
- def push(self, symbol: int, quantized_cdf: torch.Tensor):
135
- """Push the given symbol on the stream, flushing out bits
136
- if possible.
137
-
138
- Args:
139
- symbol (int): symbol to encode with the AC.
140
- quantized_cdf (torch.Tensor): use `build_stable_quantized_cdf`
141
- to build this from your pdf estimate.
142
- """
143
- while self.delta < 2**self.total_range_bits:
144
- self.low *= 2
145
- self.high = self.high * 2 + 1
146
- self.max_bit += 1
147
-
148
- range_low = 0 if symbol == 0 else quantized_cdf[symbol - 1].item()
149
- range_high = quantized_cdf[symbol].item() - 1
150
- effective_low = int(math.ceil(range_low * (self.delta / (2**self.total_range_bits))))
151
- effective_high = int(math.floor(range_high * (self.delta / (2**self.total_range_bits))))
152
- assert self.low <= self.high
153
- self.high = self.low + effective_high
154
- self.low = self.low + effective_low
155
- assert self.low <= self.high, (
156
- effective_low,
157
- effective_high,
158
- range_low,
159
- range_high,
160
- )
161
- self._dbg.append((self.low, self.high))
162
- self._dbg2.append((self.low, self.high))
163
- outs = self._flush_common_prefix()
164
- assert self.low <= self.high
165
- assert self.max_bit >= -1
166
- assert self.max_bit <= 61, self.max_bit
167
- return outs
168
-
169
- def flush(self):
170
- """Flush the remaining information to the stream."""
171
- while self.max_bit >= 0:
172
- b1 = (self.low >> self.max_bit) & 1
173
- self.packer.push(b1)
174
- self.max_bit -= 1
175
- self.packer.flush()
176
-
177
-
178
- class ArithmeticDecoder:
179
- """ArithmeticDecoder, see `ArithmeticCoder` for a detailed explanation.
180
-
181
- Note that this must be called with **exactly** the same parameters and sequence
182
- of quantized cdf as the arithmetic encoder or the wrong values will be decoded.
183
-
184
- If the AC encoder current range is [L, H], with `L` and `H` having the same common
185
- prefix (i.e. the same most significant bits), then this prefix will be flushed to the stream.
186
- For instance, having read 3 bits `b1 b2 b3`, we know that `[L, H]` is contained inside
187
- `[b1 b2 b3 0 ... 0 b1 b2 b3 1 ... 1]`. Now this specific sub-range can only be obtained
188
- for a specific sequence of symbols and a binary-search allows us to decode those symbols.
189
- At some point, the prefix `b1 b2 b3` will no longer be sufficient to decode new symbols,
190
- and we will need to read new bits from the stream and repeat the process.
191
-
192
- """
193
-
194
- def __init__(self, fo: tp.IO[bytes], total_range_bits: int = 24):
195
- self.total_range_bits = total_range_bits
196
- self.low: int = 0
197
- self.high: int = 0
198
- self.current: int = 0
199
- self.max_bit: int = -1
200
- self.unpacker = BitUnpacker(bits=1, fo=fo) # we pull single bits at a time.
201
- # Following is for debugging
202
- self._dbg: tp.List[tp.Any] = []
203
- self._dbg2: tp.List[tp.Any] = []
204
- self._last: tp.Any = None
205
-
206
- @property
207
- def delta(self) -> int:
208
- return self.high - self.low + 1
209
-
210
- def _flush_common_prefix(self):
211
- # Given the current range [L, H], if both have a common prefix,
212
- # we know we can remove it from our representation to avoid handling large numbers.
213
- while self.max_bit >= 0:
214
- b1 = self.low >> self.max_bit
215
- b2 = self.high >> self.max_bit
216
- if b1 == b2:
217
- self.low -= b1 << self.max_bit
218
- self.high -= b1 << self.max_bit
219
- self.current -= b1 << self.max_bit
220
- assert self.high >= self.low
221
- assert self.low >= 0
222
- self.max_bit -= 1
223
- else:
224
- break
225
-
226
- def pull(self, quantized_cdf: torch.Tensor) -> tp.Optional[int]:
227
- """Pull a symbol, reading as many bits from the stream as required.
228
- This returns `None` when the stream has been exhausted.
229
-
230
- Args:
231
- quantized_cdf (torch.Tensor): use `build_stable_quantized_cdf`
232
- to build this from your pdf estimate. This must be **exactly**
233
- the same cdf as the one used at encoding time.
234
- """
235
- while self.delta < 2**self.total_range_bits:
236
- bit = self.unpacker.pull()
237
- if bit is None:
238
- return None
239
- self.low *= 2
240
- self.high = self.high * 2 + 1
241
- self.current = self.current * 2 + bit
242
- self.max_bit += 1
243
-
244
- def bin_search(low_idx: int, high_idx: int):
245
- # Binary search is not just for coding interviews :)
246
- if high_idx < low_idx:
247
- raise RuntimeError("Binary search failed")
248
- mid = (low_idx + high_idx) // 2
249
- range_low = quantized_cdf[mid - 1].item() if mid > 0 else 0
250
- range_high = quantized_cdf[mid].item() - 1
251
- effective_low = int(math.ceil(range_low * (self.delta / (2**self.total_range_bits))))
252
- effective_high = int(math.floor(range_high * (self.delta / (2**self.total_range_bits))))
253
- low = effective_low + self.low
254
- high = effective_high + self.low
255
- if self.current >= low:
256
- if self.current <= high:
257
- return (mid, low, high, self.current)
258
- else:
259
- return bin_search(mid + 1, high_idx)
260
- else:
261
- return bin_search(low_idx, mid - 1)
262
-
263
- self._last = (self.low, self.high, self.current, self.max_bit)
264
- sym, self.low, self.high, self.current = bin_search(0, len(quantized_cdf) - 1)
265
- self._dbg.append((self.low, self.high, self.current))
266
- self._flush_common_prefix()
267
- self._dbg2.append((self.low, self.high, self.current))
268
-
269
- return sym
270
-
271
-
272
- def test():
273
- torch.manual_seed(1234)
274
- random.seed(1234)
275
- for _ in range(4):
276
- pdfs = []
277
- cardinality = random.randrange(4000)
278
- steps = random.randrange(100, 500)
279
- fo = io.BytesIO()
280
- encoder = ArithmeticCoder(fo)
281
- symbols = []
282
- for step in range(steps):
283
- pdf = torch.softmax(torch.randn(cardinality), dim=0)
284
- pdfs.append(pdf)
285
- q_cdf = build_stable_quantized_cdf(pdf, encoder.total_range_bits)
286
- symbol = torch.multinomial(pdf, 1).item()
287
- symbols.append(symbol)
288
- encoder.push(symbol, q_cdf)
289
- encoder.flush()
290
-
291
- fo.seek(0)
292
- decoder = ArithmeticDecoder(fo)
293
- for idx, (pdf, symbol) in enumerate(zip(pdfs, symbols)):
294
- q_cdf = build_stable_quantized_cdf(pdf, encoder.total_range_bits)
295
- decoded_symbol = decoder.pull(q_cdf)
296
- assert decoded_symbol == symbol, idx
297
- assert decoder.pull(torch.zeros(1)) is None
298
-
299
-
300
- if __name__ == "__main__":
301
- test()
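The deleted `test()` above already exercises the coder with random distributions; the following smaller, fixed-distribution sketch uses the same classes and the same `build_stable_quantized_cdf` helper to make the round-trip contract explicit: the decoder must be fed exactly the same quantized CDFs, in the same order, as the encoder.

import io
import torch

pdf = torch.tensor([0.75, 0.20, 0.05])               # toy distribution over 3 symbols
fo = io.BytesIO()
enc = ArithmeticCoder(fo)
cdf = build_stable_quantized_cdf(pdf, enc.total_range_bits)
for s in [0, 0, 1, 2, 0]:
    enc.push(s, cdf)                                  # pdf is constant here, so one cdf is reused
enc.flush()

fo.seek(0)
dec = ArithmeticDecoder(fo)
decoded = [dec.pull(cdf) for _ in range(5)]
assert decoded == [0, 0, 1, 2, 0]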
 
higgs_audio/audio_processing/quantization/core_vq.py DELETED
@@ -1,360 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
- #
7
- # This implementation is inspired from
8
- # https://github.com/lucidrains/vector-quantize-pytorch
9
- # which is released under MIT License. Hereafter, the original license:
10
- # MIT License
11
- #
12
- # Copyright (c) 2020 Phil Wang
13
- #
14
- # Permission is hereby granted, free of charge, to any person obtaining a copy
15
- # of this software and associated documentation files (the "Software"), to deal
16
- # in the Software without restriction, including without limitation the rights
17
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18
- # copies of the Software, and to permit persons to whom the Software is
19
- # furnished to do so, subject to the following conditions:
20
- #
21
- # The above copyright notice and this permission notice shall be included in all
22
- # copies or substantial portions of the Software.
23
- #
24
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30
- # SOFTWARE.
31
-
32
- """Core vector quantization implementation."""
33
-
34
- import typing as tp
35
-
36
- from einops import rearrange, repeat
37
- import torch
38
- from torch import nn
39
- import torch.nn.functional as F
40
-
41
- from xcodec.quantization.distrib import broadcast_tensors, rank
42
-
43
-
44
- def default(val: tp.Any, d: tp.Any) -> tp.Any:
45
- return val if val is not None else d
46
-
47
-
48
- def ema_inplace(moving_avg, new, decay: float):
49
- moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
50
-
51
-
52
- def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
53
- return (x + epsilon) / (x.sum() + n_categories * epsilon)
54
-
55
-
56
- def uniform_init(*shape: int):
57
- t = torch.empty(shape)
58
- nn.init.kaiming_uniform_(t)
59
- return t
60
-
61
-
62
- def sample_vectors(samples, num: int):
63
- num_samples, device = samples.shape[0], samples.device
64
-
65
- if num_samples >= num:
66
- indices = torch.randperm(num_samples, device=device)[:num]
67
- else:
68
- indices = torch.randint(0, num_samples, (num,), device=device)
69
-
70
- return samples[indices]
71
-
72
-
73
- def kmeans(samples, num_clusters: int, num_iters: int = 10):
74
- dim, dtype = samples.shape[-1], samples.dtype
75
-
76
- means = sample_vectors(samples, num_clusters)
77
-
78
- for _ in range(num_iters):
79
- diffs = rearrange(samples, "n d -> n () d") - rearrange(means, "c d -> () c d")
80
- dists = -(diffs**2).sum(dim=-1)
81
-
82
- buckets = dists.max(dim=-1).indices
83
- bins = torch.bincount(buckets, minlength=num_clusters)
84
- zero_mask = bins == 0
85
- bins_min_clamped = bins.masked_fill(zero_mask, 1)
86
-
87
- new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
88
- new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
89
- new_means = new_means / bins_min_clamped[..., None]
90
-
91
- means = torch.where(zero_mask[..., None], means, new_means)
92
-
93
- return means, bins
94
-
95
-
96
- class EuclideanCodebook(nn.Module):
97
- """Codebook with Euclidean distance.
98
- Args:
99
- dim (int): Dimension.
100
- codebook_size (int): Codebook size.
101
- kmeans_init (bool): Whether to use k-means to initialize the codebooks.
102
- If set to true, run the k-means algorithm on the first training batch and use
103
- the learned centroids as initialization.
104
- kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
105
- decay (float): Decay for exponential moving average over the codebooks.
106
- epsilon (float): Epsilon value for numerical stability.
107
- threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
108
- that have an exponential moving average cluster size less than the specified threshold with
109
- randomly selected vector from the current batch.
110
- """
111
-
112
- def __init__(
113
- self,
114
- dim: int,
115
- codebook_size: int,
116
- kmeans_init: int = False,
117
- kmeans_iters: int = 10,
118
- decay: float = 0.99,
119
- epsilon: float = 1e-5,
120
- threshold_ema_dead_code: int = 2,
121
- ):
122
- super().__init__()
123
- self.decay = decay
124
- init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros
125
- embed = init_fn(codebook_size, dim)
126
-
127
- self.codebook_size = codebook_size
128
-
129
- self.kmeans_iters = kmeans_iters
130
- self.epsilon = epsilon
131
- self.threshold_ema_dead_code = threshold_ema_dead_code
132
-
133
- self.register_buffer("inited", torch.Tensor([not kmeans_init]))
134
- self.register_buffer("cluster_size", torch.zeros(codebook_size))
135
- self.register_buffer("embed", embed)
136
- self.register_buffer("embed_avg", embed.clone())
137
-
138
- @torch.jit.ignore
139
- def init_embed_(self, data):
140
- if self.inited:
141
- return
142
-
143
- embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
144
- self.embed.data.copy_(embed)
145
- self.embed_avg.data.copy_(embed.clone())
146
- self.cluster_size.data.copy_(cluster_size)
147
- self.inited.data.copy_(torch.Tensor([True]))
148
- # Make sure all buffers across workers are in sync after initialization
149
- broadcast_tensors(self.buffers())
150
-
151
- def replace_(self, samples, mask):
152
- modified_codebook = torch.where(mask[..., None], sample_vectors(samples, self.codebook_size), self.embed)
153
- self.embed.data.copy_(modified_codebook)
154
-
155
- def expire_codes_(self, batch_samples):
156
- if self.threshold_ema_dead_code == 0:
157
- return
158
-
159
- expired_codes = self.cluster_size < self.threshold_ema_dead_code
160
- if not torch.any(expired_codes):
161
- return
162
-
163
- batch_samples = rearrange(batch_samples, "... d -> (...) d")
164
- self.replace_(batch_samples, mask=expired_codes)
165
- broadcast_tensors(self.buffers())
166
-
167
- def preprocess(self, x):
168
- x = rearrange(x, "... d -> (...) d")
169
- return x
170
-
171
- def quantize(self, x):
172
- embed = self.embed.t()
173
- dist = -(x.pow(2).sum(1, keepdim=True) - 2 * x @ embed + embed.pow(2).sum(0, keepdim=True))
174
- embed_ind = dist.max(dim=-1).indices
175
- return embed_ind
176
-
177
- def postprocess_emb(self, embed_ind, shape):
178
- return embed_ind.view(*shape[:-1])
179
-
180
- def dequantize(self, embed_ind):
181
- quantize = F.embedding(embed_ind, self.embed) # get embedding based on index
182
- return quantize
183
-
184
- def encode(self, x):
185
- shape = x.shape
186
- # pre-process
187
- x = self.preprocess(x)
188
- # quantize
189
- embed_ind = self.quantize(x) # get index based on Euclidean distance
190
- # post-process
191
- embed_ind = self.postprocess_emb(embed_ind, shape)
192
- return embed_ind
193
-
194
- def decode(self, embed_ind):
195
- quantize = self.dequantize(embed_ind)
196
- return quantize
197
-
198
- def forward(self, x):
199
- shape, dtype = x.shape, x.dtype
200
- x = self.preprocess(x)
201
-
202
- self.init_embed_(x)
203
-
204
- embed_ind = self.quantize(x)
205
- embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
206
- embed_ind = self.postprocess_emb(embed_ind, shape)
207
- quantize = self.dequantize(embed_ind)
208
-
209
- if self.training:
210
- # We do the expiry of code at that point as buffers are in sync
211
- # and all the workers will take the same decision.
212
- self.expire_codes_(x)
213
- ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
214
- embed_sum = x.t() @ embed_onehot
215
- ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
216
- cluster_size = (
217
- laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon) * self.cluster_size.sum()
218
- )
219
- embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
220
- self.embed.data.copy_(embed_normalized)
221
-
222
- return quantize, embed_ind
223
-
224
-
225
- class VectorQuantization(nn.Module):
226
- """Vector quantization implementation.
227
- Currently supports only euclidean distance.
228
- Args:
229
- dim (int): Dimension
230
- codebook_size (int): Codebook size
231
- codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
232
- decay (float): Decay for exponential moving average over the codebooks.
233
- epsilon (float): Epsilon value for numerical stability.
234
- kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
235
- kmeans_iters (int): Number of iterations used for kmeans initialization.
236
- threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
237
- that have an exponential moving average cluster size less than the specified threshold with
238
- randomly selected vector from the current batch.
239
- commitment_weight (float): Weight for commitment loss.
240
- """
241
-
242
- def __init__(
243
- self,
244
- dim: int,
245
- codebook_size: int,
246
- codebook_dim: tp.Optional[int] = None,
247
- decay: float = 0.99,
248
- epsilon: float = 1e-5,
249
- kmeans_init: bool = True,
250
- kmeans_iters: int = 50,
251
- threshold_ema_dead_code: int = 2,
252
- commitment_weight: float = 1.0,
253
- ):
254
- super().__init__()
255
- _codebook_dim: int = default(codebook_dim, dim)
256
-
257
- requires_projection = _codebook_dim != dim
258
- self.project_in = nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity()
259
- self.project_out = nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity()
260
-
261
- self.epsilon = epsilon
262
- self.commitment_weight = commitment_weight
263
-
264
- self._codebook = EuclideanCodebook(
265
- dim=_codebook_dim,
266
- codebook_size=codebook_size,
267
- kmeans_init=kmeans_init,
268
- kmeans_iters=kmeans_iters,
269
- decay=decay,
270
- epsilon=epsilon,
271
- threshold_ema_dead_code=threshold_ema_dead_code,
272
- )
273
- self.codebook_size = codebook_size
274
-
275
- @property
276
- def codebook(self):
277
- return self._codebook.embed
278
-
279
- def encode(self, x):
280
- x = rearrange(x, "b d n -> b n d")
281
- x = self.project_in(x)
282
- embed_in = self._codebook.encode(x)
283
- return embed_in
284
-
285
- def decode(self, embed_ind):
286
- quantize = self._codebook.decode(embed_ind)
287
- quantize = self.project_out(quantize)
288
- quantize = rearrange(quantize, "b n d -> b d n")
289
- return quantize
290
-
291
- def forward(self, x):
292
- device = x.device
293
- x = rearrange(x, "b d n -> b n d")
294
- x = self.project_in(x)
295
-
296
- quantize, embed_ind = self._codebook(x)
297
-
298
- if self.training:
299
- quantize = x + (quantize - x).detach()
300
-
301
- loss = torch.tensor([0.0], device=device, requires_grad=self.training)
302
-
303
- if self.training:
304
- if self.commitment_weight > 0:
305
- commit_loss = F.mse_loss(quantize.detach(), x)
306
- loss = loss + commit_loss * self.commitment_weight
307
-
308
- quantize = self.project_out(quantize)
309
- quantize = rearrange(quantize, "b n d -> b d n")
310
- return quantize, embed_ind, loss
311
-
312
-
313
- class ResidualVectorQuantization(nn.Module):
314
- """Residual vector quantization implementation.
315
- Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
316
- """
317
-
318
- def __init__(self, *, num_quantizers, **kwargs):
319
- super().__init__()
320
- self.layers = nn.ModuleList([VectorQuantization(**kwargs) for _ in range(num_quantizers)])
321
-
322
- def forward(self, x, n_q: tp.Optional[int] = None):
323
- quantized_out = 0.0
324
- residual = x
325
-
326
- all_losses = []
327
- all_indices = []
328
-
329
- n_q = n_q or len(self.layers)
330
-
331
- for layer in self.layers[:n_q]:
332
- quantized, indices, loss = layer(residual)
333
- residual = residual - quantized
334
- quantized_out = quantized_out + quantized
335
-
336
- all_indices.append(indices)
337
- all_losses.append(loss)
338
-
339
- out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
340
- return quantized_out, out_indices, out_losses
341
-
342
- def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
343
- residual = x
344
- all_indices = []
345
- n_q = n_q or len(self.layers)
346
- for layer in self.layers[:n_q]:
347
- indices = layer.encode(residual)
348
- quantized = layer.decode(indices)
349
- residual = residual - quantized
350
- all_indices.append(indices)
351
- out_indices = torch.stack(all_indices)
352
- return out_indices
353
-
354
- def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
355
- quantized_out = torch.tensor(0.0, device=q_indices.device)
356
- for i, indices in enumerate(q_indices):
357
- layer = self.layers[i]
358
- quantized = layer.decode(indices)
359
- quantized_out = quantized_out + quantized
360
- return quantized_out
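The training branch in `VectorQuantization.forward` above combines a straight-through estimator with a commitment loss. Below is a self-contained sketch of that pattern, not tied to these classes, using a fixed toy codebook for illustration: the forward pass sees the quantized vectors, the backward pass treats quantization as the identity, and the commitment term pulls encoder outputs toward the detached codewords.

import torch
import torch.nn.functional as F

x = torch.randn(4, 16, requires_grad=True)    # stand-in for encoder outputs
codebook = torch.randn(512, 16)               # toy codebook, frozen for illustration
idx = torch.cdist(x, codebook).argmin(dim=1)  # nearest codeword per vector
q = codebook[idx]                             # quantized vectors; no gradient path back to x

q_st = x + (q - x).detach()                   # straight-through: forward uses q, backward acts as identity
commit = F.mse_loss(q.detach(), x)            # commitment term (weight 1.0 by default above)
(q_st.sum() + commit).backward()              # x receives gradients from both terms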
 
higgs_audio/audio_processing/quantization/core_vq_lsx_version.py DELETED
@@ -1,431 +0,0 @@
1
- # Copyright (c)
2
- #
3
- # This source code is licensed under the license found in the
4
- # LICENSE file in the root directory of this source tree.
5
- # This implementation is inspired from
6
- # https://github.com/rosinality/vq-vae-2-pytorch/blob/master/vqvae.py and
7
- # https://github.com/clementchadebec/benchmark_VAE/blob/dfa0dcf6c79172df5d27769c09c860c42008baaa/src/pythae/models/vq_vae/vq_vae_utils.py#L81
8
- #
9
- # Copyright (c) Meta Platforms, Inc. and affiliates.
10
- # All rights reserved.
11
- #
12
- # This source code is licensed under the license found in the
13
- # LICENSE file in the root directory of this source tree.
14
- #
15
- # This implementation is inspired from
16
- # https://github.com/lucidrains/vector-quantize-pytorch
17
- # which is released under MIT License. Hereafter, the original license:
18
- # MIT License
19
- #
20
- # Copyright (c) 2020 Phil Wang
21
- #
22
- # Permission is hereby granted, free of charge, to any person obtaining a copy
23
- # of this software and associated documentation files (the "Software"), to deal
24
- # in the Software without restriction, including without limitation the rights
25
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
26
- # copies of the Software, and to permit persons to whom the Software is
27
- # furnished to do so, subject to the following conditions:
28
- #
29
- # The above copyright notice and this permission notice shall be included in all
30
- # copies or substantial portions of the Software.
31
- #
32
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
35
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
37
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
38
- # SOFTWARE.
39
-
40
- """Core vector quantization implementation."""
41
-
42
- import typing as tp
43
-
44
- from einops import rearrange
45
- import torch
46
- from torch import nn
47
- import torch.nn.functional as F
48
- import torch.distributed as dist
49
-
50
- from .distrib import broadcast_tensors, is_distributed
51
- from .ddp_utils import SyncFunction
52
-
53
-
54
- def default(val: tp.Any, d: tp.Any) -> tp.Any:
55
- return val if val is not None else d
56
-
57
-
58
- def ema_inplace(moving_avg, new, decay: float):
59
- moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
60
-
61
-
62
- def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
63
- return (x + epsilon) / (x.sum() + n_categories * epsilon)
64
-
65
-
66
- def uniform_init(*shape: int):
67
- t = torch.empty(shape)
68
- nn.init.kaiming_uniform_(t)
69
- return t
70
-
71
-
72
- def sample_vectors(samples, num: int):
73
- num_samples, device = samples.shape[0], samples.device
74
-
75
- if num_samples >= num:
76
- indices = torch.randperm(num_samples, device=device)[:num]
77
- else:
78
- indices = torch.randint(0, num_samples, (num,), device=device)
79
-
80
- return samples[indices]
81
-
82
-
83
- def kmeans(
84
- samples,
85
- num_clusters: int,
86
- num_iters: int = 10,
87
- frames_to_use: int = 10_000,
88
- batch_size: int = 64,
89
- ):
90
- """
91
- Memory-efficient K-means clustering.
92
- Args:
93
- samples (tensor): shape [N, D]
94
- num_clusters (int): number of centroids.
95
- num_iters (int): number of iterations.
96
- frames_to_use (int): subsample size from total samples.
97
- batch_size (int): batch size used in distance computation.
98
- Returns:
99
- means: [num_clusters, D]
100
- bins: [num_clusters] (number of points per cluster)
101
- """
102
- N, D = samples.shape
103
- dtype, device = samples.dtype, samples.device
104
-
105
- if frames_to_use < N:
106
- indices = torch.randperm(N, device=device)[:frames_to_use]
107
- samples = samples[indices]
108
-
109
- means = sample_vectors(samples, num_clusters)
110
-
111
- for _ in range(num_iters):
112
- # Store cluster assignments
113
- all_assignments = []
114
-
115
- for i in range(0, samples.shape[0], batch_size):
116
- batch = samples[i : i + batch_size] # [B, D]
117
- dists = torch.cdist(batch, means, p=2) # [B, C]
118
- assignments = dists.argmin(dim=1) # [B]
119
- all_assignments.append(assignments)
120
-
121
- buckets = torch.cat(all_assignments, dim=0) # [N]
122
- bins = torch.bincount(buckets, minlength=num_clusters)
123
- zero_mask = bins == 0
124
- bins_min_clamped = bins.masked_fill(zero_mask, 1)
125
-
126
- # Compute new means
127
- new_means = torch.zeros_like(means)
128
- for i in range(num_clusters):
129
- mask = buckets == i
130
- if mask.any():
131
- new_means[i] = samples[mask].mean(dim=0)
132
-
133
- means = torch.where(zero_mask[:, None], means, new_means)
134
-
135
- return means, bins
136
-
137
-
138
- class EuclideanCodebook(nn.Module):
139
- """Codebook with Euclidean distance.
140
- Args:
141
- dim (int): Dimension.
142
- codebook_size (int): Codebook size.
143
- kmeans_init (bool): Whether to use k-means to initialize the codebooks.
144
- If set to true, run the k-means algorithm on the first training batch and use
145
- the learned centroids as initialization.
146
- kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
147
- decay (float): Decay for exponential moving average over the codebooks.
148
- epsilon (float): Epsilon value for numerical stability.
149
- threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
150
- that have an exponential moving average cluster size less than the specified threshold with
151
- randomly selected vector from the current batch.
152
- """
153
-
154
- def __init__(
155
- self,
156
- dim: int,
157
- codebook_size: int,
158
- kmeans_init: bool = False,
159
- kmeans_iters: int = 10,
160
- decay: float = 0.99,
161
- epsilon: float = 1e-5,
162
- threshold_ema_dead_code: int = 2,
163
- ):
164
- super().__init__()
165
- self.decay = decay
166
- init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros
167
- embed = init_fn(codebook_size, dim)
168
-
169
- self.codebook_size = codebook_size
170
-
171
- self.kmeans_iters = kmeans_iters
172
- self.epsilon = epsilon
173
- self.threshold_ema_dead_code = threshold_ema_dead_code
174
-
175
- # Flag variable to indicate whether the codebook is initialized
176
- self.register_buffer("inited", torch.Tensor([not kmeans_init]))
177
- # Running EMA cluster size/count: N_i^t in eq. (6) in vqvae paper
178
- self.register_buffer("cluster_size", torch.zeros(codebook_size))
179
- # Codebook
180
- self.register_buffer("embed", embed)
181
- # EMA codebook: eq. (7) in vqvae paper
182
- self.register_buffer("embed_avg", embed.clone())
183
-
184
- @torch.jit.ignore
185
- def init_embed_(self, data):
186
- """Initialize codebook.
187
- Args:
188
- data (tensor): [B * T, D].
189
- """
190
- if self.inited:
191
- return
192
-
193
- ## NOTE (snippet added by Songxiang Liu): gather data from all gpus
194
- if dist.is_available() and dist.is_initialized():
195
- # [B * T * world_size, D]
196
- data = SyncFunction.apply(data)
197
-
198
- embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
199
- self.embed.data.copy_(embed)
200
- self.embed_avg.data.copy_(embed.clone())
201
- self.cluster_size.data.copy_(cluster_size)
202
- self.inited.data.copy_(torch.Tensor([True]))
203
- # Make sure all buffers across workers are in sync after initialization
204
- broadcast_tensors(self.buffers())
205
-
206
- def replace_(self, samples, mask):
207
- modified_codebook = torch.where(mask[..., None], sample_vectors(samples, self.codebook_size), self.embed)
208
- self.embed.data.copy_(modified_codebook)
209
-
210
- def expire_codes_(self, batch_samples):
211
- if self.threshold_ema_dead_code == 0:
212
- return
213
-
214
- expired_codes = self.cluster_size < self.threshold_ema_dead_code
215
- if not torch.any(expired_codes):
216
- return
217
-
218
- ## NOTE (snippet added by Songxiang Liu): gather data from all gpus
219
- if is_distributed():
220
- # [B * T * world_size, D]
221
- batch_samples = SyncFunction.apply(batch_samples)
222
-
223
- batch_samples = rearrange(batch_samples, "... d -> (...) d")
224
- self.replace_(batch_samples, mask=expired_codes)
225
- broadcast_tensors(self.buffers())
226
-
227
- def preprocess(self, x):
228
- x = rearrange(x, "... d -> (...) d")
229
- return x
230
-
231
- def quantize(self, x):
232
- embed = self.embed.t()
233
- dist = -(x.pow(2).sum(1, keepdim=True) - 2 * x @ embed + embed.pow(2).sum(0, keepdim=True))
234
- embed_ind = dist.max(dim=-1).indices
235
- return embed_ind
236
-
237
- def postprocess_emb(self, embed_ind, shape):
238
- return embed_ind.view(*shape[:-1])
239
-
240
- def dequantize(self, embed_ind):
241
- quantize = F.embedding(embed_ind, self.embed)
242
- return quantize
243
-
244
- def encode(self, x):
245
- shape = x.shape
246
- # pre-process
247
- x = self.preprocess(x) # [B, T, D] -> [B*T, D]
248
- # quantize
249
- embed_ind = self.quantize(x)
250
- # post-process
251
- embed_ind = self.postprocess_emb(embed_ind, shape)
252
- return embed_ind
253
-
254
- def decode(self, embed_ind):
255
- quantize = self.dequantize(embed_ind)
256
- return quantize
257
-
258
- def forward(self, x):
259
- # shape: [B, T, D]
260
- shape, dtype = x.shape, x.dtype
261
- x = self.preprocess(x) # [B, T, D] -> [B*T, D]
262
-
263
- # Initialize codebook
264
- self.init_embed_(x)
265
-
266
- embed_ind = self.quantize(x) # [B*T,]
267
- embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype) # [B*T, cb-size]
268
- embed_ind = self.postprocess_emb(embed_ind, shape) # [B, T]
269
- quantize = self.dequantize(embed_ind) # [B, T, D]
270
-
271
- if self.training:
272
- ### Update codebook by EMA
273
- embed_onehot_sum = embed_onehot.sum(0) # [cb-size,]
274
- embed_sum = x.t() @ embed_onehot # [D, cb-size]
275
- if is_distributed():
276
- dist.all_reduce(embed_onehot_sum)
277
- dist.all_reduce(embed_sum)
278
- # Update ema cluster count N_i^t, eq. (6) in vqvae paper
279
- self.cluster_size.data.mul_(self.decay).add_(embed_onehot_sum, alpha=1 - self.decay)
280
- # Update ema embed: eq. (7) in vqvae paper
281
- self.embed_avg.data.mul_(self.decay).add_(embed_sum.t(), alpha=1 - self.decay)
282
- # apply laplace smoothing
283
- n = self.cluster_size.sum()
284
- cluster_size = (self.cluster_size + self.epsilon) / (n + self.codebook_size * self.epsilon) * n
285
- # Update ema embed: eq. (8) in vqvae paper
286
- embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
287
- self.embed.data.copy_(embed_normalized)
288
-
289
- # We do the expiry of code at that point as buffers are in sync
290
- # and all the workers will take the same decision.
291
- self.expire_codes_(x)
292
-
293
- return quantize, embed_ind
294
-
295
-
296
- class VectorQuantization(nn.Module):
297
- """Vector quantization implementation.
298
- Currently supports only euclidean distance.
299
- Args:
300
- dim (int): Dimension
301
- codebook_size (int): Codebook size
302
- codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
303
- decay (float): Decay for exponential moving average over the codebooks.
304
- epsilon (float): Epsilon value for numerical stability.
305
- kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
306
- kmeans_iters (int): Number of iterations used for kmeans initialization.
307
- threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
308
- that have an exponential moving average cluster size less than the specified threshold with
309
- randomly selected vector from the current batch.
310
- commitment_weight (float): Weight for commitment loss.
311
- """
312
-
313
- def __init__(
314
- self,
315
- dim: int,
316
- codebook_size: int,
317
- codebook_dim: tp.Optional[int] = None,
318
- decay: float = 0.99,
319
- epsilon: float = 1e-5,
320
- kmeans_init: bool = True,
321
- kmeans_iters: int = 50,
322
- threshold_ema_dead_code: int = 2,
323
- commitment_weight: float = 1.0,
324
- ):
325
- super().__init__()
326
- _codebook_dim: int = default(codebook_dim, dim)
327
-
328
- requires_projection = _codebook_dim != dim
329
- self.project_in = nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity()
330
- self.project_out = nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity()
331
-
332
- self.epsilon = epsilon
333
- self.commitment_weight = commitment_weight
334
-
335
- self._codebook = EuclideanCodebook(
336
- dim=_codebook_dim,
337
- codebook_size=codebook_size,
338
- kmeans_init=kmeans_init,
339
- kmeans_iters=kmeans_iters,
340
- decay=decay,
341
- epsilon=epsilon,
342
- threshold_ema_dead_code=threshold_ema_dead_code,
343
- )
344
- self.codebook_size = codebook_size
345
-
346
- @property
347
- def codebook(self):
348
- return self._codebook.embed
349
-
350
- def encode(self, x):
351
- x = rearrange(x, "b d n -> b n d")
352
- x = self.project_in(x)
353
- embed_in = self._codebook.encode(x)
354
- return embed_in
355
-
356
- def decode(self, embed_ind):
357
- quantize = self._codebook.decode(embed_ind)
358
- quantize = self.project_out(quantize)
359
- quantize = rearrange(quantize, "b n d -> b d n")
360
- return quantize
361
-
362
- def forward(self, x):
363
- device = x.device
364
- x = x.transpose(1, 2).contiguous() # [b d n] -> [b n d]
365
- x = self.project_in(x)
366
-
367
- quantize, embed_ind = self._codebook(x)
368
-
369
- if self.training:
370
- quantize = x + (quantize - x).detach()
371
-
372
- loss = torch.tensor([0.0], device=device, requires_grad=self.training)
373
-
374
- if self.training:
375
- if self.commitment_weight > 0:
376
- commit_loss = F.mse_loss(quantize.detach(), x)
377
- loss = loss + commit_loss * self.commitment_weight
378
-
379
- quantize = self.project_out(quantize)
380
- quantize = quantize.transpose(1, 2).contiguous() # [b n d] -> [b d n]
381
- return quantize, embed_ind, loss
382
-
383
-
384
- class ResidualVectorQuantization(nn.Module):
385
- """Residual vector quantization implementation.
386
- Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
387
- """
388
-
389
- def __init__(self, *, num_quantizers, **kwargs):
390
- super().__init__()
391
- self.layers = nn.ModuleList([VectorQuantization(**kwargs) for _ in range(num_quantizers)])
392
-
393
- def forward(self, x, n_q: tp.Optional[int] = None):
394
- quantized_out = 0.0
395
- residual = x
396
-
397
- all_losses = []
398
- all_indices = []
399
-
400
- n_q = n_q or len(self.layers)
401
-
402
- for layer in self.layers[:n_q]:
403
- quantized, indices, loss = layer(residual)
404
- residual = residual - quantized
405
- quantized_out = quantized_out + quantized
406
-
407
- all_indices.append(indices)
408
- all_losses.append(loss)
409
-
410
- out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
411
- return quantized_out, out_indices, out_losses
412
-
413
- def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
414
- residual = x
415
- all_indices = []
416
- n_q = n_q or len(self.layers)
417
- for layer in self.layers[:n_q]:
418
- indices = layer.encode(residual)
419
- quantized = layer.decode(indices)
420
- residual = residual - quantized
421
- all_indices.append(indices)
422
- out_indices = torch.stack(all_indices)
423
- return out_indices
424
-
425
- def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
426
- quantized_out = torch.tensor(0.0, device=q_indices.device)
427
- for i, indices in enumerate(q_indices):
428
- layer = self.layers[i]
429
- quantized = layer.decode(indices)
430
- quantized_out = quantized_out + quantized
431
- return quantized_out
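Taken together, the residual loop above defines a simple shape contract, sketched below with made-up sizes (batch 2, dim 128, 50 frames, 4 quantizers). `kmeans_init=False` is passed here only so the codebooks are usable without a training step; it is not the default of the deleted code.

import torch

rvq = ResidualVectorQuantization(num_quantizers=4, dim=128, codebook_size=1024, kmeans_init=False)
x = torch.randn(2, 128, 50)        # [batch, dim, frames], the layout expected by VectorQuantization
codes = rvq.encode(x)              # [num_quantizers, batch, frames]
recon = rvq.decode(codes)          # [batch, dim, frames]: sum of the per-layer dequantizations
print(codes.shape, recon.shape)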
 
higgs_audio/audio_processing/quantization/ddp_utils.py DELETED
@@ -1,197 +0,0 @@
1
- import logging
2
- import random
3
- import subprocess
4
- from datetime import datetime
5
-
6
- import numpy as np
7
- import torch
8
- import torch.distributed as dist
9
- from torch.nn.parallel import DistributedDataParallel
10
- from torch.nn.parallel.distributed import _find_tensors
11
- import torch.optim
12
- import torch.utils.data
13
- from packaging import version
14
- from omegaconf import OmegaConf
15
-
16
-
17
- def set_random_seed(seed):
18
- random.seed(seed)
19
- np.random.seed(seed)
20
- torch.manual_seed(seed)
21
- torch.cuda.manual_seed_all(seed)
22
-
23
-
24
- def is_logging_process():
25
- return not dist.is_initialized() or dist.get_rank() == 0
26
-
27
-
28
- def get_logger(cfg, name=None):
29
- # log_file_path is used when unit testing
30
- if is_logging_process():
31
- logging.config.dictConfig(OmegaConf.to_container(cfg.job_logging_config, resolve=True))
32
- return logging.getLogger(name)
33
-
34
-
35
- # from https://github.com/Lightning-AI/lightning-bolts/blob/5d61197cd2f491f69e238137a5edabe80ae14ad9/pl_bolts/models/self_supervised/simclr/simclr_module.py#L20
36
- class SyncFunction(torch.autograd.Function):
37
- @staticmethod
38
- # @torch.no_grad()
39
- def forward(ctx, tensor):
40
- ctx.batch_size = tensor.shape[0]
41
-
42
- gathered_tensor = [torch.zeros_like(tensor) for _ in range(torch.distributed.get_world_size())]
43
-
44
- torch.distributed.all_gather(gathered_tensor, tensor)
45
- gathered_tensor = torch.cat(gathered_tensor, 0)
46
-
47
- return gathered_tensor
48
-
49
- @staticmethod
50
- def backward(ctx, grad_output):
51
- grad_input = grad_output.clone()
52
- torch.distributed.all_reduce(grad_input, op=torch.distributed.ReduceOp.SUM, async_op=False)
53
-
54
- idx_from = torch.distributed.get_rank() * ctx.batch_size
55
- idx_to = (torch.distributed.get_rank() + 1) * ctx.batch_size
56
- return grad_input[idx_from:idx_to]
57
-
58
-
59
- def get_timestamp():
60
- return datetime.now().strftime("%y%m%d-%H%M%S")
61
-
62
-
63
- def get_commit_hash():
64
- message = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"])
65
- return message.strip().decode("utf-8")
66
-
67
-
68
- class DDP(DistributedDataParallel):
69
- """
70
- Override the forward call in lightning so it goes to training and validation step respectively
71
- """
72
-
73
- def forward(self, *inputs, **kwargs): # pragma: no cover
74
- if version.parse(torch.__version__[:6]) < version.parse("1.11"):
75
- self._sync_params()
76
- inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
77
- assert len(self.device_ids) == 1
78
- if self.module.training:
79
- output = self.module.training_step(*inputs[0], **kwargs[0])
80
- elif self.module.testing:
81
- output = self.module.test_step(*inputs[0], **kwargs[0])
82
- else:
83
- output = self.module.validation_step(*inputs[0], **kwargs[0])
84
- if torch.is_grad_enabled():
85
- # We'll return the output object verbatim since it is a freeform
86
- # object. We need to find any tensors in this object, though,
87
- # because we need to figure out which parameters were used during
88
- # this forward pass, to ensure we short circuit reduction for any
89
- # unused parameters. Only if `find_unused_parameters` is set.
90
- if self.find_unused_parameters:
91
- self.reducer.prepare_for_backward(list(_find_tensors(output)))
92
- else:
93
- self.reducer.prepare_for_backward([])
94
- else:
95
- from torch.nn.parallel.distributed import (
96
- logging,
97
- Join,
98
- _DDPSink,
99
- _tree_flatten_with_rref,
100
- _tree_unflatten_with_rref,
101
- )
102
-
103
- with torch.autograd.profiler.record_function("DistributedDataParallel.forward"):
104
- if torch.is_grad_enabled() and self.require_backward_grad_sync:
105
- self.logger.set_runtime_stats_and_log()
106
- self.num_iterations += 1
107
- self.reducer.prepare_for_forward()
108
-
109
- # Notify the join context that this process has not joined, if
110
- # needed
111
- work = Join.notify_join_context(self)
112
- if work:
113
- self.reducer._set_forward_pass_work_handle(work, self._divide_by_initial_world_size)
114
-
115
- # Calling _rebuild_buckets before forward compuation,
116
- # It may allocate new buckets before deallocating old buckets
117
- # inside _rebuild_buckets. To save peak memory usage,
118
- # call _rebuild_buckets before the peak memory usage increases
119
- # during forward computation.
120
- # This should be called only once during whole training period.
121
- if torch.is_grad_enabled() and self.reducer._rebuild_buckets():
122
- logging.info("Reducer buckets have been rebuilt in this iteration.")
123
- self._has_rebuilt_buckets = True
124
-
125
- # sync params according to location (before/after forward) user
126
- # specified as part of hook, if hook was specified.
127
- buffer_hook_registered = hasattr(self, "buffer_hook")
128
- if self._check_sync_bufs_pre_fwd():
129
- self._sync_buffers()
130
-
131
- if self._join_config.enable:
132
- # Notify joined ranks whether they should sync in backwards pass or not.
133
- self._check_global_requires_backward_grad_sync(is_joined_rank=False)
134
-
135
- inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
136
- if self.module.training:
137
- output = self.module.training_step(*inputs[0], **kwargs[0])
138
- elif self.module.testing:
139
- output = self.module.test_step(*inputs[0], **kwargs[0])
140
- else:
141
- output = self.module.validation_step(*inputs[0], **kwargs[0])
142
-
143
- # sync params according to location (before/after forward) user
144
- # specified as part of hook, if hook was specified.
145
- if self._check_sync_bufs_post_fwd():
146
- self._sync_buffers()
147
-
148
- if torch.is_grad_enabled() and self.require_backward_grad_sync:
149
- self.require_forward_param_sync = True
150
- # We'll return the output object verbatim since it is a freeform
151
- # object. We need to find any tensors in this object, though,
152
- # because we need to figure out which parameters were used during
153
- # this forward pass, to ensure we short circuit reduction for any
154
- # unused parameters. Only if `find_unused_parameters` is set.
155
- if self.find_unused_parameters and not self.static_graph:
156
- # Do not need to populate this for static graph.
157
- self.reducer.prepare_for_backward(list(_find_tensors(output)))
158
- else:
159
- self.reducer.prepare_for_backward([])
160
- else:
161
- self.require_forward_param_sync = False
162
-
163
- # TODO: DDPSink is currently enabled for unused parameter detection and
164
- # static graph training for first iteration.
165
- if (self.find_unused_parameters and not self.static_graph) or (
166
- self.static_graph and self.num_iterations == 1
167
- ):
168
- state_dict = {
169
- "static_graph": self.static_graph,
170
- "num_iterations": self.num_iterations,
171
- }
172
-
173
- output_tensor_list, treespec, output_is_rref = _tree_flatten_with_rref(output)
174
- output_placeholders = [None for _ in range(len(output_tensor_list))]
175
- # Do not touch tensors that have no grad_fn, which can cause issues
176
- # such as https://github.com/pytorch/pytorch/issues/60733
177
- for i, output in enumerate(output_tensor_list):
178
- if torch.is_tensor(output) and output.grad_fn is None:
179
- output_placeholders[i] = output
180
-
181
- # When find_unused_parameters=True, makes tensors which require grad
182
- # run through the DDPSink backward pass. When not all outputs are
183
- # used in loss, this makes those corresponding tensors receive
184
- # undefined gradient which the reducer then handles to ensure
185
- # param.grad field is not touched and we don't error out.
186
- passthrough_tensor_list = _DDPSink.apply(
187
- self.reducer,
188
- state_dict,
189
- *output_tensor_list,
190
- )
191
- for i in range(len(output_placeholders)):
192
- if output_placeholders[i] is None:
193
- output_placeholders[i] = passthrough_tensor_list[i]
194
-
195
- # Reconstruct output data structure.
196
- output = _tree_unflatten_with_rref(output_placeholders, treespec, output_is_rref)
197
- return output
 
higgs_audio/audio_processing/quantization/distrib.py DELETED
@@ -1,123 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """Torch distributed utilities."""
8
-
9
- import typing as tp
10
-
11
- import torch
12
-
13
-
14
- def rank():
15
- if torch.distributed.is_initialized():
16
- return torch.distributed.get_rank()
17
- else:
18
- return 0
19
-
20
-
21
- def world_size():
22
- if torch.distributed.is_initialized():
23
- return torch.distributed.get_world_size()
24
- else:
25
- return 1
26
-
27
-
28
- def is_distributed():
29
- return world_size() > 1
30
-
31
-
32
- def all_reduce(tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM):
33
- if is_distributed():
34
- return torch.distributed.all_reduce(tensor, op)
35
-
36
-
37
- def _is_complex_or_float(tensor):
38
- return torch.is_floating_point(tensor) or torch.is_complex(tensor)
39
-
40
-
41
- def _check_number_of_params(params: tp.List[torch.Tensor]):
42
- # utility function to check that the number of params in all workers is the same,
43
- # and thus avoid a deadlock with distributed all reduce.
44
- if not is_distributed() or not params:
45
- return
46
- # print('params[0].device ', params[0].device)
47
- tensor = torch.tensor([len(params)], device=params[0].device, dtype=torch.long)
48
- all_reduce(tensor)
49
- if tensor.item() != len(params) * world_size():
50
- # If not all the workers have the same number, for at least one of them,
51
- # this inequality will be verified.
52
- raise RuntimeError(
53
- f"Mismatch in number of params: ours is {len(params)}, at least one worker has a different one."
54
- )
55
-
56
-
57
- def broadcast_tensors(tensors: tp.Iterable[torch.Tensor], src: int = 0):
58
- """Broadcast the tensors from the given parameters to all workers.
59
- This can be used to ensure that all workers have the same model to start with.
60
- """
61
- if not is_distributed():
62
- return
63
- tensors = [tensor for tensor in tensors if _is_complex_or_float(tensor)]
64
- _check_number_of_params(tensors)
65
- handles = []
66
- for tensor in tensors:
67
- handle = torch.distributed.broadcast(tensor.data, src=src, async_op=True)
68
- handles.append(handle)
69
- for handle in handles:
70
- handle.wait()
71
-
72
-
73
- def sync_buffer(buffers, average=True):
74
- """
75
- Sync grad for buffers. If average is False, broadcast instead of averaging.
76
- """
77
- if not is_distributed():
78
- return
79
- handles = []
80
- for buffer in buffers:
81
- if torch.is_floating_point(buffer.data):
82
- if average:
83
- handle = torch.distributed.all_reduce(buffer.data, op=torch.distributed.ReduceOp.SUM, async_op=True)
84
- else:
85
- handle = torch.distributed.broadcast(buffer.data, src=0, async_op=True)
86
- handles.append((buffer, handle))
87
- for buffer, handle in handles:
88
- handle.wait()
89
- if average:
90
- buffer.data /= world_size()
91
-
92
-
93
- def sync_grad(params):
94
- """
95
- Simpler alternative to DistributedDataParallel, that doesn't rely
96
- on any black magic. For simple models it can also be as fast.
97
- Just call this on your model parameters after the call to backward!
98
- """
99
- if not is_distributed():
100
- return
101
- handles = []
102
- for p in params:
103
- if p.grad is not None:
104
- handle = torch.distributed.all_reduce(p.grad.data, op=torch.distributed.ReduceOp.SUM, async_op=True)
105
- handles.append((p, handle))
106
- for p, handle in handles:
107
- handle.wait()
108
- p.grad.data /= world_size()
109
-
110
-
111
- def average_metrics(metrics: tp.Dict[str, float], count=1.0):
112
- """Average a dictionary of metrics across all workers, using the optional
113
- `count` as unormalized weight.
114
- """
115
- if not is_distributed():
116
- return metrics
117
- keys, values = zip(*metrics.items())
118
- device = "cuda" if torch.cuda.is_available() else "cpu"
119
- tensor = torch.tensor(list(values) + [1], device=device, dtype=torch.float32)
120
- tensor *= count
121
- all_reduce(tensor)
122
- averaged = (tensor[:-1] / tensor[-1]).cpu().tolist()
123
- return dict(zip(keys, averaged))
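These helpers are written to degrade gracefully when `torch.distributed` has not been initialized, which is the case in the single-process CPU setup this commit moves to. A quick sketch of that behavior:

import torch

t = torch.ones(3)
all_reduce(t)                                   # no-op when no process group is initialized
print(is_distributed())                         # False in a single-process run
print(average_metrics({"loss": 0.5}, count=8))  # returned unchanged when not distributed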
 
higgs_audio/audio_processing/quantization/vq.py DELETED
@@ -1,116 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """Residual vector quantizer implementation."""
8
-
9
- from dataclasses import dataclass, field
10
- import math
11
- import typing as tp
12
-
13
- import torch
14
- from torch import nn
15
-
16
- # from .core_vq import ResidualVectorQuantization
17
- from .core_vq_lsx_version import ResidualVectorQuantization
18
-
19
-
20
- @dataclass
21
- class QuantizedResult:
22
- quantized: torch.Tensor
23
- codes: torch.Tensor
24
- bandwidth: torch.Tensor # bandwidth in kb/s used, per batch item.
25
- penalty: tp.Optional[torch.Tensor] = None
26
- metrics: dict = field(default_factory=dict)
27
-
28
-
29
- class ResidualVectorQuantizer(nn.Module):
30
- """Residual Vector Quantizer.
31
- Args:
32
- dimension (int): Dimension of the codebooks.
33
- n_q (int): Number of residual vector quantizers used.
34
- bins (int): Codebook size.
35
- decay (float): Decay for exponential moving average over the codebooks.
36
- kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
37
- kmeans_iters (int): Number of iterations used for kmeans initialization.
38
- threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
39
- that have an exponential moving average cluster size less than the specified threshold with
40
- randomly selected vector from the current batch.
41
- """
42
-
43
- def __init__(
44
- self,
45
- dimension: int = 256,
46
- codebook_dim: int = None,
47
- n_q: int = 8,
48
- bins: int = 1024,
49
- decay: float = 0.99,
50
- kmeans_init: bool = True,
51
- kmeans_iters: int = 50,
52
- threshold_ema_dead_code: int = 2,
53
- ):
54
- super().__init__()
55
- self.n_q = n_q
56
- self.dimension = dimension
57
- self.codebook_dim = codebook_dim
58
- self.bins = bins
59
- self.decay = decay
60
- self.kmeans_init = kmeans_init
61
- self.kmeans_iters = kmeans_iters
62
- self.threshold_ema_dead_code = threshold_ema_dead_code
63
- self.vq = ResidualVectorQuantization(
64
- dim=self.dimension,
65
- codebook_dim=self.codebook_dim,
66
- codebook_size=self.bins,
67
- num_quantizers=self.n_q,
68
- decay=self.decay,
69
- kmeans_init=self.kmeans_init,
70
- kmeans_iters=self.kmeans_iters,
71
- threshold_ema_dead_code=self.threshold_ema_dead_code,
72
- )
73
-
74
- def forward(self, x: torch.Tensor, sample_rate: int, bandwidth: tp.Optional[float] = None): # -> QuantizedResult:
75
- """Residual vector quantization on the given input tensor.
76
- Args:
77
- x (torch.Tensor): Input tensor.
78
- sample_rate (int): Sample rate of the input tensor.
79
- bandwidth (float): Target bandwidth.
80
- Returns:
81
- QuantizedResult:
82
- The quantized (or approximately quantized) representation with
83
- the associated bandwidth and any penalty term for the loss.
84
- """
85
- bw_per_q = self.get_bandwidth_per_quantizer(sample_rate)
86
- n_q = self.get_num_quantizers_for_bandwidth(sample_rate, bandwidth)
87
- quantized, codes, commit_loss = self.vq(x, n_q=n_q)
88
- bw = torch.tensor(n_q * bw_per_q).to(x)
89
- return quantized, codes, bw, torch.mean(commit_loss)
90
- # return QuantizedResult(quantized, codes, bw, penalty=torch.mean(commit_loss))
91
-
92
- def get_num_quantizers_for_bandwidth(self, sample_rate: int, bandwidth: tp.Optional[float] = None) -> int:
93
- """Return n_q based on specified target bandwidth."""
94
- bw_per_q = self.get_bandwidth_per_quantizer(sample_rate)
95
- n_q = self.n_q
96
- if bandwidth and bandwidth > 0.0:
97
- n_q = int(max(1, math.floor(bandwidth / bw_per_q)))
98
- return n_q
99
-
100
- def get_bandwidth_per_quantizer(self, sample_rate: int):
101
- """Return bandwidth per quantizer for a given input sample rate."""
102
- return math.log2(self.bins) * sample_rate / 1000
103
-
104
- def encode(self, x: torch.Tensor, sample_rate: int, bandwidth: tp.Optional[float] = None) -> torch.Tensor:
105
- """Encode a given input tensor with the specified sample rate at the given bandwidth.
106
- The RVQ encode method sets the appropriate number of quantizer to use
107
- and returns indices for each quantizer.
108
- """
109
- n_q = self.get_num_quantizers_for_bandwidth(sample_rate, bandwidth)
110
- codes = self.vq.encode(x, n_q=n_q)
111
- return codes
112
-
113
- def decode(self, codes: torch.Tensor) -> torch.Tensor:
114
- """Decode the given codes to the quantized representation."""
115
- quantized = self.vq.decode(codes)
116
- return quantized
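
The bandwidth bookkeeping above reduces to simple arithmetic: each residual stage spends log2(bins) bits per latent frame, so the per-quantizer bandwidth is log2(bins) * frame_rate / 1000 kbps (in the upstream EnCodec code this module derives from, the `sample_rate` argument is the frame rate of the latent sequence rather than the audio sample rate). A worked sketch that mirrors the two helper methods; the 50 Hz frame rate is an illustrative assumption:

import math

bins, max_n_q = 1024, 8        # codebook size and number of residual stages
frame_rate = 50                # latent frames per second (assumed for illustration)

bw_per_q = math.log2(bins) * frame_rate / 1000   # 10 bits/frame * 50 f/s = 0.5 kbps
target_bw = 4.0                                  # requested bandwidth in kbps

n_q = int(max(1, math.floor(target_bw / bw_per_q))) if target_bw > 0 else max_n_q
print(bw_per_q, n_q)           # 0.5 kbps per quantizer -> 8 quantizers for 4 kbps
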
higgs_audio/audio_processing/semantic_module.py DELETED
@@ -1,310 +0,0 @@
1
- # Based on code from: https://github.com/zhenye234/xcodec
2
- # Licensed under MIT License
3
- # Modifications by BosonAI
4
-
5
- import torch
6
- import torch.nn as nn
7
-
8
-
9
- class Conv1d1x1(nn.Conv1d):
10
- """1x1 Conv1d."""
11
-
12
- def __init__(self, in_channels, out_channels, bias=True):
13
- super(Conv1d1x1, self).__init__(in_channels, out_channels, kernel_size=1, bias=bias)
14
-
15
-
16
- class Conv1d(nn.Module):
17
- def __init__(
18
- self,
19
- in_channels: int,
20
- out_channels: int,
21
- kernel_size: int,
22
- stride: int = 1,
23
- padding: int = -1,
24
- dilation: int = 1,
25
- groups: int = 1,
26
- bias: bool = True,
27
- ):
28
- super().__init__()
29
- self.in_channels = in_channels
30
- self.out_channels = out_channels
31
- self.kernel_size = kernel_size
32
- if padding < 0:
33
- padding = (kernel_size - 1) // 2 * dilation
34
- self.dilation = dilation
35
- self.conv = nn.Conv1d(
36
- in_channels=in_channels,
37
- out_channels=out_channels,
38
- kernel_size=kernel_size,
39
- stride=stride,
40
- padding=padding,
41
- dilation=dilation,
42
- groups=groups,
43
- bias=bias,
44
- )
45
-
46
- def forward(self, x):
47
- """
48
- Args:
49
- x (Tensor): Float tensor variable with the shape (B, C, T).
50
- Returns:
51
- Tensor: Float tensor variable with the shape (B, C, T).
52
- """
53
- x = self.conv(x)
54
- return x
55
-
56
-
57
- class ResidualUnit(nn.Module):
58
- def __init__(
59
- self,
60
- in_channels: int,
61
- out_channels: int,
62
- kernel_size=3,
63
- dilation=1,
64
- bias=False,
65
- nonlinear_activation="ELU",
66
- nonlinear_activation_params={},
67
- ):
68
- super().__init__()
69
- self.activation = getattr(nn, nonlinear_activation)(**nonlinear_activation_params)
70
- self.conv1 = Conv1d(
71
- in_channels=in_channels,
72
- out_channels=out_channels,
73
- kernel_size=kernel_size,
74
- stride=1,
75
- dilation=dilation,
76
- bias=bias,
77
- )
78
- self.conv2 = Conv1d1x1(out_channels, out_channels, bias)
79
-
80
- def forward(self, x):
81
- y = self.conv1(self.activation(x))
82
- y = self.conv2(self.activation(y))
83
- return x + y
84
-
85
-
86
- class ConvTranspose1d(nn.Module):
87
- def __init__(
88
- self,
89
- in_channels: int,
90
- out_channels: int,
91
- kernel_size: int,
92
- stride: int,
93
- padding=-1,
94
- output_padding=-1,
95
- groups=1,
96
- bias=True,
97
- ):
98
- super().__init__()
99
- if padding < 0:
100
- padding = (stride + 1) // 2
101
- if output_padding < 0:
102
- output_padding = 1 if stride % 2 else 0
103
- self.deconv = nn.ConvTranspose1d(
104
- in_channels=in_channels,
105
- out_channels=out_channels,
106
- kernel_size=kernel_size,
107
- stride=stride,
108
- padding=padding,
109
- output_padding=output_padding,
110
- groups=groups,
111
- bias=bias,
112
- )
113
-
114
- def forward(self, x):
115
- """
116
- Args:
117
- x (Tensor): Float tensor variable with the shape (B, C, T).
118
- Returns:
119
- Tensor: Float tensor variable with the shape (B, C', T').
120
- """
121
- x = self.deconv(x)
122
- return x
123
-
124
-
125
- class EncoderBlock(nn.Module):
126
- def __init__(
127
- self,
128
- in_channels: int,
129
- out_channels: int,
130
- stride: int,
131
- dilations=(1, 1),
132
- unit_kernel_size=3,
133
- bias=True,
134
- ):
135
- super().__init__()
136
- self.res_units = torch.nn.ModuleList()
137
- for dilation in dilations:
138
- self.res_units += [
139
- ResidualUnit(
140
- in_channels,
141
- in_channels,
142
- kernel_size=unit_kernel_size,
143
- dilation=dilation,
144
- )
145
- ]
146
- self.num_res = len(self.res_units)
147
-
148
- self.conv = Conv1d(
149
- in_channels=in_channels,
150
- out_channels=out_channels,
151
- kernel_size=3 if stride == 1 else (2 * stride), # special case: stride=1, do not use kernel=2
152
- stride=stride,
153
- bias=bias,
154
- )
155
-
156
- def forward(self, x):
157
- for idx in range(self.num_res):
158
- x = self.res_units[idx](x)
159
- x = self.conv(x)
160
- return x
161
-
162
-
163
- class Encoder(nn.Module):
164
- def __init__(
165
- self,
166
- input_channels: int,
167
- encode_channels: int,
168
- channel_ratios=(1, 1),
169
- strides=(1, 1),
170
- kernel_size=3,
171
- bias=True,
172
- block_dilations=(1, 1),
173
- unit_kernel_size=3,
174
- ):
175
- super().__init__()
176
- assert len(channel_ratios) == len(strides)
177
-
178
- self.conv = Conv1d(
179
- in_channels=input_channels,
180
- out_channels=encode_channels,
181
- kernel_size=kernel_size,
182
- stride=1,
183
- bias=False,
184
- )
185
- self.conv_blocks = torch.nn.ModuleList()
186
- in_channels = encode_channels
187
- for idx, stride in enumerate(strides):
188
- out_channels = int(encode_channels * channel_ratios[idx]) # could be float
189
- self.conv_blocks += [
190
- EncoderBlock(
191
- in_channels,
192
- out_channels,
193
- stride,
194
- dilations=block_dilations,
195
- unit_kernel_size=unit_kernel_size,
196
- bias=bias,
197
- )
198
- ]
199
- in_channels = out_channels
200
- self.num_blocks = len(self.conv_blocks)
201
- self.out_channels = out_channels
202
-
203
- def forward(self, x):
204
- x = self.conv(x)
205
- for i in range(self.num_blocks):
206
- x = self.conv_blocks[i](x)
207
- return x
208
-
209
-
210
- class DecoderBlock(nn.Module):
211
- """Decoder block (no up-sampling)"""
212
-
213
- def __init__(
214
- self,
215
- in_channels: int,
216
- out_channels: int,
217
- stride: int,
218
- dilations=(1, 1),
219
- unit_kernel_size=3,
220
- bias=True,
221
- ):
222
- super().__init__()
223
-
224
- if stride == 1:
225
- self.conv = Conv1d(
226
- in_channels=in_channels,
227
- out_channels=out_channels,
228
- kernel_size=3, # fix kernel=3 when stride=1 for unchanged shape
229
- stride=stride,
230
- bias=bias,
231
- )
232
- else:
233
- self.conv = ConvTranspose1d(
234
- in_channels=in_channels,
235
- out_channels=out_channels,
236
- kernel_size=(2 * stride),
237
- stride=stride,
238
- bias=bias,
239
- )
240
-
241
- self.res_units = torch.nn.ModuleList()
242
- for idx, dilation in enumerate(dilations):
243
- self.res_units += [
244
- ResidualUnit(
245
- out_channels,
246
- out_channels,
247
- kernel_size=unit_kernel_size,
248
- dilation=dilation,
249
- )
250
- ]
251
- self.num_res = len(self.res_units)
252
-
253
- def forward(self, x):
254
- x = self.conv(x)
255
- for idx in range(self.num_res):
256
- x = self.res_units[idx](x)
257
- return x
258
-
259
-
260
- class Decoder(nn.Module):
261
- def __init__(
262
- self,
263
- code_dim: int,
264
- output_channels: int,
265
- decode_channels: int,
266
- channel_ratios=(1, 1),
267
- strides=(1, 1),
268
- kernel_size=3,
269
- bias=True,
270
- block_dilations=(1, 1),
271
- unit_kernel_size=3,
272
- ):
273
- super().__init__()
274
- assert len(channel_ratios) == len(strides)
275
-
276
- self.conv1 = Conv1d(
277
- in_channels=code_dim,
278
- out_channels=int(decode_channels * channel_ratios[0]),
279
- kernel_size=kernel_size,
280
- stride=1,
281
- bias=False,
282
- )
283
-
284
- self.conv_blocks = torch.nn.ModuleList()
285
- for idx, stride in enumerate(strides):
286
- in_channels = int(decode_channels * channel_ratios[idx])
287
- if idx < (len(channel_ratios) - 1):
288
- out_channels = int(decode_channels * channel_ratios[idx + 1])
289
- else:
290
- out_channels = decode_channels
291
- self.conv_blocks += [
292
- DecoderBlock(
293
- in_channels,
294
- out_channels,
295
- stride,
296
- dilations=block_dilations,
297
- unit_kernel_size=unit_kernel_size,
298
- bias=bias,
299
- )
300
- ]
301
- self.num_blocks = len(self.conv_blocks)
302
-
303
- self.conv2 = Conv1d(out_channels, output_channels, kernel_size, 1, bias=False)
304
-
305
- def forward(self, z):
306
- x = self.conv1(z)
307
- for i in range(self.num_blocks):
308
- x = self.conv_blocks[i](x)
309
- x = self.conv2(x)
310
- return x
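
As a sanity check on the blocks above: the Encoder downsamples time by the product of its strides and scales channels by the last entry of channel_ratios, and the Decoder mirrors that. A small shape-check sketch, assuming the pre-deletion import path; the concrete channel and stride values are illustrative, not the ones used elsewhere in this repo:

import torch

# Pre-deletion module path (assumption).
from higgs_audio.audio_processing.semantic_module import Encoder, Decoder

enc = Encoder(input_channels=1, encode_channels=32, channel_ratios=(2, 4), strides=(2, 4))
dec = Decoder(code_dim=enc.out_channels, output_channels=1, decode_channels=32,
              channel_ratios=(4, 2), strides=(4, 2))

wav = torch.randn(1, 1, 1600)   # (B, C, T)
z = enc(wav)                    # time reduced by 2 * 4 = 8
print(z.shape)                  # expected: torch.Size([1, 128, 200])
print(dec(z).shape)             # expected: torch.Size([1, 1, 1600])
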
higgs_audio/constants.py DELETED
@@ -1,3 +0,0 @@
1
- AUDIO_IN_TOKEN = "<|AUDIO|>"
2
- AUDIO_OUT_TOKEN = "<|AUDIO_OUT|>"
3
- EOS_TOKEN = "<|end_of_text|>"
 
 
 
 
higgs_audio/data_collator/__init__.py DELETED
File without changes
higgs_audio/data_collator/higgs_audio_collator.py DELETED
@@ -1,583 +0,0 @@
1
- import librosa
2
- import torch
3
- import torch.nn.functional as F
4
- import math
5
- import numpy as np
6
- from typing import List, Tuple, Dict
7
-
8
- from dataclasses import dataclass
9
- from typing import List, Optional
10
- from transformers.models.whisper.processing_whisper import WhisperProcessor
11
-
12
- from ..dataset.chatml_dataset import ChatMLDatasetSample, RankedChatMLDatasetSampleTuple
13
- from ..model.utils import build_delay_pattern_mask
14
-
15
-
16
- def _ceil_to_nearest(n, round_to):
17
- return (n + round_to - 1) // round_to * round_to
18
-
19
-
20
- @dataclass
21
- class HiggsAudioBatchInput:
22
- input_ids: torch.LongTensor # shape (bsz, seq_len).
23
- attention_mask: torch.Tensor # shape (bsz, seq_len).
24
- audio_features: Optional[torch.Tensor] # shape (num_audio_in, feature_dim, max_mel_seq_len).
25
- audio_feature_attention_mask: Optional[torch.Tensor] # shape (num_audio_in, max_mel_seq_len).
26
- audio_out_ids: Optional[torch.LongTensor] # shape (num_codebooks, audio_out_total_length)
27
- audio_out_ids_start: Optional[torch.LongTensor] # shape (num_audio_out,)
28
- # The audio_out_ids_start_group_loc has the same length as audio_out_ids_start. It is used to recover group location in a batch for an audio segment
29
- # Currently, we concatenate audio segments along the sequence dimension to handle variable segment lengths. However, in the alignment stage, we need the location information
30
- # For example,
31
- # audio_out_ids_start = [0, 2, 4, 8]; and the first two audio segments come from the same sample in a batch, and other two come from different samples.
32
- # This is a batch of 3 samples, then we will have the group location as:
33
- # audio_out_ids_start_group_loc = [0, 0, 1, 2]
34
- audio_out_ids_start_group_loc: Optional[
35
- torch.LongTensor
36
- ] # shape (num_audio_out,), specify which a sample's group location in the batch
37
- audio_in_ids: Optional[torch.LongTensor] # shape (num_codebooks, audio_in_total_length)
38
- audio_in_ids_start: Optional[torch.LongTensor] # shape (num_audio_in,)
39
- label_ids: Optional[torch.LongTensor] # shape (bsz, seq_len)
40
- label_audio_ids: Optional[torch.LongTensor] # shape (num_codebooks, audio_out_total_length)
41
- reward: Optional[float] = None
42
-
43
-
44
- class HiggsAudioSampleCollator:
45
- """Sample collator for Higgs-Audio model.
46
-
47
- Args:
48
- whisper_processor (WhisperProcessor): The whisper processor.
49
- audio_in_token_id (int): The token id for audio-in.
50
- audio_out_token_id (int): The token id for audio-out.
51
- pad_token_id (int): The token id for padding.
52
- audio_stream_bos_id (int): The token id for audio-stream beginning of sentence.
53
- audio_stream_eos_id (int): The token id for audio-stream end of sentence.
54
- round_to (int): The round-to value.
55
- pad_left (bool): Whether to pad left.
56
- return_audio_in_tokens (bool): Whether to return audio-in tokens.
57
- use_delay_pattern (bool): Whether to use delay pattern.
58
- disable_audio_codes_transform (bool): If True, do not add bos and eos tokens to the audio codes.
59
- chunk_size_seconds (int): The chunk size in seconds.
60
- add_new_bos_eos_for_long_chunk (bool): Whether to add new bos and eos tokens for long chunks.
61
- mask_audio_out_token_label (bool): Whether to always mask the label associated with <|AUDIO_OUT|> token. Since we will always have `<|AUDIO_OUT|>` after `<|audio_bos|>`, we can safely mask <|AUDIO_OUT|>.
62
-
63
- """
64
-
65
- def __init__(
66
- self,
67
- whisper_processor: WhisperProcessor,
68
- audio_in_token_id,
69
- audio_out_token_id,
70
- pad_token_id,
71
- audio_stream_bos_id,
72
- audio_stream_eos_id,
73
- round_to=8,
74
- pad_left=False,
75
- encode_whisper_embed=True,
76
- return_audio_in_tokens=True,
77
- audio_num_codebooks=None,
78
- use_delay_pattern=False,
79
- disable_audio_codes_transform=False,
80
- chunk_size_seconds=30, # Maximum duration for each chunk
81
- add_new_bos_eos_for_long_chunk=True,
82
- mask_audio_out_token_label=True,
83
- ):
84
- self.whisper_processor = whisper_processor
85
- self.round_to = round_to
86
- self.pad_left = pad_left
87
- self.audio_in_token_id = audio_in_token_id
88
- self.audio_out_token_id = audio_out_token_id
89
- self.audio_stream_bos_id = audio_stream_bos_id
90
- self.audio_stream_eos_id = audio_stream_eos_id
91
- self.pad_token_id = pad_token_id
92
- self.encode_whisper_embed = encode_whisper_embed
93
- self.return_audio_in_tokens = return_audio_in_tokens
94
- self.audio_num_codebooks = audio_num_codebooks
95
- self.use_delay_pattern = use_delay_pattern
96
- if encode_whisper_embed:
97
- self.chunk_size_seconds = chunk_size_seconds
98
- self.chunk_size_samples = int(chunk_size_seconds * whisper_processor.feature_extractor.sampling_rate)
99
- else:
100
- self.chunk_size_seconds = None
101
- self.chunk_size_samples = None
102
- self.disable_audio_codes_transform = disable_audio_codes_transform
103
- self.add_new_bos_eos_for_long_chunk = add_new_bos_eos_for_long_chunk
104
- self.mask_audio_out_token_label = mask_audio_out_token_label
105
-
106
- def _process_and_duplicate_audio_tokens(
107
- self,
108
- input_ids: torch.Tensor,
109
- audio_idx: int,
110
- wv: torch.Tensor,
111
- sr: int,
112
- labels: Optional[torch.Tensor] = None,
113
- ) -> Tuple[torch.Tensor, torch.Tensor, int]:
114
- """Process long audio and duplicate corresponding audio tokens.
115
-
116
- Args:
117
- input_ids: Input token ids
118
- audio_idx: Index of the audio token in the sequence
119
- wv: Audio waveform
120
- sr: Sample rate
121
- labels: Optional label ids to be duplicated alongside input ids
122
-
123
- Returns:
124
- Tuple of:
125
- - New input ids with duplicated audio tokens
126
- - New label ids (if labels were provided) or None
127
- - Number of chunks created
128
- """
129
- # Calculate number of chunks needed
130
- total_samples = len(wv)
131
- num_chunks = math.ceil(total_samples / self.chunk_size_samples)
132
-
133
- if num_chunks <= 1:
134
- return input_ids, labels, 1
135
-
136
- # Get the three tokens: <|audio_bos|><|AUDIO|><|audio_eos|>
137
- audio_token_seq = input_ids[audio_idx - 1 : audio_idx + 2]
138
- # Duplicate sequence for each chunk
139
- duplicated_sequence = audio_token_seq.repeat(num_chunks)
140
-
141
- # Create new input_ids with duplicated tokens
142
- new_input_ids = torch.cat(
143
- [
144
- input_ids[: audio_idx - 1],
145
- duplicated_sequence,
146
- input_ids[audio_idx + 2 :],
147
- ]
148
- )
149
-
150
- # If labels are provided, duplicate them as well
151
- new_labels = None
152
- if labels is not None:
153
- label_seq = labels[audio_idx - 1 : audio_idx + 2]
154
- duplicated_labels = label_seq.repeat(num_chunks)
155
- new_labels = torch.cat([labels[: audio_idx - 1], duplicated_labels, labels[audio_idx + 2 :]])
156
-
157
- return new_input_ids, new_labels, num_chunks
158
-
159
- def __call__(self, batch: List[ChatMLDatasetSample]):
160
- """Collate the input data with support for long audio processing."""
161
-
162
- label_ids = None
163
- label_audio_ids = None
164
- if all([ele.label_ids is None for ele in batch]):
165
- return_labels = False
166
- else:
167
- return_labels = True
168
-
169
- if self.encode_whisper_embed:
170
- # Process each sample in the batch to handle long audio
171
- # TODO(?) The implementation here can be optimized.
172
- processed_batch = []
173
- for i in range(len(batch)):
174
- sample = batch[i]
175
- audio_in_mask = sample.input_ids == self.audio_in_token_id
176
- audio_in_indices = torch.where(audio_in_mask)[0]
177
- audio_out_mask = sample.input_ids == self.audio_out_token_id
178
-
179
- # Process each audio token and duplicate if needed
180
- modified_input_ids = sample.input_ids
181
- modified_labels = sample.label_ids if return_labels else None
182
- modified_waveforms_concat = []
183
- modified_waveforms_start = []
184
- modified_sample_rate = []
185
- offset = 0 # Track position changes from duplicating tokens
186
- curr_wv_offset = 0
187
-
188
- # Process input audio tokens
189
- for idx, audio_idx in enumerate(audio_in_indices):
190
- # Get the audio for this token
191
- wv, sr = sample.get_wv(idx) # Use idx since we want the original audio index
192
- if sr != self.whisper_processor.feature_extractor.sampling_rate:
193
- resampled_wv = librosa.resample(
194
- wv.cpu().numpy(),
195
- orig_sr=sr,
196
- target_sr=self.whisper_processor.feature_extractor.sampling_rate,
197
- )
198
- else:
199
- resampled_wv = wv.cpu().numpy()
200
- wv = torch.tensor(resampled_wv, device=wv.device)
201
- sr = self.whisper_processor.feature_extractor.sampling_rate
202
-
203
- # Process and duplicate tokens if necessary
204
- token_pos = audio_idx + offset
205
- modified_input_ids, modified_labels, num_chunks = self._process_and_duplicate_audio_tokens(
206
- modified_input_ids, token_pos, wv, sr, modified_labels
207
- )
208
-
209
- # Update audio data
210
- for chunk_idx in range(num_chunks):
211
- chunk_start = chunk_idx * self.chunk_size_samples
212
- chunk_end = min((chunk_idx + 1) * self.chunk_size_samples, len(wv))
213
- chunk_wv = wv[chunk_start:chunk_end]
214
- modified_waveforms_concat.append(chunk_wv)
215
- modified_waveforms_start.append(curr_wv_offset)
216
- curr_wv_offset += len(chunk_wv)
217
- modified_sample_rate.append(sr)
218
-
219
- # Update offset for next iteration
220
- offset += (num_chunks - 1) * 3 # Each new chunk adds 3 more tokens
221
-
222
- # Create new sample with modified tokens and audio data
223
- processed_sample = ChatMLDatasetSample(
224
- input_ids=modified_input_ids,
225
- label_ids=modified_labels if return_labels else sample.label_ids,
226
- audio_ids_concat=sample.audio_ids_concat,
227
- audio_ids_start=sample.audio_ids_start,
228
- audio_waveforms_concat=torch.cat(modified_waveforms_concat)
229
- if modified_waveforms_concat
230
- else sample.audio_waveforms_concat,
231
- audio_waveforms_start=torch.tensor(modified_waveforms_start, dtype=torch.long)
232
- if modified_waveforms_start
233
- else sample.audio_waveforms_start,
234
- audio_sample_rate=torch.tensor(modified_sample_rate)
235
- if modified_sample_rate
236
- else sample.audio_sample_rate,
237
- audio_speaker_indices=torch.tensor([]),
238
- # FIXME(sxjscience): The logic here is not correct for audio_label_ids_concat.
239
- audio_label_ids_concat=sample.audio_label_ids_concat,
240
- )
241
- # audio_in_chunk_len = len(torch.where(modified_input_ids == self.audio_in_token_id)[0])
242
- # assert audio_in_chunk_len == processed_sample.num_audios(), f"Mismatch: audio_in_chunk_len={audio_in_chunk_len}, processed_sample.num_audios()={processed_sample.num_audios()}"
243
- processed_batch.append(processed_sample)
244
- else:
245
- processed_batch = batch
246
-
247
- # Get the max sequence length based on processed batch
248
- max_seq_length = _ceil_to_nearest(max([len(sample.input_ids) for sample in processed_batch]), self.round_to)
249
-
250
- # Get the ids for audio-in and audio-out for each batch
251
- audio_in_wv_l = []
252
- audio_in_ids_l = []
253
- audio_out_ids_l = []
254
- audio_out_ids_group_loc_l = []
255
- audio_in_label_ids_l = None
256
- audio_out_label_ids_l = None
257
- reward_l = []
258
-
259
- if return_labels:
260
- audio_out_no_train_flag = [] # Whether the audio-out data should be trained on or not.
261
-
262
- # Process the audio inputs and outputs
263
- for i in range(len(processed_batch)):
264
- audio_in_mask = processed_batch[i].input_ids == self.audio_in_token_id
265
- audio_out_mask = processed_batch[i].input_ids == self.audio_out_token_id
266
- audio_ids = torch.ones_like(processed_batch[i].input_ids)
267
- audio_ids[audio_in_mask ^ audio_out_mask] = torch.cumsum(audio_ids[audio_in_mask ^ audio_out_mask], 0) - 1
268
- audio_in_ids = audio_ids[audio_in_mask]
269
- audio_out_ids = audio_ids[audio_out_mask]
270
-
271
- if return_labels:
272
- audio_out_no_train_flag.append(processed_batch[i].label_ids[audio_out_mask] < 0)
273
- if self.mask_audio_out_token_label:
274
- processed_batch[i].label_ids[audio_out_mask] = -100
275
-
276
- # Process audio inputs
277
- if self.return_audio_in_tokens:
278
- audio_in_ids_l.extend(
279
- [processed_batch[i].get_audio_codes(idx)[: self.audio_num_codebooks, :] for idx in audio_in_ids]
280
- )
281
- if processed_batch[i].audio_label_ids_concat is not None:
282
- if audio_in_label_ids_l is None:
283
- audio_in_label_ids_l = []
284
- audio_in_label_ids_l.extend(
285
- [
286
- processed_batch[i].get_audio_codes_labels(idx)[: self.audio_num_codebooks, :]
287
- for idx in audio_in_ids
288
- ]
289
- )
290
-
291
- audio_out_ids_l.extend(
292
- [processed_batch[i].get_audio_codes(idx)[: self.audio_num_codebooks, :] for idx in audio_out_ids]
293
- )
294
- audio_out_ids_group_loc_l.append(i)
295
- if processed_batch[i].reward is not None:
296
- reward_l.append(processed_batch[i].reward)
297
-
298
- if processed_batch[i].audio_label_ids_concat is not None:
299
- if audio_out_label_ids_l is None:
300
- audio_out_label_ids_l = []
301
- audio_out_label_ids_l.extend(
302
- [
303
- processed_batch[i].get_audio_codes_labels(idx)[: self.audio_num_codebooks, :]
304
- for idx in audio_out_ids
305
- ]
306
- )
307
-
308
- if self.encode_whisper_embed:
309
- for idx in audio_in_ids:
310
- wv, sr = processed_batch[i].get_wv(idx)
311
- resampled_wv = wv.cpu().numpy()
312
- # Split long audio into chunks
313
- total_samples = len(resampled_wv)
314
- for chunk_start in range(0, total_samples, self.chunk_size_samples):
315
- chunk_end = min(chunk_start + self.chunk_size_samples, total_samples)
316
- chunk = resampled_wv[chunk_start:chunk_end]
317
- audio_in_wv_l.append(chunk)
318
- # assert len(audio_in_wv_l) == processed_batch[i].num_audios(), \
319
- # f"Assertion failed: Mismatch in number of audios. " \
320
- # f"Expected {processed_batch[i].num_audios()}, but got {len(audio_in_wv_l)} at index {i}."
321
-
322
- if return_labels:
323
- audio_out_no_train_flag = torch.cat(audio_out_no_train_flag, dim=0)
324
-
325
- # Process all audio features
326
- if len(audio_in_wv_l) > 0:
327
- feature_ret = self.whisper_processor.feature_extractor(
328
- audio_in_wv_l,
329
- sampling_rate=self.whisper_processor.feature_extractor.sampling_rate,
330
- return_attention_mask=True,
331
- padding="max_length",
332
- )
333
- audio_features = torch.from_numpy(feature_ret["input_features"])
334
- audio_feature_attention_mask = torch.from_numpy(feature_ret["attention_mask"])
335
- else:
336
- if self.encode_whisper_embed:
337
- audio_features = torch.zeros(
338
- (
339
- 0,
340
- self.whisper_processor.feature_extractor.feature_size,
341
- self.whisper_processor.feature_extractor.nb_max_frames,
342
- ),
343
- dtype=torch.float32,
344
- )
345
- audio_feature_attention_mask = torch.zeros(
346
- (0, self.whisper_processor.feature_extractor.nb_max_frames),
347
- dtype=torch.int32,
348
- )
349
- else:
350
- audio_features = None
351
- audio_feature_attention_mask = None
352
-
353
- # Process audio input tokens
354
- if len(audio_in_ids_l) > 0:
355
- # Append audio-stream-bos and eos tokens
356
- new_audio_in_ids_l = []
357
- for ele in audio_in_ids_l:
358
- if self.disable_audio_codes_transform:
359
- # Do not add audio-stream-bos or eos tokens.
360
- # This may indicate that the sample comes from ConstantLengthDatasetWithBuffer.
361
- audio_codes = ele
362
- else:
363
- audio_codes = torch.cat(
364
- [
365
- torch.full(
366
- (ele.shape[0], 1),
367
- self.audio_stream_bos_id,
368
- dtype=torch.long,
369
- ),
370
- ele,
371
- torch.full(
372
- (ele.shape[0], 1),
373
- self.audio_stream_eos_id,
374
- dtype=torch.long,
375
- ),
376
- ],
377
- dim=1,
378
- )
379
- if self.use_delay_pattern:
380
- audio_codes = build_delay_pattern_mask(
381
- audio_codes.unsqueeze(0),
382
- bos_token_id=self.audio_stream_bos_id,
383
- pad_token_id=self.audio_stream_eos_id,
384
- )[0].squeeze(0)
385
- new_audio_in_ids_l.append(audio_codes)
386
- audio_in_ids = torch.cat(new_audio_in_ids_l, dim=1).long()
387
- audio_in_ids_start = torch.cumsum(
388
- torch.tensor([0] + [audio_codes.shape[1] for audio_codes in new_audio_in_ids_l[:-1]]),
389
- dim=0,
390
- )
391
- else:
392
- audio_in_ids = torch.zeros((0, 0), dtype=torch.long)
393
- audio_in_ids_start = torch.zeros(0, dtype=torch.long)
394
-
395
- # Process audio output tokens
396
- audio_out_ids_start_group_loc = None
397
- if len(audio_out_ids_l) > 0:
398
- new_audio_out_ids_l = []
399
- label_audio_ids_l = []
400
- for idx, ele in enumerate(audio_out_ids_l):
401
- if self.disable_audio_codes_transform:
402
- # Do not add audio-stream-bos or eos tokens.
403
- # This may indicate that the sample comes from ConstantLengthDatasetWithBuffer.
404
- audio_codes = ele
405
- if return_labels:
406
- label_audio_ids = audio_out_label_ids_l[idx]
407
- else:
408
- audio_codes = torch.cat(
409
- [
410
- torch.full(
411
- (ele.shape[0], 1),
412
- self.audio_stream_bos_id,
413
- dtype=torch.long,
414
- ),
415
- ele,
416
- torch.full(
417
- (ele.shape[0], 1),
418
- self.audio_stream_eos_id,
419
- dtype=torch.long,
420
- ),
421
- ],
422
- dim=1,
423
- )
424
- if return_labels:
425
- label_audio_ids = torch.cat(
426
- [
427
- torch.full((ele.shape[0], 1), -100, dtype=torch.long),
428
- ele,
429
- torch.full(
430
- (ele.shape[0], 1),
431
- self.audio_stream_eos_id,
432
- dtype=torch.long,
433
- ),
434
- ],
435
- dim=1,
436
- )
437
- if self.use_delay_pattern:
438
- audio_codes = build_delay_pattern_mask(
439
- audio_codes.unsqueeze(0),
440
- bos_token_id=self.audio_stream_bos_id,
441
- pad_token_id=self.audio_stream_eos_id,
442
- )[0].squeeze(0)
443
- if return_labels:
444
- label_audio_ids = build_delay_pattern_mask(
445
- label_audio_ids.unsqueeze(0),
446
- bos_token_id=-100,
447
- pad_token_id=-100,
448
- )[0].squeeze(0)
449
- new_audio_out_ids_l.append(audio_codes)
450
-
451
- if return_labels:
452
- if audio_out_no_train_flag[idx]:
453
- label_audio_ids[:] = -100
454
- label_audio_ids_l.append(label_audio_ids)
455
-
456
- audio_out_ids = torch.cat(new_audio_out_ids_l, dim=1).long()
457
- if return_labels:
458
- label_audio_ids = torch.cat(label_audio_ids_l, dim=1).long()
459
- audio_out_ids_start = torch.cumsum(
460
- torch.tensor([0] + [audio_codes.shape[1] for audio_codes in new_audio_out_ids_l[:-1]]),
461
- dim=0,
462
- )
463
- audio_out_ids_start_group_loc = torch.tensor(audio_out_ids_group_loc_l, dtype=torch.long)
464
- else:
465
- audio_out_ids = torch.zeros((0, 0), dtype=torch.long)
466
- audio_out_ids_start = torch.zeros(0, dtype=torch.long)
467
- if return_labels:
468
- label_audio_ids = torch.zeros((0, 0), dtype=torch.long)
469
-
470
- reward = torch.tensor(reward_l, dtype=torch.float32)
471
-
472
- # Handle padding for input ids and attention mask
473
- if self.pad_left:
474
- input_ids = torch.stack(
475
- [
476
- F.pad(
477
- ele.input_ids,
478
- (max_seq_length - len(ele.input_ids), 0),
479
- value=self.pad_token_id,
480
- )
481
- for ele in processed_batch
482
- ]
483
- )
484
- if return_labels:
485
- label_ids = torch.stack(
486
- [
487
- F.pad(
488
- ele.label_ids,
489
- (max_seq_length - len(ele.label_ids), 0),
490
- value=-100,
491
- )
492
- for ele in processed_batch
493
- ]
494
- )
495
- attention_mask = torch.stack(
496
- [
497
- F.pad(
498
- torch.ones_like(ele.input_ids),
499
- (max_seq_length - len(ele.input_ids), 0),
500
- value=0,
501
- )
502
- for ele in processed_batch
503
- ]
504
- )
505
- else:
506
- input_ids = torch.stack(
507
- [
508
- F.pad(
509
- ele.input_ids,
510
- (0, max_seq_length - len(ele.input_ids)),
511
- value=self.pad_token_id,
512
- )
513
- for ele in processed_batch
514
- ]
515
- )
516
- if return_labels:
517
- label_ids = torch.stack(
518
- [
519
- F.pad(
520
- ele.label_ids,
521
- (0, max_seq_length - len(ele.label_ids)),
522
- value=-100,
523
- )
524
- for ele in processed_batch
525
- ]
526
- )
527
- attention_mask = torch.stack(
528
- [
529
- F.pad(
530
- torch.ones_like(ele.input_ids),
531
- (0, max_seq_length - len(ele.input_ids)),
532
- value=0,
533
- )
534
- for ele in processed_batch
535
- ]
536
- )
537
-
538
- if not self.return_audio_in_tokens:
539
- audio_in_ids = None
540
- audio_in_ids_start = None
541
-
542
- # Apply audio_num_codebooks limit if specified
543
- if self.audio_num_codebooks is not None:
544
- if audio_in_ids is not None:
545
- audio_in_ids = audio_in_ids[: self.audio_num_codebooks]
546
- if audio_out_ids is not None:
547
- audio_out_ids = audio_out_ids[: self.audio_num_codebooks]
548
- if label_audio_ids is not None:
549
- label_audio_ids = label_audio_ids[: self.audio_num_codebooks]
550
-
551
- return HiggsAudioBatchInput(
552
- input_ids=input_ids,
553
- attention_mask=attention_mask,
554
- audio_features=audio_features,
555
- audio_feature_attention_mask=audio_feature_attention_mask,
556
- audio_out_ids=audio_out_ids,
557
- audio_out_ids_start=audio_out_ids_start,
558
- audio_out_ids_start_group_loc=audio_out_ids_start_group_loc,
559
- audio_in_ids=audio_in_ids,
560
- audio_in_ids_start=audio_in_ids_start,
561
- label_ids=label_ids,
562
- label_audio_ids=label_audio_ids,
563
- reward=reward,
564
- )
565
-
566
-
567
- class HiggsAudioDPOSamplesCollator(HiggsAudioSampleCollator):
568
- def __init__(self, *args, **kwargs):
569
- super().__init__(*args, **kwargs)
570
-
571
- def __call__(self, batch: List[RankedChatMLDatasetSampleTuple]) -> HiggsAudioBatchInput:
572
- # flatten ranked chatml samples
573
- chosen = []
574
- rejected = []
575
-
576
- for sample in batch:
577
- chosen.append(sample.max_score_sample())
578
- rejected.append(sample.min_score_sample())
579
-
580
- merged = chosen
581
- merged.extend(rejected)
582
-
583
- return super().__call__(batch=merged)
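
The long-audio handling in the collator above is driven by one piece of arithmetic: audio longer than the Whisper window (30 s by default) is split into ceil(total_samples / chunk_size_samples) chunks, and the three-token <|audio_bos|><|AUDIO|><|audio_eos|> group is duplicated once per chunk. A small sketch of that bookkeeping (16 kHz matches the Whisper feature extractor's sampling rate; the 75 s duration is an arbitrary example):

import math

sampling_rate = 16_000                      # Whisper feature-extractor rate
chunk_size_seconds = 30                     # collator default
chunk_size_samples = chunk_size_seconds * sampling_rate

total_samples = 75 * sampling_rate          # a 75 second input waveform
num_chunks = math.ceil(total_samples / chunk_size_samples)

audio_tokens = ["<|audio_bos|>", "<|AUDIO|>", "<|audio_eos|>"] * num_chunks
print(num_chunks, len(audio_tokens))        # 3 chunks -> 9 tokens replace the original 3
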
higgs_audio/data_types.py DELETED
@@ -1,38 +0,0 @@
1
- """Basic data types for multimodal ChatML format."""
2
-
3
- from dataclasses import dataclass
4
- from typing import Dict, List, Optional, Union
5
-
6
-
7
- @dataclass
8
- class AudioContent:
9
- audio_url: str
10
- # Base64 encoded audio bytes
11
- raw_audio: Optional[str] = None
12
- offset: Optional[float] = None
13
- duration: Optional[float] = None
14
- row_id: Optional[int] = None
15
- type: str = "audio"
16
-
17
-
18
- @dataclass
19
- class TextContent:
20
- text: str
21
- type: str = "text"
22
-
23
-
24
- @dataclass
25
- class Message:
26
- role: str
27
- content: Union[str, AudioContent, TextContent, List[Union[str, AudioContent, TextContent]]]
28
- recipient: Optional[str] = None
29
-
30
-
31
- @dataclass
32
- class ChatMLSample:
33
- """Dataclass to hold multimodal ChatML data."""
34
-
35
- messages: List[Message]
36
- start_index: Optional[int] = None # We will mask the messages[:start_index] when finetuning the LLM.
37
- misc: Optional[Dict] = None
38
- speaker: Optional[str] = None
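
For reference, a minimal sample built from the dataclasses above, assuming the pre-deletion import path; the message contents are placeholder values:

# Pre-deletion path (assumption).
from higgs_audio.data_types import AudioContent, ChatMLSample, Message, TextContent

sample = ChatMLSample(
    messages=[
        Message(role="system", content="You are a text-to-speech assistant."),
        Message(role="user", content=[TextContent(text="Read this aloud:"),
                                      TextContent(text="Hello world.")]),
        Message(role="assistant", content=AudioContent(audio_url="generated.wav")),
    ],
    speaker="speaker_0",
)
print(len(sample.messages))  # 3
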
higgs_audio/dataset/__init__.py DELETED
File without changes
higgs_audio/dataset/chatml_dataset.py DELETED
@@ -1,554 +0,0 @@
1
- import dacite
2
- import pandas as pd
3
- import torch
4
- import json
5
-
6
- import numpy as np
7
- import multiprocessing as mp
8
-
9
- from dataclasses import dataclass, fields
10
- from abc import ABC, abstractmethod
11
- from typing import Union, List, Dict, Optional
12
-
13
- from ..data_types import ChatMLSample, TextContent, AudioContent
14
- from ..constants import AUDIO_IN_TOKEN, AUDIO_OUT_TOKEN
15
-
16
- from loguru import logger
17
-
18
- # Whisper processor, 30 sec -> 3000 features
19
- # Then we downsample by 4 in the audio tower, reducing the 3000 features to 750, which gives 25 Hz
20
- WHISPER_EMBED_NUM_HIDDEN_STATE_PER_SEC = 25
21
-
22
-
23
- @dataclass
24
- class ChatMLDatasetSample:
25
- input_ids: torch.LongTensor # Shape (seq_len,): The input text tokens.
26
- label_ids: torch.LongTensor # Shape (seq_len,): The label ids.
27
- audio_ids_concat: torch.LongTensor # Shape (num_codebooks, audio_seq_len): The audio tokens that are concatenated.
28
- # Here `audio_seq_len` is the length of the concatenated audio tokens.`
29
- audio_ids_start: (
30
- torch.LongTensor
31
- ) # Shape (num_audios,): The start index of each audio token in the concatenated audio tokens.
32
- audio_waveforms_concat: (
33
- torch.Tensor
34
- ) # Shape (total_wv_length,): The concatenated audio waveforms for audio-in features.
35
- audio_waveforms_start: (
36
- torch.LongTensor
37
- ) # Shape (num_audios,): The start index of each audio waveform in the concatenated audio waveforms.
38
- audio_sample_rate: torch.Tensor # Shape (num_audios,): The sampling rate of the audio waveforms.
39
- audio_speaker_indices: (
40
- torch.LongTensor
41
- ) # Shape (num_audios,): The speaker indices for each audio; -1 means unknown speaker.
42
- audio_label_ids_concat: Optional[torch.LongTensor] = (
43
- None # Shape (num_codebooks, audio_seq_len): The audio tokens that are concatenated.
44
- )
45
- # Here `audio_seq_len` is the length of the concatenated audio tokens.`
46
- reward: Optional[float] = None
47
-
48
- def num_audios(self):
49
- return max(len(self.audio_waveforms_start), len(self.audio_ids_start))
50
-
51
- def get_audio_codes(self, idx):
52
- code_start = self.audio_ids_start[idx]
53
- if idx < len(self.audio_ids_start) - 1:
54
- code_end = self.audio_ids_start[idx + 1]
55
- else:
56
- code_end = self.audio_ids_concat.shape[-1]
57
-
58
- return self.audio_ids_concat[:, code_start:code_end]
59
-
60
- def get_audio_codes_labels(self, idx):
61
- if self.audio_label_ids_concat is None:
62
- return None
63
- code_start = self.audio_ids_start[idx]
64
- if idx < len(self.audio_ids_start) - 1:
65
- code_end = self.audio_ids_start[idx + 1]
66
- else:
67
- code_end = self.audio_ids_concat.shape[-1]
68
-
69
- return self.audio_label_ids_concat[:, code_start:code_end]
70
-
71
- def get_wv(self, idx):
72
- wv_start = self.audio_waveforms_start[idx]
73
- sr = self.audio_sample_rate[idx]
74
- if idx < len(self.audio_waveforms_start) - 1:
75
- wv_end = self.audio_waveforms_start[idx + 1]
76
- else:
77
- wv_end = self.audio_waveforms_concat.shape[-1]
78
- return self.audio_waveforms_concat[wv_start:wv_end], sr
79
-
80
- def cal_num_tokens(
81
- self,
82
- encode_whisper_embed: bool = True,
83
- encode_audio_in_tokens: bool = False,
84
- encode_audio_out_tokens: bool = True,
85
- audio_in_token_id: int = 128015,
86
- audio_out_token_id: int = 128016,
87
- ) -> int:
88
- # we first exclude <|AUDIO|> and <|AUDIO_OUT|> because we do late merging and replace those positions with actual audio features and audio token ids
89
- # It's assumed that we always have audio_ids when audio_waveforms are there (but not vice-versa)
90
- num_tokens = len(self.input_ids) - len(self.audio_ids_start)
91
-
92
- if encode_whisper_embed and len(self.audio_waveforms_concat) > 0:
93
- audio_lengths = torch.diff(self.audio_waveforms_start)
94
- if len(audio_lengths):
95
- # Sum before calling .item()
96
- num_tokens += (
97
- (
98
- np.ceil(WHISPER_EMBED_NUM_HIDDEN_STATE_PER_SEC * audio_lengths / self.audio_sample_rate[:-1])
99
- ).sum()
100
- ).item()
101
- # add the last audio's token estimation
102
- num_tokens += (
103
- np.ceil(
104
- WHISPER_EMBED_NUM_HIDDEN_STATE_PER_SEC
105
- * (self.audio_waveforms_concat.shape[0] - self.audio_waveforms_start[-1])
106
- / self.audio_sample_rate[-1]
107
- )
108
- ).item()
109
-
110
- if self.audio_ids_concat.size(1) > 0:
111
- audio_io_ids = self.input_ids[
112
- (self.input_ids == audio_in_token_id) | (self.input_ids == audio_out_token_id)
113
- ]
114
- audio_io_id_lengths = torch.concat(
115
- [
116
- torch.diff(self.audio_ids_start),
117
- torch.tensor([self.audio_ids_concat.shape[-1] - self.audio_ids_start[-1]]),
118
- ]
119
- )
120
- if encode_audio_in_tokens:
121
- num_tokens += torch.sum(audio_io_id_lengths[audio_io_ids == audio_in_token_id]).item()
122
-
123
- if encode_audio_out_tokens:
124
- num_tokens += torch.sum(audio_io_id_lengths[audio_io_ids == audio_out_token_id]).item()
125
-
126
- return int(num_tokens)
127
-
128
- @classmethod
129
- def merge(
130
- cls,
131
- samples: List["ChatMLDatasetSample"],
132
- eos_token_id: int,
133
- ignore_index: int,
134
- padding_size: Optional[int] = None,
135
- ) -> "ChatMLDatasetSample":
136
- """Merges a list of ChatMLDatasetSample instances, inserting eos_token_id and ignore_index between them, and adjusting offsets for audio_ids_start and audio_waveforms_start.
137
-
138
- Args:
139
- samples (List[ChatMLDatasetSample]): List of samples to merge.
140
- eos_token_id (int): Tokens to be inserted into input_ids between samples.
141
- ignore_index (int): Default label for padding.
142
- padding_size (Optional[int]): If provided, pad the sequence with this many extra tokens.
143
-
144
- Returns:
145
- ChatMLDatasetSample: Merged and potentially padded sample.
146
- """
147
- if not samples:
148
- logger.fatal("The samples list is empty and cannot be merged.")
149
- raise ValueError("The samples list is empty and cannot be merged.")
150
-
151
- # Initialize empty lists for concatenation
152
- input_ids_list = []
153
- label_ids_list = []
154
- audio_ids_concat_list = []
155
- audio_ids_start_list = []
156
- audio_waveforms_concat_list = []
157
- audio_waveforms_start_list = []
158
- audio_sample_rate_list = []
159
- audio_speaker_indices_list = []
160
-
161
- # Track offsets
162
- audio_ids_offset = 0
163
- audio_waveforms_offset = 0
164
-
165
- for sample in samples:
166
- # Add input_ids and label_ids with padding
167
- if input_ids_list:
168
- input_ids_list.append(torch.tensor([eos_token_id], dtype=torch.long))
169
- label_ids_list.append(torch.tensor([ignore_index], dtype=torch.long))
170
- input_ids_list.append(sample.input_ids)
171
- label_ids_list.append(sample.label_ids)
172
-
173
- # Add audio_ids_concat and handle empty audio ids
174
- if sample.audio_ids_concat.size(1) > 0:
175
- audio_ids_concat_list.append(sample.audio_ids_concat)
176
-
177
- # Offset and add audio_ids_start
178
- audio_ids_start_list.append(sample.audio_ids_start + audio_ids_offset)
179
- audio_ids_offset += sample.audio_ids_concat.size(
180
- 1
181
- ) # (num_codebooks, seq_len): Update offset by audio_seq_len
182
-
183
- # Add audio_waveforms_concat
184
- if sample.audio_waveforms_concat.size(0) > 0:
185
- # Check dimensions of the audio waveform to ensure consistency
186
- if (
187
- audio_waveforms_concat_list
188
- and sample.audio_waveforms_concat.dim() != audio_waveforms_concat_list[0].dim()
189
- ):
190
- logger.warning(
191
- f"Skipping audio waveform with inconsistent dimensions: expected {audio_waveforms_concat_list[0].dim()}D, got {sample.audio_waveforms_concat.dim()}D"
192
- )
193
- continue
194
-
195
- audio_waveforms_concat_list.append(sample.audio_waveforms_concat)
196
- audio_waveforms_start_list.append(sample.audio_waveforms_start + audio_waveforms_offset)
197
- audio_waveforms_offset += sample.audio_waveforms_concat.size(0)
198
-
199
- # Add audio_sample_rate and audio_speaker_indices
200
- audio_sample_rate_list.append(sample.audio_sample_rate)
201
-
202
- audio_speaker_indices_list.append(sample.audio_speaker_indices)
203
-
204
- # Concatenate all tensors
205
- input_ids = torch.cat(input_ids_list, dim=0)
206
- label_ids = torch.cat(label_ids_list, dim=0)
207
-
208
- # Apply padding if padding_size is specified
209
- if padding_size is not None and padding_size > 0:
210
- input_ids = torch.cat(
211
- [
212
- input_ids,
213
- torch.full((padding_size,), eos_token_id, dtype=torch.long),
214
- ],
215
- dim=0,
216
- )
217
- label_ids = torch.cat(
218
- [
219
- label_ids,
220
- torch.full((padding_size,), ignore_index, dtype=torch.long),
221
- ],
222
- dim=0,
223
- )
224
-
225
- # Safely concatenate audio tensors with proper error handling
226
- try:
227
- audio_ids_concat = torch.cat(audio_ids_concat_list, dim=1) if audio_ids_concat_list else torch.tensor([[]])
228
- audio_ids_start = torch.cat(audio_ids_start_list, dim=0) if audio_ids_start_list else torch.tensor([])
229
-
230
- # Check for dimensional consistency in audio waveforms
231
- if audio_waveforms_concat_list:
232
- dims = [t.dim() for t in audio_waveforms_concat_list]
233
- if not all(d == dims[0] for d in dims):
234
- # If dimensions don't match, log warning and filter out the problematic tensors
235
- logger.warning(
236
- f"Inconsistent dimensions in audio waveforms: {dims}. Filtering to keep only consistent ones."
237
- )
238
- expected_dim = max(set(dims), key=dims.count) # Most common dimension
239
- audio_waveforms_concat_list = [t for t in audio_waveforms_concat_list if t.dim() == expected_dim]
240
-
241
- # Recalculate audio_waveforms_start with the filtered list
242
- if audio_waveforms_concat_list:
243
- audio_waveforms_offset = 0
244
- audio_waveforms_start_list = []
245
- for waveform in audio_waveforms_concat_list:
246
- audio_waveforms_start_list.append(torch.tensor([audio_waveforms_offset]))
247
- audio_waveforms_offset += waveform.size(0)
248
-
249
- audio_waveforms_concat = (
250
- torch.cat(audio_waveforms_concat_list, dim=0) if audio_waveforms_concat_list else torch.tensor([])
251
- )
252
- audio_waveforms_start = (
253
- torch.cat(audio_waveforms_start_list, dim=0) if audio_waveforms_start_list else torch.tensor([])
254
- )
255
- audio_sample_rate = (
256
- torch.cat(audio_sample_rate_list, dim=0) if audio_sample_rate_list else torch.tensor([])
257
- )
258
- audio_speaker_indices = (
259
- torch.cat(audio_speaker_indices_list, dim=0) if audio_speaker_indices_list else torch.tensor([])
260
- )
261
-
262
- except RuntimeError as e:
263
- logger.error(f"Error during tensor concatenation: {str(e)}")
264
- logger.warning("Falling back to empty audio tensors")
265
- # Fall back to empty tensors
266
- audio_ids_concat = torch.tensor([[]])
267
- audio_ids_start = torch.tensor([])
268
- audio_waveforms_concat = torch.tensor([])
269
- audio_waveforms_start = torch.tensor([])
270
- audio_sample_rate = torch.tensor([])
271
- audio_speaker_indices = torch.tensor([])
272
-
273
- # Create the merged sample
274
- merged_sample = cls(
275
- input_ids=input_ids,
276
- label_ids=label_ids,
277
- audio_ids_concat=audio_ids_concat,
278
- audio_ids_start=audio_ids_start,
279
- audio_waveforms_concat=audio_waveforms_concat,
280
- audio_waveforms_start=audio_waveforms_start,
281
- audio_sample_rate=audio_sample_rate,
282
- audio_speaker_indices=audio_speaker_indices,
283
- )
284
-
285
- return merged_sample
286
-
287
-
288
- @dataclass
289
- class RankedChatMLDatasetSampleTuple:
290
- samples: List[ChatMLDatasetSample]
291
- scores: List[float]
292
-
293
- def max_score_sample(self) -> ChatMLDatasetSample:
294
- idx = self.scores.index(max(self.scores))
295
- self.samples[idx].reward = self.scores[idx]
296
- return self.samples[idx]
297
-
298
- def min_score_sample(self) -> ChatMLDatasetSample:
299
- idx = self.scores.index(min(self.scores))
300
- self.samples[idx].reward = self.scores[idx]
301
- return self.samples[idx]
302
-
303
-
304
- @dataclass
305
- class ChatMLDatasetStorageSample:
306
- input_tokens: torch.LongTensor
307
- label_tokens: torch.LongTensor
308
- audio_bytes_cache_dir_index: int
309
- audio_codes_cache_dir_index: int
310
- audio_bytes_indices: torch.LongTensor
311
- audio_codes_indices: torch.LongTensor
312
- speaker_indices: torch.LongTensor
313
- file_index: int
314
- original_sample_index: int
315
-
316
-
317
- # TODO(sxjscience): We need to revisit the logic about parsing speaker ids.
318
- # Currently, we assume that the speaker id is stored at the "misc" field in ChatMLSample.
319
- def prepare_chatml_sample(sample: Union[ChatMLSample, Dict], tokenizer):
320
- """Preprocess the ChatML sample to get the tokens for the text part.
321
-
322
- Args:
323
- sample (ChatMLSample): The ChatML sample to preprocess.
324
- tokenizer: The tokenizer to use for encoding the text.
325
-
326
- """
327
-
328
- try:
329
- if not isinstance(sample, ChatMLSample):
330
- # Handle all fields that could be NaN
331
- if "speaker" in sample and pd.isna(sample["speaker"]):
332
- sample["speaker"] = None
333
- if "start_index" in sample and pd.isna(sample["start_index"]):
334
- sample["start_index"] = None
335
- if "content" in sample and pd.isna(sample["content"]):
336
- sample["content"] = ""
337
-
338
- # Convert any other potential NaN values in nested structures
339
- def convert_nan_to_none(obj):
340
- import numpy as np
341
-
342
- if isinstance(obj, (pd.Series, np.ndarray)):
343
- return obj.tolist()
344
- elif pd.api.types.is_scalar(obj) and pd.isna(obj):
345
- return None
346
- elif isinstance(obj, dict):
347
- return {k: convert_nan_to_none(v) for k, v in obj.items()}
348
- elif isinstance(obj, (list, tuple)): # Fixed: Handle both list and tuple
349
- return [convert_nan_to_none(item) for item in obj]
350
- return obj
351
-
352
- # Clean the sample data
353
- clean_sample = convert_nan_to_none(sample)
354
-
355
- val_keys = []
356
- for field in fields(ChatMLSample):
357
- if field.name in clean_sample:
358
- val_keys.append(field.name)
359
- clean_sample = {k: clean_sample[k] for k in val_keys}
360
-
361
- try:
362
- sample = dacite.from_dict(
363
- data_class=ChatMLSample,
364
- data=clean_sample,
365
- config=dacite.Config(strict=True, check_types=True),
366
- )
367
- except Exception as e:
368
- print(f"Failed to convert to ChatMLSample: {e}")
369
- print(f"Clean sample: {json.dumps(clean_sample, indent=2)}")
370
- return None, None, None, None
371
-
372
- input_tokens = []
373
- label_tokens = []
374
- audio_contents = []
375
- speaker_id = None
376
- if sample.speaker is not None:
377
- speaker_id = sample.speaker
378
- elif sample.misc is not None:
379
- if "speaker" in sample.misc:
380
- speaker_id = sample.misc["speaker"]
381
-
382
- total_m = len(sample.messages)
383
- for turn_id, message in enumerate(sample.messages):
384
- role = message.role
385
- recipient = message.recipient
386
- content = message.content
387
- content_l = []
388
-
389
- if isinstance(content, str):
390
- content_l.append(TextContent(text=content))
391
- elif isinstance(content, TextContent):
392
- content_l.append(content)
393
- elif isinstance(content, AudioContent):
394
- content_l.append(content)
395
- elif isinstance(content, list):
396
- for ele in content:
397
- if isinstance(ele, str):
398
- content_l.append(TextContent(text=ele))
399
- else:
400
- content_l.append(ele)
401
- if turn_id == 0:
402
- prefix = f"<|begin_of_text|><|start_header_id|>{role}<|end_header_id|>\n\n"
403
- else:
404
- prefix = f"<|start_header_id|>{role}<|end_header_id|>\n\n"
405
- eot_postfix = "<|eot_id|>"
406
- eom_postfix = "<|eom_id|>"
407
-
408
- prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
409
- input_tokens.extend(prefix_tokens)
410
- label_tokens.extend([-100 for _ in prefix_tokens])
411
-
412
- if recipient:
413
- assert role == "assistant", "Recipient is only available for assistant role."
414
- recipient_tokens = tokenizer.encode(f"{recipient}<|recipient|>", add_special_tokens=False)
415
- input_tokens.extend(recipient_tokens)
416
- label_tokens.extend(recipient_tokens)
417
-
418
- for content in content_l:
419
- if content.type == "text":
420
- text_tokens = tokenizer.encode(content.text, add_special_tokens=False)
421
- input_tokens.extend(text_tokens)
422
- if role == "assistant" and (sample.start_index is None or turn_id >= sample.start_index):
423
- label_tokens.extend(text_tokens)
424
- else:
425
- label_tokens.extend([-100 for _ in text_tokens])
426
-
427
- elif content.type == "audio":
428
- # Generate the text-part of the audio tokens
429
- audio_contents.append(content)
430
- if role == "user" or role == "system":
431
- # Add the text tokens
432
- text_tokens = tokenizer.encode(
433
- f"<|audio_bos|><|AUDIO|><|audio_eos|>",
434
- add_special_tokens=False,
435
- )
436
- input_tokens.extend(text_tokens)
437
- label_tokens.extend([-100 for _ in text_tokens])
438
- elif role == "assistant":
439
- # Add the text tokens for audio-out part.
440
- text_tokens = tokenizer.encode(
441
- f"<|audio_out_bos|><|AUDIO_OUT|><|audio_eos|>",
442
- add_special_tokens=False,
443
- )
444
- input_tokens.extend(text_tokens)
445
- if sample.start_index is None or turn_id >= sample.start_index:
446
- label_tokens.extend(text_tokens)
447
- else:
448
- label_tokens.extend([-100 for _ in text_tokens])
449
- next_id = turn_id + 1
450
- if role == "assistant" and next_id != total_m and sample.messages[next_id].role == "assistant":
451
- postfix_tokens = tokenizer.encode(eom_postfix, add_special_tokens=False)
452
- input_tokens.extend(postfix_tokens)
453
- else:
454
- postfix_tokens = tokenizer.encode(eot_postfix, add_special_tokens=False)
455
- input_tokens.extend(postfix_tokens)
456
- if role == "assistant" and (sample.start_index is None or turn_id >= sample.start_index):
457
- label_tokens.extend(postfix_tokens)
458
- else:
459
- label_tokens.extend([-100 for _ in postfix_tokens])
460
-
461
- return input_tokens, label_tokens, audio_contents, speaker_id
462
-
463
- except Exception as e:
464
- print(f"Error in prepare_chatml_sample: {str(e)}")
465
- print(f"Sample data: {json.dumps(sample, indent=2)}")
466
- return None, None, None, None
467
-
468
-
469
- def extract_generation_prompt_from_input_tokens(input_tokens, tokenizer):
470
- """Extract the generation prompt and reference answer from the input tokens.
471
-
472
- For example:
473
-
474
- Input Text = '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n
475
- What words do you hear from the provided audio? Write it down for me.<|audio_bos|><|AUDIO|><|audio_eos|><|eot_id|>
476
- <|start_header_id|>assistant<|end_header_id|>\n\nAt first they went by quick, too quick to even get.<|eot_id|>'
477
-
478
- -->
479
-
480
- Prompt = '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n
481
- What words do you hear from the provided audio? Write it down for me.<|audio_bos|><|AUDIO|><|audio_eos|><|eot_id|>
482
- <|start_header_id|>assistant<|end_header_id|>\n\n',
483
- Reference = 'At first they went by quick, too quick to even get.'
484
-
485
- Args:
486
- input_tokens: The input tokens.
487
- audio_contents: The audio contents.
488
- tokenizer: The tokenizer to use for decoding the text.
489
-
490
- Returns:
491
- prompt_tokens: The tokens for the prompt.
492
- reference_answer: The reference answer.
493
- num_audios_in_reference: The number of audios in the reference answer.
494
-
495
- """
496
- input_text = tokenizer.decode(input_tokens)
497
- generation_prefix = "<|start_header_id|>assistant<|end_header_id|>\n\n"
498
- postfix = "<|eot_id|>"
499
- assert generation_prefix in input_text
500
- generation_prompt_end_loc = input_text.rfind(generation_prefix) + len(generation_prefix)
501
- generation_prompt = input_text[:generation_prompt_end_loc]
502
- reference_answer = input_text[generation_prompt_end_loc : input_text.find(postfix, generation_prompt_end_loc)]
503
- num_audios_in_reference = reference_answer.count(AUDIO_IN_TOKEN) + reference_answer.count(AUDIO_OUT_TOKEN)
504
- return (
505
- tokenizer.encode(generation_prompt, add_special_tokens=False),
506
- reference_answer,
507
- num_audios_in_reference,
508
- )
509
-
510
-
511
- def prepare_chatml_dataframe_single_process(df, tokenizer):
512
- """Prepare the ChatML DataFrame."""
513
- ret = []
514
- for _, row in df.iterrows():
515
- input_tokens, label_tokens, audio_contents, speaker_id = prepare_chatml_sample(row.to_dict(), tokenizer)
516
- ret.append((input_tokens, label_tokens, audio_contents, speaker_id))
517
- return ret
518
-
519
-
520
- def prepare_chatml_dataframe(df, tokenizer, num_process=16):
521
- if num_process is None:
522
- return prepare_chatml_dataframe_single_process(df, tokenizer)
523
- else:
524
- num_process = max(min(len(df) // 1000, num_process), 1)
525
- workloads = np.array_split(df, num_process)
526
- with mp.Pool(num_process) as pool:
527
- ret = pool.starmap(
528
- prepare_chatml_dataframe_single_process,
529
- [(workload, tokenizer) for workload in workloads],
530
- )
531
- return sum(ret, [])
532
-
533
-
534
- class DatasetInterface(ABC):
535
- @abstractmethod
536
- def __getitem__(self, idx) -> Union["ChatMLDatasetSample", "RankedChatMLDatasetSampleTuple"]:
537
- """Retrieve a dataset sample by index."""
538
- raise NotImplementedError
539
-
540
-
541
- class IterableDatasetInterface(ABC):
542
- @abstractmethod
543
- def __iter__(
544
- self,
545
- ) -> Union["ChatMLDatasetSample", "RankedChatMLDatasetSampleTuple"]:
546
- """Retrieve a sample by iterating through the dataset."""
547
- raise NotImplementedError
548
-
549
-
550
- @dataclass
551
- class DatasetInfo:
552
- dataset_type: str
553
- group_type: Optional[str] = None
554
- mask_text: Optional[bool] = None # Whether to mask the text tokens for pretraining samples.
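For reference, the prompt/reference split performed by the removed extract_generation_prompt_from_input_tokens can be sketched on a plain decoded string; token ids and the Llama-3 tokenizer are elided here, and the literals below are illustrative only.

# Minimal sketch of the split described in the docstring above, done on a plain
# string (the removed helper operated on token ids plus a tokenizer).
generation_prefix = "<|start_header_id|>assistant<|end_header_id|>\n\n"
postfix = "<|eot_id|>"

decoded = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
    "What words do you hear from the provided audio?"
    "<|audio_bos|><|AUDIO|><|audio_eos|><|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
    "At first they went by quick, too quick to even get.<|eot_id|>"
)

# The prompt ends right after the last assistant header; the reference runs
# up to the following <|eot_id|>.
end = decoded.rfind(generation_prefix) + len(generation_prefix)
prompt = decoded[:end]
reference = decoded[end:decoded.find(postfix, end)]
print(reference)  # At first they went by quick, too quick to even get.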
higgs_audio/model/__init__.py DELETED
@@ -1,9 +0,0 @@
1
- from transformers import AutoConfig, AutoModel
2
-
3
- from .configuration_higgs_audio import HiggsAudioConfig, HiggsAudioEncoderConfig
4
- from .modeling_higgs_audio import HiggsAudioModel
5
-
6
-
7
- AutoConfig.register("higgs_audio_encoder", HiggsAudioEncoderConfig)
8
- AutoConfig.register("higgs_audio", HiggsAudioConfig)
9
- AutoModel.register(HiggsAudioConfig, HiggsAudioModel)
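With the registrations above in place, the Higgs-Audio classes could be resolved through the Auto classes. A minimal, hypothetical usage sketch; the checkpoint path is a placeholder, not a real repo id.

# Assumes the (now removed) package is installed so the registration runs on import.
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("path/to/higgs_audio_checkpoint")  # resolves to HiggsAudioConfig
model = AutoModel.from_config(config)                                  # resolves to HiggsAudioModel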
higgs_audio/model/audio_head.py DELETED
@@ -1,139 +0,0 @@
1
- """Projector that maps hidden states from the LLM component to multimodal logits."""
2
-
3
- import torch
4
- from torch import nn
5
-
6
- from dataclasses import dataclass
7
- from typing import Optional, Tuple
8
-
9
- from .common import HiggsAudioPreTrainedModel
10
- from .configuration_higgs_audio import HiggsAudioConfig
11
-
12
-
13
- @dataclass
14
- class HiggsAudioDecoderLayerOutput:
15
- logits: torch.FloatTensor
16
- audio_logits: torch.FloatTensor
17
- attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
18
- past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
19
-
20
-
21
- class HiggsAudioDecoderProjector(HiggsAudioPreTrainedModel):
22
- """Projection layers that map hidden states from the LLM component to audio / text logits.
23
-
24
- We support two types of audio heads:
25
- - Basic Audio Head:
26
- Directly map the hidden states to audio logits for all the codebooks.
27
- """
28
-
29
- def __init__(self, config: HiggsAudioConfig, layer_idx: Optional[int] = None):
30
- super().__init__(config)
31
- self.text_lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
32
- self.audio_lm_head = nn.Linear(
33
- config.text_config.hidden_size,
34
- config.audio_num_codebooks * (config.audio_codebook_size + 2),
35
- bias=False,
36
- )
37
-
38
- # Initialize weights and apply final processing
39
- self.post_init()
40
-
41
- def forward(
42
- self,
43
- hidden_states,
44
- audio_out_mask,
45
- label_audio_ids=None,
46
- attention_mask=None,
47
- position_ids=None,
48
- past_key_values=None,
49
- use_cache=None,
50
- output_attentions=None,
51
- output_hidden_states=None,
52
- output_audio_hidden_states=False,
53
- cache_position=None,
54
- ):
55
- """
56
- Args:
57
- hidden_states (`torch.Tensor` of shape `(batch_size, seq_len, hidden_size)`):
58
- Hidden states from the LLM component
59
- audio_out_mask (`torch.Tensor` of shape `(batch_size, seq_len)`):
60
- Mask for identifying the audio out tokens.
61
- label_audio_ids (`torch.Tensor` of shape `(num_codebooks, num_audio_out_tokens)`):
62
- Label tokens for the audio-out part. This is used for calculating the logits if RQ-Transformer is used.
63
- attention_mask (`torch.Tensor` of shape `(batch_size, seq_len)`):
64
- Mask to avoid performing attention on padding token indices
65
- position_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
66
- Position ids for the input tokens
67
-
68
- Returns:
69
- logits (`torch.Tensor` of shape `(batch_size, seq_len, vocab_size)`):
70
- Logits for text tokens
71
- audio_logits (`torch.Tensor` of shape `(num_audio_out_tokens, audio_num_codebooks * audio_codebook_size)`):
72
- Logits for audio tokens. We ensure `num_text_tokens + num_audio_tokens == batch_size * seq_len`
73
- """
74
- logits = self.text_lm_head(hidden_states)
75
-
76
- all_hidden_states = () if output_hidden_states else None
77
- all_self_attns = () if output_attentions else None
78
- next_decoder_cache = None
79
-
80
- # TODO(sxjscience) Need to check if DeepSpeed Zero3 supports zero-shape input.
81
- if self.config.audio_decoder_proj_num_layers > 0:
82
- # create position embeddings to be shared across the decoder layers
83
- position_embeddings = self.rotary_emb(hidden_states, position_ids)
84
- for decoder_layer in self.transformer_layers:
85
- if output_hidden_states:
86
- all_hidden_states += (hidden_states,)
87
-
88
- if self.gradient_checkpointing and self.training:
89
- layer_outputs = self._gradient_checkpointing_func(
90
- decoder_layer.__call__,
91
- hidden_states,
92
- attention_mask,
93
- position_ids,
94
- past_key_values,
95
- output_attentions,
96
- use_cache,
97
- cache_position,
98
- position_embeddings,
99
- )
100
- else:
101
- layer_outputs = decoder_layer(
102
- hidden_states,
103
- attention_mask=attention_mask,
104
- position_ids=position_ids,
105
- past_key_value=past_key_values,
106
- output_attentions=output_attentions,
107
- use_cache=use_cache,
108
- cache_position=cache_position,
109
- position_embeddings=position_embeddings,
110
- )
111
- hidden_states = layer_outputs[0]
112
- hidden_states = self.norm(hidden_states)
113
-
114
- if output_hidden_states:
115
- all_hidden_states += (hidden_states,)
116
-
117
- if output_attentions:
118
- all_self_attns += (layer_outputs[1],)
119
-
120
- if use_cache:
121
- next_decoder_cache = layer_outputs[2 if output_attentions else 1]
122
-
123
- next_cache = next_decoder_cache if use_cache else None
124
-
125
- audio_logits = self.audio_lm_head(hidden_states[audio_out_mask])
126
-
127
- if output_audio_hidden_states:
128
- audio_hidden_states = hidden_states[audio_out_mask]
129
- else:
130
- audio_hidden_states = None
131
-
132
- return (
133
- logits,
134
- audio_logits,
135
- all_self_attns,
136
- all_hidden_states,
137
- audio_hidden_states,
138
- next_cache,
139
- )
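For illustration, the basic audio head described in the docstring reduces to a single linear projection applied at the audio-out positions. A toy-sized sketch; the dimensions are illustrative, not the released model's.

import torch
from torch import nn

hidden_size, num_codebooks, codebook_size = 64, 12, 1024
audio_lm_head = nn.Linear(hidden_size, num_codebooks * (codebook_size + 2), bias=False)

hidden_states = torch.randn(2, 10, hidden_size)   # (batch, seq, hidden) from the LLM
audio_out_mask = torch.zeros(2, 10, dtype=torch.bool)
audio_out_mask[:, 5:] = True                      # pretend the last 5 positions are audio-out

audio_logits = audio_lm_head(hidden_states[audio_out_mask])             # (num_audio_out_tokens, C * (K + 2))
audio_logits = audio_logits.view(-1, num_codebooks, codebook_size + 2)  # per-codebook logits
print(audio_logits.shape)  # torch.Size([10, 12, 1026])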
higgs_audio/model/common.py DELETED
@@ -1,27 +0,0 @@
1
- from torch import nn
2
-
3
- from transformers.modeling_utils import PreTrainedModel
4
-
5
- from .configuration_higgs_audio import HiggsAudioConfig
6
-
7
-
8
- class HiggsAudioPreTrainedModel(PreTrainedModel):
9
- config_class = HiggsAudioConfig
10
- base_model_prefix = "model"
11
- supports_gradient_checkpointing = True
12
- _no_split_modules = []
13
- _skip_keys_device_placement = "past_key_values"
14
- _supports_flash_attn_2 = True
15
- _supports_sdpa = True
16
-
17
- def _init_weights(self, module):
18
- std = self.config.init_std if hasattr(self.config, "init_std") else self.config.audio_encoder_config.init_std
19
-
20
- if isinstance(module, (nn.Linear, nn.Conv1d)):
21
- module.weight.data.normal_(mean=0.0, std=std)
22
- if module.bias is not None:
23
- module.bias.data.zero_()
24
- elif isinstance(module, nn.Embedding):
25
- module.weight.data.normal_(mean=0.0, std=std)
26
- if module.padding_idx is not None:
27
- module.weight.data[module.padding_idx].zero_()
higgs_audio/model/configuration_higgs_audio.py DELETED
@@ -1,235 +0,0 @@
1
- from transformers.configuration_utils import PretrainedConfig
2
- from transformers.models.auto import CONFIG_MAPPING
3
-
4
-
5
- class HiggsAudioEncoderConfig(PretrainedConfig):
6
- """Configuration of the Audio encoder in Higgs-Audio."""
7
-
8
- model_type = "higgs_audio_encoder"
9
-
10
- def __init__(
11
- self,
12
- num_mel_bins=128,
13
- encoder_layers=32,
14
- encoder_attention_heads=20,
15
- encoder_ffn_dim=5120,
16
- encoder_layerdrop=0.0,
17
- d_model=1280,
18
- dropout=0.0,
19
- attention_dropout=0.0,
20
- activation_function="gelu",
21
- activation_dropout=0.0,
22
- scale_embedding=False,
23
- init_std=0.02,
24
- max_source_positions=1500,
25
- pad_token_id=128001,
26
- **kwargs,
27
- ):
28
- super().__init__(**kwargs)
29
-
30
- self.num_mel_bins = num_mel_bins
31
- self.d_model = d_model
32
- self.encoder_layers = encoder_layers
33
- self.encoder_attention_heads = encoder_attention_heads
34
- self.encoder_ffn_dim = encoder_ffn_dim
35
- self.dropout = dropout
36
- self.attention_dropout = attention_dropout
37
- self.activation_function = activation_function
38
- self.activation_dropout = activation_dropout
39
- self.encoder_layerdrop = encoder_layerdrop
40
- self.num_hidden_layers = encoder_layers
41
- self.init_std = init_std
42
- self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
43
- self.max_source_positions = max_source_positions
44
- self.pad_token_id = pad_token_id
45
-
46
-
47
- class HiggsAudioConfig(PretrainedConfig):
48
- r"""
49
- This is the configuration class for the HiggsAudioModel.
50
-
51
- Args:
52
- text_config (`Union[AutoConfig, dict]`):
53
- The config object or dictionary of the text backbone.
54
- audio_encoder_config (`Union[AutoConfig, dict]`):
55
- The config object or dictionary of the whisper encoder.
56
- The audio encoder is bidirectional and is only used for audio understanding.
57
- audio_tokenizer_config
58
- The config object or dictionary of the audio tokenizer.
59
- audio_adapter_type
60
- The type of audio adapter to use. We support three types of adapters:
61
- - stack:
62
- We stack additional Transformer layers after the main LLM backbone for audio generation.
63
- - dual_ffn:
64
- For selected layers of the LLM backbone, we replace the text FFN with a dual-FFN architecture
65
- that contains an additional audio FFN. The audio FFN will be triggered when the location is marked for audio tokens.
66
- - dual_ffn_fast_forward:
67
- We pick a few layers in the LLM backbone to plug-in the audio FFN. For the remaining layers,
68
- the audio hidden states are directly fast-forwarded to the next layer.
69
- This reduces the computational cost for audio generation.
70
- audio_embed_avg (`bool`, *optional*, defaults to False):
71
- Whether to average the audio embeddings before sending them to the text attention layer.
72
- audio_ffn_hidden_size
73
- The hidden size of the audio feedforward network in dual-path FFN
74
- audio_ffn_intermediate_size
75
- The intermediate size of the audio feedforward network in dual-path FFN
76
- audio_dual_ffn_layers
77
- The layers in the LLM backbone to plug-in the dual FFN layer (mixture of audio FFN and text FFN).
78
- audio_decoder_proj_num_layers (`int`, *optional*, defaults to 0):
79
- The number of Transformer layers in the audio decoder projection module.
80
- use_delay_pattern (`bool`, *optional*, defaults to False):
81
- Whether to use delay pattern in the audio decoder.
82
- skip_audio_tower (`bool`, *optional*, defaults to False):
83
- Whether to skip the audio tower in the audio encoder.
84
- use_audio_out_embed_projector (`bool`, *optional*, defaults to False):
85
- Whether to use an embedding projector to map audio out embeddings.
86
- use_audio_out_self_attention (`bool`, *optional*, defaults to False):
87
- Whether to use self-attention to aggregate information from audio-tokens before sending to the text attention layer.
88
- audio_num_codebooks (`int`, *optional*, defaults to 12):
89
- The number of codebooks in RVQGAN.
90
- audio_codebook_size (`int`, *optional*, defaults to 1024):
91
- The size of each codebook in RVQGAN.
92
- audio_stream_bos_id
93
- The id of the bos in the audio stream
94
- audio_stream_eos_id
95
- The id of the eos in the audio stream
96
- audio_bos_token (`str`, *optional*, defaults to "<|audio_bos|>"):
97
- The special `<|audio_bos|>` token. In Higgs-Audio, it is mapped to 128011,
98
- which is the index of `<|reserved_special_token_3|>` in Llama-3.1-8B-Instruct's tokenizer.
99
- audio_eos_token (`str`, *optional*, defaults to "<|audio_eos|>"):
100
- The special `<|audio_eos|>` token. We use 128012 as the default value,
101
- which is the index of `<|reserved_special_token_4|>` in Llama-3.1-8B-Instruct's tokenizer.
102
- audio_out_bos_token (`str`, *optional*, defaults to "<|audio_out_bos|>"):
103
- The special `<|audio_out_bos|>` token. We use 128013 as the default value,
104
- which is the index of `<|reserved_special_token_5|>` in Llama-3.1-8B-Instruct's tokenizer.
105
- audio_token (`str`, *optional*, defaults to "<|AUDIO|>"):
106
- The special `<|AUDIO|>` token. We use 128015 as the default value,
107
- which is the index of `<|reserved_special_token_7|>` in Llama-3.1-8B-Instruct's tokenizer.
108
- This token indicates that the location should be filled in with whisper features.
109
- audio_out_token (`str`, *optional*, defaults to "<|AUDIO_OUT|>"):
110
- The special `<|AUDIO_OUT|>` token. We use 128016 as the default value,
111
- which is the index of `<|reserved_special_token_8|>` in Llama-3.1-8B-Instruct's tokenizer.
112
- This token indicates that the location should be filled in with audio tokens extracted via audio tokenizer.
113
- """
114
-
115
- model_type = "higgs_audio"
116
- is_composition = True
117
-
118
- def __init__(
119
- self,
120
- text_config=None,
121
- audio_encoder_config=None,
122
- audio_tokenizer_config=None,
123
- audio_adapter_type="stack",
124
- audio_embed_avg=False,
125
- audio_ffn_hidden_size=4096,
126
- audio_ffn_intermediate_size=14336,
127
- audio_dual_ffn_layers=None,
128
- audio_decoder_proj_num_layers=0,
129
- encode_whisper_embed=True,
130
- encode_audio_in_tokens=False,
131
- use_delay_pattern=False,
132
- skip_audio_tower=False,
133
- use_audio_out_embed_projector=False,
134
- use_audio_out_self_attention=False,
135
- use_rq_transformer=False,
136
- rq_transformer_hidden_size=None,
137
- rq_transformer_intermediate_size=None,
138
- rq_transformer_num_attention_heads=None,
139
- rq_transformer_num_key_value_heads=None,
140
- rq_transformer_num_hidden_layers=3,
141
- audio_num_codebooks=12,
142
- audio_codebook_size=1024,
143
- audio_stream_bos_id=1024,
144
- audio_stream_eos_id=1025,
145
- audio_bos_token="<|audio_bos|>",
146
- audio_eos_token="<|audio_eos|>",
147
- audio_out_bos_token="<|audio_out_bos|>",
148
- audio_in_token="<|AUDIO|>",
149
- audio_out_token="<|AUDIO_OUT|>",
150
- audio_in_token_idx=128015,
151
- audio_out_token_idx=128016,
152
- pad_token_id=128001,
153
- audio_out_bos_token_id=128013,
154
- audio_eos_token_id=128012,
155
- **kwargs,
156
- ):
157
- if isinstance(audio_encoder_config, dict):
158
- audio_encoder_config["model_type"] = (
159
- audio_encoder_config["model_type"] if "model_type" in audio_encoder_config else "higgs_audio_encoder"
160
- )
161
- audio_encoder_config = CONFIG_MAPPING[audio_encoder_config["model_type"]](**audio_encoder_config)
162
- elif audio_encoder_config is None:
163
- audio_encoder_config = HiggsAudioEncoderConfig()
164
-
165
- if isinstance(text_config, dict):
166
- text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
167
- text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
168
- elif text_config is None:
169
- text_config = CONFIG_MAPPING["llama"]()
170
-
171
- assert audio_adapter_type in [
172
- "stack",
173
- "dual_ffn",
174
- "dual_ffn_fast_forward",
175
- ], f"Invalid audio adapter type: {audio_adapter_type}"
176
- if audio_adapter_type.startswith("dual_ffn"):
177
- assert audio_dual_ffn_layers is not None, (
178
- "audio_dual_ffn_layers must be specified when using dual_ffn adapter."
179
- )
180
- self.text_config = text_config
181
- self.audio_encoder_config = audio_encoder_config
182
- self.audio_tokenizer_config = audio_tokenizer_config
183
- self.audio_adapter_type = audio_adapter_type
184
- self.audio_embed_avg = audio_embed_avg
185
- self.audio_ffn_hidden_size = audio_ffn_hidden_size
186
- self.audio_ffn_intermediate_size = audio_ffn_intermediate_size
187
- self.audio_dual_ffn_layers = audio_dual_ffn_layers
188
- self.audio_decoder_proj_num_layers = audio_decoder_proj_num_layers
189
- self.encode_whisper_embed = encode_whisper_embed
190
- self.encode_audio_in_tokens = encode_audio_in_tokens
191
- self.use_delay_pattern = use_delay_pattern
192
- self.skip_audio_tower = skip_audio_tower
193
- self.use_audio_out_embed_projector = use_audio_out_embed_projector
194
- self.use_audio_out_self_attention = use_audio_out_self_attention
195
-
196
- self.use_rq_transformer = use_rq_transformer
197
-
198
- if self.use_rq_transformer:
199
- assert not self.use_delay_pattern, "Delay pattern is not supported if you turned on RQ-Transformer!"
200
- self.rq_transformer_hidden_size = rq_transformer_hidden_size
201
- self.rq_transformer_intermediate_size = rq_transformer_intermediate_size
202
- self.rq_transformer_num_attention_heads = rq_transformer_num_attention_heads
203
- self.rq_transformer_num_key_value_heads = rq_transformer_num_key_value_heads
204
- self.rq_transformer_num_hidden_layers = rq_transformer_num_hidden_layers
205
-
206
- if use_rq_transformer:
207
- # For RQ-Transformer, we set the hidden_size to the same as the text model's hidden size if it is not specified.
208
- if self.rq_transformer_hidden_size is None:
209
- self.rq_transformer_hidden_size = text_config.hidden_size
210
- assert self.rq_transformer_hidden_size % 128 == 0
211
- if self.rq_transformer_intermediate_size is None:
212
- self.rq_transformer_intermediate_size = text_config.intermediate_size
213
- if self.rq_transformer_num_attention_heads is None:
214
- self.rq_transformer_num_attention_heads = self.rq_transformer_hidden_size // 128
215
- if self.rq_transformer_num_key_value_heads is None:
216
- self.rq_transformer_num_key_value_heads = self.rq_transformer_hidden_size // 128 // 4
217
- assert self.rq_transformer_hidden_size % self.rq_transformer_num_attention_heads == 0
218
- assert self.rq_transformer_hidden_size % self.rq_transformer_num_key_value_heads == 0
219
-
220
- self.audio_num_codebooks = audio_num_codebooks
221
- self.audio_codebook_size = audio_codebook_size
222
- self.audio_bos_token = audio_bos_token
223
- self.audio_eos_token = audio_eos_token
224
- self.audio_out_bos_token = audio_out_bos_token
225
- self.audio_in_token = audio_in_token
226
- self.audio_out_token = audio_out_token
227
- self.audio_in_token_idx = audio_in_token_idx
228
- self.audio_out_token_idx = audio_out_token_idx
229
- self.audio_stream_bos_id = audio_stream_bos_id
230
- self.audio_stream_eos_id = audio_stream_eos_id
231
- self.audio_out_bos_token_id = audio_out_bos_token_id
232
- self.audio_eos_token_id = audio_eos_token_id
233
-
234
- super().__init__(**kwargs)
235
- self.pad_token_id = pad_token_id
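A construction sketch for the dual-FFN adapter described in the docstring, assuming the removed configuration module is still importable as it was before this commit; the layer indices are arbitrary examples, not the released model's values.

# Module removed in this commit; import shown for illustration only.
from higgs_audio.model.configuration_higgs_audio import HiggsAudioConfig

config = HiggsAudioConfig(
    audio_adapter_type="dual_ffn",
    audio_dual_ffn_layers=[8, 16, 24],  # required whenever a dual_ffn* adapter is chosen
    audio_num_codebooks=12,
    audio_codebook_size=1024,
)
print(config.audio_adapter_type, config.audio_dual_ffn_layers)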
higgs_audio/model/cuda_graph_runner.py DELETED
@@ -1,129 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- from typing import Optional, List, Dict, Tuple, Union
4
- import gc
5
-
6
- from transformers.cache_utils import Cache
7
-
8
-
9
- _NUM_WARMUP_ITERS = 2
10
-
11
-
12
- class CUDAGraphRunner(nn.Module):
13
- def __init__(self, model):
14
- super().__init__()
15
- self.model = model
16
-
17
- self.input_buffers: Dict[str, torch.Tensor] = {}
18
- self.output_buffers: Dict[str, torch.Tensor] = {}
19
-
20
- self._graph: Optional[torch.cuda.CUDAGraph] = None
21
-
22
- @property
23
- def graph(self):
24
- assert self._graph is not None
25
- return self._graph
26
-
27
- def capture(
28
- self,
29
- hidden_states: torch.Tensor,
30
- causal_mask: torch.Tensor,
31
- position_ids: torch.Tensor,
32
- audio_discrete_codes_mask: torch.Tensor,
33
- cache_position: torch.Tensor,
34
- past_key_values: Union[Cache, List[torch.FloatTensor]],
35
- use_cache: bool,
36
- audio_attention_mask: torch.Tensor,
37
- fast_forward_attention_mask: torch.Tensor,
38
- output_attentions: bool,
39
- output_hidden_states: bool,
40
- is_decoding_audio_token: Optional[bool] = None,
41
- is_using_cuda_graph: Optional[bool] = False,
42
- stream: torch.cuda.Stream = None,
43
- memory_pool: Optional[Tuple[int, int]] = None,
44
- ):
45
- assert self._graph is None
46
- # Run warmup iterations
47
- for _ in range(_NUM_WARMUP_ITERS):
48
- self.model(
49
- hidden_states=hidden_states,
50
- causal_mask=causal_mask,
51
- position_ids=position_ids,
52
- audio_discrete_codes_mask=audio_discrete_codes_mask,
53
- cache_position=cache_position,
54
- past_key_values=past_key_values,
55
- use_cache=use_cache,
56
- audio_attention_mask=audio_attention_mask,
57
- fast_forward_attention_mask=fast_forward_attention_mask,
58
- output_attentions=output_attentions,
59
- output_hidden_states=output_hidden_states,
60
- is_decoding_audio_token=is_decoding_audio_token,
61
- is_using_cuda_graph=is_using_cuda_graph,
62
- )
63
-
64
- torch.cuda.synchronize()
65
-
66
- # Capture the graph
67
- self._graph = torch.cuda.CUDAGraph()
68
- with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream):
69
- out_hidden_states, all_hidden_states, all_self_attns = self.model(
70
- hidden_states=hidden_states,
71
- causal_mask=causal_mask,
72
- position_ids=position_ids,
73
- audio_discrete_codes_mask=audio_discrete_codes_mask,
74
- cache_position=cache_position,
75
- past_key_values=past_key_values,
76
- use_cache=use_cache,
77
- audio_attention_mask=audio_attention_mask,
78
- fast_forward_attention_mask=fast_forward_attention_mask,
79
- output_attentions=output_attentions,
80
- output_hidden_states=output_hidden_states,
81
- is_decoding_audio_token=is_decoding_audio_token,
82
- is_using_cuda_graph=is_using_cuda_graph,
83
- )
84
- # hidden_states_out = torch.ops._C.weak_ref_tensor(outputs[0])
85
- # del outputs
86
- gc.collect()
87
- torch.cuda.synchronize()
88
-
89
- # Save input and output buffers
90
- self.input_buffers = {
91
- "hidden_states": hidden_states,
92
- "causal_mask": causal_mask,
93
- "position_ids": position_ids,
94
- "audio_discrete_codes_mask": audio_discrete_codes_mask,
95
- "cache_position": cache_position,
96
- "past_key_values": past_key_values,
97
- "audio_attention_mask": audio_attention_mask,
98
- "fast_forward_attention_mask": fast_forward_attention_mask,
99
- }
100
- self.output_buffers = {
101
- "hidden_states": out_hidden_states,
102
- "all_hidden_states": all_hidden_states,
103
- "all_self_attns": all_self_attns,
104
- }
105
-
106
- def forward(
107
- self,
108
- hidden_states: torch.Tensor,
109
- causal_mask: torch.Tensor,
110
- position_ids: torch.Tensor,
111
- audio_discrete_codes_mask: torch.Tensor,
112
- cache_position: torch.Tensor,
113
- audio_attention_mask: torch.Tensor,
114
- fast_forward_attention_mask: torch.Tensor,
115
- **kwargs,
116
- ) -> torch.Tensor:
117
- # Copy input tensors to buffers
118
- self.input_buffers["hidden_states"].copy_(hidden_states, non_blocking=True)
119
- self.input_buffers["causal_mask"].copy_(causal_mask, non_blocking=True)
120
- self.input_buffers["position_ids"].copy_(position_ids, non_blocking=True)
121
- self.input_buffers["audio_discrete_codes_mask"].copy_(audio_discrete_codes_mask, non_blocking=True)
122
- self.input_buffers["cache_position"].copy_(cache_position, non_blocking=True)
123
- self.input_buffers["audio_attention_mask"].copy_(audio_attention_mask, non_blocking=True)
124
- self.input_buffers["fast_forward_attention_mask"].copy_(fast_forward_attention_mask, non_blocking=True)
125
-
126
- # Run the captured graph
127
- self.graph.replay()
128
-
129
- return self.output_buffers["hidden_states"], None, None
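The capture/replay flow above follows the standard torch.cuda.CUDAGraph recipe: warm up on a side stream, capture into fixed buffers, then copy new inputs into those buffers and replay. A generic, self-contained sketch of that pattern (not the runner's exact API):

import torch

if torch.cuda.is_available():
    model = torch.nn.Linear(16, 16).cuda()
    static_in = torch.randn(8, 16, device="cuda")

    # Warm up on a side stream, as required before capture.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        for _ in range(2):
            model(static_in)
    torch.cuda.current_stream().wait_stream(s)

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_out = model(static_in)

    static_in.copy_(torch.randn(8, 16, device="cuda"))  # new data into the same buffer
    graph.replay()                                       # static_out now holds the new result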
higgs_audio/model/custom_modules.py DELETED
@@ -1,155 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
-
5
- class PartiallyFrozenEmbedding(nn.Module):
6
- """Split an existing `nn.Embedding` module that splits the embedding into:
7
-
8
- - A frozen embedding for indices [0..freeze_until_idx].
9
- - A trainable embedding for indices [freeze_until_idx+1..vocab_size-1].
10
-
11
- This should work with both Zero-2 and Zero-3 seamlessly
12
- """
13
-
14
- def __init__(self, original_embedding: nn.Embedding, freeze_until_idx: int):
15
- """
16
- :param original_embedding: An instance of nn.Embedding (the original embedding layer).
17
- :param freeze_until_idx: The index up to which the embedding is frozen (exclusive); the row at freeze_until_idx itself stays trainable.
18
- """
19
- super().__init__()
20
- self.freeze_until_idx = freeze_until_idx
21
- self.original_vocab_size = original_embedding.num_embeddings
22
- self.embedding_dim = original_embedding.embedding_dim
23
-
24
- # Split the original embedding into frozen and trainable parts
25
- self.embedding_frozen = nn.Embedding(
26
- freeze_until_idx,
27
- self.embedding_dim,
28
- dtype=original_embedding.weight.dtype,
29
- device=original_embedding.weight.device,
30
- )
31
- self.embedding_trainable = nn.Embedding(
32
- self.original_vocab_size - freeze_until_idx,
33
- self.embedding_dim,
34
- dtype=original_embedding.weight.dtype,
35
- device=original_embedding.weight.device,
36
- )
37
-
38
- # Copy weights from the original embedding into the frozen and trainable parts
39
- with torch.no_grad():
40
- self.embedding_frozen.weight.copy_(original_embedding.weight[:freeze_until_idx])
41
- self.embedding_trainable.weight.copy_(original_embedding.weight[freeze_until_idx:])
42
-
43
- # Freeze the frozen embedding
44
- self.embedding_frozen.weight.requires_grad = False
45
-
46
- def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
47
- """
48
- Forward pass for the split embedding wrapper.
49
- :param input_ids: Tensor of shape [batch_size, seq_len] with indices in [0..original_vocab_size-1].
50
- """
51
- # Masks to separate frozen and trainable indices
52
- # (bsz, seq_len)
53
- mask_frozen = input_ids < self.freeze_until_idx
54
- mask_trainable = ~mask_frozen
55
-
56
- # Output tensor for embedding results
57
- batch_size, seq_len = input_ids.shape
58
- embeddings = torch.zeros(
59
- batch_size,
60
- seq_len,
61
- self.embedding_dim,
62
- device=input_ids.device,
63
- dtype=self.embedding_frozen.weight.dtype,
64
- )
65
-
66
- # Handle frozen embedding
67
- if mask_frozen.any():
68
- frozen_ids = input_ids[mask_frozen]
69
- frozen_emb = self.embedding_frozen(frozen_ids)
70
- embeddings[mask_frozen] = frozen_emb
71
-
72
- # Handle trainable embedding
73
- if mask_trainable.any():
74
- # Adjust trainable IDs to the local index space of the trainable embedding
75
- trainable_ids = input_ids[mask_trainable] - (self.freeze_until_idx)
76
- trainable_emb = self.embedding_trainable(trainable_ids)
77
- embeddings[mask_trainable] = trainable_emb
78
-
79
- return embeddings
80
-
81
- def to_unsplit(self) -> nn.Embedding:
82
- unsplit_embedding = nn.Embedding(
83
- self.original_vocab_size,
84
- self.embedding_dim,
85
- dtype=self.embedding_frozen.weight.dtype,
86
- device=self.embedding_frozen.weight.device,
87
- )
88
-
89
- with torch.no_grad():
90
- unsplit_embedding.weight[: self.freeze_until_idx].copy_(self.embedding_frozen.weight)
91
- unsplit_embedding.weight[self.freeze_until_idx :].copy_(self.embedding_trainable.weight)
92
-
93
- return unsplit_embedding
94
-
95
-
96
- class PartiallyFrozenLinear(nn.Module):
97
- """A wrapper around nn.Linear to partially freeze part of the weight matrix."""
98
-
99
- def __init__(self, original_linear: nn.Linear, freeze_until_idx: int):
100
- """
101
- :param original_linear: The original nn.Linear layer.
102
- :param freeze_until_idx: The row index up to which the weight matrix is frozen (exclusive).
103
- """
104
- super().__init__()
105
- assert original_linear.bias is None, "Currently only support linear module without bias"
106
-
107
- self.freeze_until_idx = freeze_until_idx
108
- self.input_dim = original_linear.in_features
109
- self.output_dim = original_linear.out_features
110
-
111
- # Create frozen and trainable linear layers
112
- self.linear_frozen = nn.Linear(
113
- self.input_dim,
114
- freeze_until_idx,
115
- bias=False,
116
- dtype=original_linear.weight.dtype,
117
- device=original_linear.weight.device,
118
- )
119
- self.linear_trainable = nn.Linear(
120
- self.input_dim,
121
- self.output_dim - freeze_until_idx,
122
- bias=False,
123
- dtype=original_linear.weight.dtype,
124
- device=original_linear.weight.device,
125
- )
126
-
127
- # Copy weights from the original linear layer
128
- with torch.no_grad():
129
- self.linear_frozen.weight.copy_(original_linear.weight[:freeze_until_idx])
130
- self.linear_trainable.weight.copy_(original_linear.weight[freeze_until_idx:])
131
-
132
- # Freeze the frozen linear layer
133
- self.linear_frozen.weight.requires_grad = False
134
-
135
- def forward(self, input_tensor):
136
- # input_tensor: (bsz, seq_len, hidden_state_dim)
137
- frozen_output = self.linear_frozen(input_tensor)
138
- trainable_output = self.linear_trainable(input_tensor)
139
- return torch.cat((frozen_output, trainable_output), dim=-1)
140
-
141
- def to_unsplit(self) -> nn.Linear:
142
- unsplit_linear = nn.Linear(
143
- self.input_dim,
144
- self.output_dim,
145
- bias=False,
146
- dtype=self.linear_frozen.weight.dtype,
147
- device=self.linear_frozen.weight.device,
148
- )
149
-
150
- # Copy weights from the frozen and trainable layers into the unsplit linear layer
151
- with torch.no_grad():
152
- unsplit_linear.weight[: self.freeze_until_idx].copy_(self.linear_frozen.weight)
153
- unsplit_linear.weight[self.freeze_until_idx :].copy_(self.linear_trainable.weight)
154
-
155
- return unsplit_linear
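A usage sketch for PartiallyFrozenEmbedding, assuming the removed module is importable as it was before this commit; the vocabulary sizes are toy values chosen only to show the frozen/trainable split.

# Module removed in this commit; import shown for illustration only.
import torch
from torch import nn
from higgs_audio.model.custom_modules import PartiallyFrozenEmbedding

base = nn.Embedding(110, 32)                    # e.g. 100 original rows + 10 newly added rows
split = PartiallyFrozenEmbedding(base, freeze_until_idx=100)

out = split(torch.tensor([[1, 99, 100, 109]]))  # mixes frozen and trainable ids
print(out.shape)                                       # torch.Size([1, 4, 32])
print(split.embedding_frozen.weight.requires_grad)     # False
print(split.embedding_trainable.weight.requires_grad)  # True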
higgs_audio/model/modeling_higgs_audio.py DELETED
The diff for this file is too large to render. See raw diff
 
higgs_audio/model/utils.py DELETED
@@ -1,778 +0,0 @@
1
- import contextlib
2
- from contextlib import contextmanager
3
- from functools import wraps
4
- import torch
5
- from transformers.integrations import is_deepspeed_available
6
-
7
- if is_deepspeed_available():
8
- from deepspeed.utils import groups as deepspeed_groups
9
- from deepspeed.sequence.layer import _SeqAllToAll
10
- else:
11
- deepspeed_groups = None
12
- _SeqAllToAll = None
13
-
14
-
15
- def _ceil_to_nearest(n, round_to):
16
- return (n + round_to - 1) // round_to * round_to
17
-
18
-
19
- def count_parameters(model, trainable_only=True):
20
- if trainable_only:
21
- return sum(p.numel() for p in model.parameters() if p.requires_grad)
22
- else:
23
- return sum(p.numel() for p in model.parameters())
24
-
25
-
26
- # TODO(sxjscience) Consider to move the function to audio_processing/utils.py
27
- def build_delay_pattern_mask(
28
- input_ids: torch.LongTensor,
29
- bos_token_id: int,
30
- pad_token_id: int,
31
- ):
32
- """Implement the delay pattern proposed in "Simple and Controllable Music Generation", https://arxiv.org/pdf/2306.05284
33
-
34
- In the delay pattern, each codebook is offset from the previous codebook by
35
- one step. We insert a special delay token at the start of each delayed codebook row, and append a pad token once the sequence finishes.
36
-
37
- Take the example where there are 4 codebooks and audio sequence length=5. After shifting, the output should have length seq_len + num_codebooks - 1
38
-
39
- - [ *, *, *, *, *, P, P, P]
40
- - [ B, *, *, *, *, *, P, P]
41
- - [ B, B, *, *, *, *, *, P]
42
- - [ B, B, B, *, *, *, *, *]
43
-
44
- where B indicates the delay token id, P is the special padding token id, and `*` indicates an original audio token.
45
-
46
- Now let's consider the case where we have a sequence of audio tokens to condition on.
47
- The audio tokens were originally in the following non-delayed form:
48
-
49
- - [a, b]
50
- - [c, d]
51
- - [e, f]
52
- - [g, h]
53
-
54
- After conversion, we get the following delayed form:
55
- - [a, b, -1, -1, -1]
56
- - [B, c, d, -1, -1]
57
- - [B, B, e, f, -1]
58
- - [B, B, B, g, h]
59
-
60
- Note that we have a special token `-1` that indicates it should be replaced by a new token we see in the generation phase.
61
- In that case, we should override the `-1` tokens in auto-regressive generation.
62
-
63
- Args:
64
- input_ids (:obj:`torch.LongTensor`):
65
- The input ids of the prompt. It will have shape (bsz, num_codebooks, seq_len).
66
- bos_token_id (:obj:`int`):
67
- The id of the special delay token
68
- pad_token_id (:obj:`int`):
69
- The id of the padding token. Should be the same as eos_token_id.
70
-
71
- Returns:
72
- input_ids (:obj:`torch.LongTensor`):
73
- The transformed input ids with delay pattern applied. It will have shape (bsz, num_codebooks, seq_len + num_codebooks - 1).
74
- input_ids_with_gen_mask (:obj:`torch.LongTensor`):
75
- The transformed input ids with delay pattern applied. The -1 in the output indicates new tokens that should be generated.
76
-
77
- """
78
- bsz, num_codebooks, seq_len = input_ids.shape
79
-
80
- new_seq_len = seq_len + num_codebooks - 1
81
- input_ids_with_gen_mask = torch.ones((bsz, num_codebooks, new_seq_len), dtype=torch.long, device=input_ids.device)
82
- bos_mask = torch.tril(input_ids_with_gen_mask, -1) > 0
83
- eos_mask = torch.triu(input_ids_with_gen_mask, seq_len) > 0
84
- input_ids_with_gen_mask[bos_mask] = bos_token_id
85
- input_ids_with_gen_mask[(~bos_mask) & (~eos_mask)] = input_ids.reshape(-1)
86
- input_ids = input_ids_with_gen_mask.clone()
87
- input_ids[eos_mask] = pad_token_id
88
- input_ids_with_gen_mask[eos_mask] = -1
89
- return input_ids, input_ids_with_gen_mask
90
-
91
-
92
- def revert_delay_pattern(data):
93
- """Convert samples encoded with delay pattern back to the original form.
94
-
95
- Args:
96
- data (:obj:`torch.Tensor`):
97
- The data with delay pattern applied. It will have shape (num_codebooks, seq_len + num_codebooks - 1).
98
-
99
- Returns:
100
- ret (:obj:`torch.Tensor`):
101
- Recovered data with delay pattern removed. It will have shape (num_codebooks, seq_len).
102
- """
103
- assert len(data.shape) == 2
104
- out_l = []
105
- num_codebooks = data.shape[0]
106
- for i in range(num_codebooks):
107
- out_l.append(data[i : (i + 1), i : (data.shape[1] - num_codebooks + 1 + i)])
108
- return torch.cat(out_l, dim=0)
109
-
110
-
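A small round-trip check of the two helpers above, assuming they are importable from the pre-commit module; the ids 1024 and 1025 match the audio_stream_bos_id / audio_stream_eos_id defaults in the removed configuration.

# Module removed in this commit; import shown for illustration only.
import torch
from higgs_audio.model.utils import build_delay_pattern_mask, revert_delay_pattern

codes = torch.arange(8).reshape(1, 4, 2)  # (bsz=1, num_codebooks=4, seq_len=2)
delayed, with_gen_mask = build_delay_pattern_mask(codes, bos_token_id=1024, pad_token_id=1025)
print(delayed.shape)                       # (1, 4, 5) == seq_len + num_codebooks - 1
print(torch.equal(revert_delay_pattern(delayed[0]), codes[0]))  # True: the shift is invertible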
111
- def merge_input_ids_with_audio_features(
112
- audio_features_embed,
113
- audio_features_length,
114
- audio_in_embed,
115
- audio_in_ids_start,
116
- audio_out_embed,
117
- audio_out_ids_start,
118
- audio_in_token_idx,
119
- audio_out_token_idx,
120
- inputs_embeds,
121
- input_ids,
122
- attention_mask,
123
- label_ids,
124
- pad_token_id,
125
- ignore_index=-100,
126
- round_to=8,
127
- left_padding=True,
128
- ):
129
- """
130
- Merge input_ids with audio features into final embeddings.
131
-
132
- Args:
133
- audio_features_embed (`torch.Tensor` of shape `(num_audios, max_audio_tokens, embed_dim)`):
134
- Encoded vectors of all audios in the batch (obtained from the semantic encoder)
135
- audio_features_length (`torch.LongTensor` of shape `(num_audios,)`):
136
- The length of audio embeddings of each audio as stacked in `audio_features_embed`
137
- audio_in_embed (`torch.Tensor` of shape `(total_num_audio_in_tokens, embed_dim)`):
138
- The embeddings of audio-in tokens
139
- audio_in_ids_start (`torch.LongTensor` of shape `(num_audios,)`):
140
- The start index of the audio-in tokens for each audio
141
- audio_out_embed (`torch.Tensor` of shape `(total_num_audio_out_tokens, embed_dim)`):
142
- The embeddings of audio-out tokens
143
- audio_out_ids_start (`torch.LongTensor` of shape `(num_audios,)`):
144
- The start index of the audio-out tokens for each audio
145
- audio_in_token_idx
146
- The index of the audio-in token in the vocabulary
147
- audio_out_token_idx
148
- The index of the audio-out token in the vocabulary
149
- inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, embed_dim)`):
150
- Token embeddings before merging with audio embeddings
151
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
152
- Input_ids of tokens, possibly filled with audio token
153
- attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
154
- Mask to avoid performing attention on padding token indices.
155
- label_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*)
156
- labels need to be recalculated to support training (if provided)
157
- pad_token_id (`int`):
158
- The index of the pad token in the vocabulary
159
- ignore_index
160
- The index to ignore in the loss calculation
161
- round_to
162
- The number to round to for padding
163
- left_padding
164
- Whether to apply left padding
165
-
166
- Returns:
167
- final_embedding
168
- The final embeddings after merging audio embeddings with text embeddings.
169
- final_attention_mask
170
- The final attention mask after merging audio embeddings with text embeddings.
171
- final_labels
172
- The labels for the text stream
173
- position_ids
174
- Positional ids for the merged data
175
- final_input_ids
176
- The final input_ids after merging audio embeddings with text embeddings.
177
- final_audio_in_mask
178
- Mask for audio-in embeddings
179
- final_audio_in_discrete_codes_mask
180
- Mask for audio-in discrete tokens
181
- final_audio_out_mask
182
- Mask for audio-out embeddings
183
-
184
- Explanation:
185
- each audio has variable length embeddings, with length specified by
186
- - audio_features_length
187
- - audio_in_ids_start
188
- - audio_out_ids_start
189
-
190
- Task:
191
- - fill each <|AUDIO|> with audio embeddings (it can be the combination of embeddings extracted by WhisperEncoder and embeddings from audio codebooks)
192
- - fill each <|AUDIO_OUT|> with the audio-out embeddings
193
-
194
- Example:
195
- <|AUDIO_OUT|>: X (5 tokens), Y (3 tokens)
196
- <|AUDIO|>: Z (8 tokens)
197
-
198
- X, Y are in the same sequence (in-context voice-clone). Z is in a different sequence (audio understanding).
199
- if right padding
200
- input_ids: [
201
- a b c d e f X g h i j k Y l m
202
- o p q r Z s t u v _ _ _ _ _ _
203
- ]
204
- input_ids should be: [
205
- a b c d e f X X X X X g h i j k Y Y Y l m
206
- o p q r Z Z Z Z Z Z Z Z s t u v _ _ _ _ _
207
- ]
208
- labels should be: [
209
- a b c d e f _ _ _ _ _ g h i j k _ _ _ l m
210
- o p q r _ _ _ _ _ _ _ _ s t u v _ _ _ _ _
211
- ]
212
- elif left padding
213
- input_ids: [
214
- a b c d e f X g h i j k Y l m
215
- _ _ _ _ _ _ o p q r Z s t u v
216
- ]
217
- input_ids should be: [
218
- a b c d e f X X X X X g h i j k Y Y Y l m
219
- _ _ _ _ _ o p q r Z Z Z Z Z Z Z Z s t u v
220
- ]
221
- labels should be: [
222
- a b c d e f _ _ _ _ _ g h i j k _ _ _ l m
223
- _ _ _ _ _ o p q r _ _ _ _ _ _ _ _ s t u v
224
- ]
225
-
226
- """
227
- if label_ids is None:
228
- skip_labels = True
229
- else:
230
- skip_labels = False
231
- if audio_features_embed is not None and audio_features_embed.shape[0] == 0:
232
- audio_features_embed = None
233
- if audio_in_embed is not None and audio_in_embed.shape[0] == 0:
234
- audio_in_embed = None
235
- if audio_out_embed is not None and audio_out_embed.shape[0] == 0:
236
- audio_out_embed = None
237
-
238
- batch_size, sequence_length, embed_dim = inputs_embeds.shape
239
-
240
- target_device = inputs_embeds.device
241
- if left_padding is None:
242
- left_padding = torch.any(attention_mask[:, 0] == 0)
243
-
244
- audio_in_token_mask = input_ids == audio_in_token_idx
245
- audio_out_token_mask = input_ids == audio_out_token_idx
246
- text_token_mask = (input_ids != audio_in_token_idx) & (input_ids != audio_out_token_idx)
247
-
248
- # 1. Calculate the number of tokens for each placeholder (like [<|AUDIO|>, <|AUDIO_OUT|>]).
249
- token_placeholder_num = torch.ones_like(input_ids)
250
-
251
- if audio_features_embed is not None:
252
- num_audios, max_audio_tokens, _ = audio_features_embed.shape
253
- audio_in_features_mask = torch.arange(max_audio_tokens).expand(num_audios, max_audio_tokens).to(
254
- audio_features_length.device
255
- ) < audio_features_length.unsqueeze(1)
256
- masked_audio_in_features = audio_features_embed[audio_in_features_mask].view(-1, embed_dim)
257
- token_placeholder_num[audio_in_token_mask] = audio_features_length.long()
258
-
259
- if audio_in_embed is not None:
260
- audio_in_codes_length = torch.concat(
261
- [
262
- audio_in_ids_start[1:] - audio_in_ids_start[:-1],
263
- torch.tensor(
264
- [audio_in_embed.shape[0] - audio_in_ids_start[-1]],
265
- device=audio_in_ids_start.device,
266
- dtype=torch.long,
267
- ),
268
- ],
269
- dim=0,
270
- )
271
- if audio_features_embed is not None:
272
- token_placeholder_num[audio_in_token_mask] += audio_in_codes_length.long()
273
- else:
274
- token_placeholder_num[audio_in_token_mask] = audio_in_codes_length.long()
275
-
276
- if audio_out_embed is not None:
277
- audio_out_codes_length = torch.concat(
278
- [
279
- audio_out_ids_start[1:] - audio_out_ids_start[:-1],
280
- torch.tensor(
281
- [audio_out_embed.shape[0] - audio_out_ids_start[-1]],
282
- device=audio_out_ids_start.device,
283
- dtype=torch.long,
284
- ),
285
- ],
286
- dim=0,
287
- )
288
- token_placeholder_num[audio_out_token_mask] = audio_out_codes_length.long()
289
-
290
- new_token_positions = torch.cumsum(token_placeholder_num, -1) - 1
291
- max_token_num = _ceil_to_nearest(token_placeholder_num.sum(-1).max(), round_to)
292
- nb_audio_pad = max_token_num - 1 - new_token_positions[:, -1]
293
-
294
- if left_padding:
295
- new_token_positions += nb_audio_pad[:, None] # offset for left padding
296
-
297
- # 2. Create the full embedding, already padded to the maximum position
298
- final_embedding = torch.zeros(
299
- (batch_size, max_token_num, embed_dim),
300
- dtype=inputs_embeds.dtype,
301
- device=inputs_embeds.device,
302
- )
303
- final_attention_mask = torch.zeros(
304
- (batch_size, max_token_num),
305
- dtype=attention_mask.dtype,
306
- device=inputs_embeds.device,
307
- )
308
- final_input_ids = torch.full(
309
- (batch_size, max_token_num),
310
- pad_token_id,
311
- dtype=input_ids.dtype,
312
- device=inputs_embeds.device,
313
- )
314
- if skip_labels:
315
- final_labels = None
316
- else:
317
- final_labels = torch.full(
318
- (batch_size, max_token_num),
319
- ignore_index,
320
- dtype=label_ids.dtype,
321
- device=inputs_embeds.device,
322
- )
323
-
324
- final_audio_in_mask = torch.full(
325
- (batch_size, max_token_num),
326
- False,
327
- dtype=torch.bool,
328
- device=inputs_embeds.device,
329
- )
330
- final_audio_in_discrete_codes_mask = torch.full(
331
- (batch_size, max_token_num),
332
- False,
333
- dtype=torch.bool,
334
- device=inputs_embeds.device,
335
- )
336
- final_audio_out_mask = torch.full(
337
- (batch_size, max_token_num),
338
- False,
339
- dtype=torch.bool,
340
- device=inputs_embeds.device,
341
- )
342
- # 3. Get the audio-in token positions and audio-out token positions
343
- batch_id = torch.arange(batch_size, device=target_device).unsqueeze(1).expand(batch_size, sequence_length)
344
- audio_in_batch_id = batch_id[audio_in_token_mask] # Shape (num_audio_in,)
345
- audio_out_batch_id = batch_id[audio_out_token_mask] # Shape (num_audio_out,)
346
- audio_features_token_ends = new_token_positions[audio_in_token_mask] # Shape (num_audio_in,)
347
- audio_out_embed_ends = new_token_positions[audio_out_token_mask] # Shape (num_audio_out,)
348
-
349
- if audio_in_embed is not None:
350
- # Fill in the audio-in embeddings
351
- seq_indices = (
352
- torch.arange(max_token_num, device=target_device)
353
- .unsqueeze(0)
354
- .expand(audio_in_ids_start.shape[0], max_token_num)
355
- )
356
- audio_in_embed_token_starts = audio_features_token_ends - audio_in_codes_length + 1
357
- batch_indices, col_indices = torch.where(
358
- (seq_indices >= audio_in_embed_token_starts.unsqueeze(1))
359
- & (seq_indices <= audio_features_token_ends.unsqueeze(1))
360
- )
361
- batch_indices = audio_in_batch_id[batch_indices]
362
- final_embedding[batch_indices, col_indices] = audio_in_embed
363
- final_input_ids[batch_indices, col_indices] = audio_in_token_idx
364
- if not skip_labels:
365
- final_labels[batch_indices, col_indices] = ignore_index
366
- final_audio_in_mask[batch_indices, col_indices] = True
367
- final_audio_in_discrete_codes_mask[batch_indices, col_indices] = True
368
- audio_features_token_ends = audio_features_token_ends - audio_in_codes_length
369
-
370
- if audio_features_embed is not None:
371
- # Fill in the audio features
372
- seq_indices = (
373
- torch.arange(max_token_num, device=target_device)
374
- .unsqueeze(0)
375
- .expand(audio_features_embed.shape[0], max_token_num)
376
- )
377
- audio_features_token_starts = audio_features_token_ends - audio_features_length + 1
378
- batch_indices, col_indices = torch.where(
379
- (seq_indices >= audio_features_token_starts.unsqueeze(1))
380
- & (seq_indices <= audio_features_token_ends.unsqueeze(1))
381
- )
382
- batch_indices = audio_in_batch_id[batch_indices]
383
- final_embedding[batch_indices, col_indices] = masked_audio_in_features
384
- final_input_ids[batch_indices, col_indices] = audio_in_token_idx
385
- if not skip_labels:
386
- final_labels[batch_indices, col_indices] = ignore_index
387
- final_audio_in_mask[batch_indices, col_indices] = True
388
-
389
- if audio_out_embed is not None:
390
- # Fill in the audio-out embeddings
391
- seq_indices = (
392
- torch.arange(max_token_num, device=target_device)
393
- .unsqueeze(0)
394
- .expand(audio_out_ids_start.shape[0], max_token_num)
395
- )
396
- audio_out_embed_token_starts = audio_out_embed_ends - audio_out_codes_length + 1
397
- batch_indices, col_indices = torch.where(
398
- (seq_indices >= audio_out_embed_token_starts.unsqueeze(1))
399
- & (seq_indices <= audio_out_embed_ends.unsqueeze(1))
400
- )
401
- batch_indices = audio_out_batch_id[batch_indices]
402
- final_embedding[batch_indices, col_indices] = audio_out_embed
403
- final_input_ids[batch_indices, col_indices] = audio_out_token_idx
404
- if not skip_labels:
405
- final_labels[batch_indices, col_indices] = ignore_index
406
- final_audio_out_mask[batch_indices, col_indices] = True
407
-
408
- # Fill in the original text embeddings and labels
409
- batch_indices, non_audio_indices = torch.where(text_token_mask)
410
- text_to_overwrite = new_token_positions[batch_indices, non_audio_indices]
411
- final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_audio_indices]
412
- if not skip_labels:
413
- final_labels[batch_indices, text_to_overwrite] = label_ids[batch_indices, non_audio_indices]
414
- final_input_ids[batch_indices, text_to_overwrite] = input_ids[batch_indices, non_audio_indices]
415
- final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_audio_indices]
416
- final_attention_mask = final_attention_mask | final_audio_in_mask | final_audio_out_mask
417
-
418
- # Trim the tensor if there are redundant padding tokens
419
- if left_padding:
420
- first_non_zero_loc = final_attention_mask.sum(0).nonzero()[0]
421
- first_non_zero_loc = (first_non_zero_loc // round_to) * round_to
422
- if first_non_zero_loc > 0:
423
- final_attention_mask = final_attention_mask[:, first_non_zero_loc:]
424
- final_embedding = final_embedding[:, first_non_zero_loc:]
425
- if not skip_labels:
426
- final_labels = final_labels[:, first_non_zero_loc:]
427
- final_input_ids = final_input_ids[:, first_non_zero_loc:]
428
- final_audio_in_mask = final_audio_in_mask[:, first_non_zero_loc:]
429
- final_audio_in_discrete_codes_mask = final_audio_in_discrete_codes_mask[:, first_non_zero_loc:]
430
- final_audio_out_mask = final_audio_out_mask[:, first_non_zero_loc:]
431
- else:
432
- # We have done right padding, so we need to trim the mask
433
- last_non_zero_loc = final_attention_mask.sum(0).nonzero()[-1] + 1
434
- last_non_zero_loc = ((last_non_zero_loc + round_to - 1) // round_to) * round_to
435
- if last_non_zero_loc < max_token_num:
436
- final_attention_mask = final_attention_mask[:, :last_non_zero_loc]
437
- final_embedding = final_embedding[:, :last_non_zero_loc]
438
- if not skip_labels:
439
- final_labels = final_labels[:, :last_non_zero_loc]
440
- final_input_ids = final_input_ids[:, :last_non_zero_loc]
441
- final_audio_in_mask = final_audio_in_mask[:, :last_non_zero_loc]
442
- final_audio_in_discrete_codes_mask = final_audio_in_discrete_codes_mask[:, :last_non_zero_loc]
443
- final_audio_out_mask = final_audio_out_mask[:, :last_non_zero_loc]
444
-
445
- position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
446
- return (
447
- final_embedding,
448
- final_attention_mask,
449
- final_labels,
450
- position_ids,
451
- final_input_ids,
452
- final_audio_in_mask,
453
- final_audio_in_discrete_codes_mask,
454
- final_audio_out_mask,
455
- )
456
-
457
-
458
- def is_deepspeed_ulysses_enabled():
459
- """Check if sequence parallelism is enabled."""
460
- if deepspeed_groups is None:
461
- return False
462
-
463
- return deepspeed_groups._get_sequence_parallel_world_size() > 1
464
-
465
-
466
- def support_deepspeed_ulysses(module):
467
- """A decorator around Pytorch module. It is needed for the module that needs access to sequence parallel info."""
468
- module._sp_size = None
469
- module._sp_rank = None
470
- module._sp_group = None
471
-
472
- @property
473
- def sp_size(self):
474
- if self._sp_size is None:
475
- self._sp_size = 1
476
- if is_deepspeed_ulysses_enabled():
477
- self._sp_size = deepspeed_groups._get_sequence_parallel_group().size()
478
- return self._sp_size
479
-
480
- @property
481
- def sp_rank(self):
482
- if self._sp_rank is None:
483
- self._sp_rank = 0
484
- if is_deepspeed_ulysses_enabled():
485
- self._sp_rank = deepspeed_groups._get_sequence_parallel_rank()
486
- return self._sp_rank
487
-
488
- @property
489
- def sp_group(self):
490
- if self._sp_group is None and is_deepspeed_ulysses_enabled():
491
- self._sp_group = deepspeed_groups._get_sequence_parallel_group()
492
- return self._sp_group
493
-
494
- module.sp_size = sp_size
495
- module.sp_rank = sp_rank
496
- module.sp_group = sp_group
497
-
498
- return module
499
-
500
-
501
- def deepspeed_ulysses_attention(seq_dim=1, head_dim=2):
502
- """Perform all-to-all before and after the attention function."""
503
-
504
- def attention_decorator(attn_func=None):
505
- def wrapped(*args, **kwargs):
506
- if is_deepspeed_ulysses_enabled():
507
- sp_group = deepspeed_groups._get_sequence_parallel_group()
508
- scatter_idx = head_dim # Scatter on num_heads dimension
509
- gather_idx = seq_dim # Gather on seq_len dimension
510
- batch_dim_idx = 0
511
- args = list(args)
512
- args[0] = _SeqAllToAll.apply(sp_group, args[0], scatter_idx, gather_idx, batch_dim_idx)
513
- args[1] = _SeqAllToAll.apply(sp_group, args[1], scatter_idx, gather_idx, batch_dim_idx)
514
- args[2] = _SeqAllToAll.apply(sp_group, args[2], scatter_idx, gather_idx, batch_dim_idx)
515
- args = tuple(args)
516
-
517
- attn_output = attn_func(*args, **kwargs)
518
-
519
- if is_deepspeed_ulysses_enabled():
520
- scatter_idx = seq_dim # Scatter back on seq_len dimension
521
- gather_idx = head_dim # Gather on num_heads dimension
522
- batch_dim_idx = 0
523
- attn_output = _SeqAllToAll.apply(sp_group, attn_output, scatter_idx, gather_idx, batch_dim_idx)
524
-
525
- return attn_output
526
-
527
- return wrapped
528
-
529
- return attention_decorator
530
-
531
-
532
- def deepspeed_ulysses_rope(state_seq_dim=2, trig_seq_dim=1):
533
- """Slice the corresponding cos and sin chunks for rope."""
534
-
535
- def rope_decorator(rope_func=None):
536
- def wrapped(*args, **kwargs):
537
- if is_deepspeed_ulysses_enabled():
538
- sp_rank = deepspeed_groups._get_sequence_parallel_rank()
539
- args = list(args)
540
- seq_chunk_size = args[0].size(state_seq_dim)
541
- args[2] = torch.narrow(args[2], trig_seq_dim, sp_rank * seq_chunk_size, seq_chunk_size)
542
- args[3] = torch.narrow(args[3], trig_seq_dim, sp_rank * seq_chunk_size, seq_chunk_size)
543
- args = tuple(args)
544
-
545
- return rope_func(*args, **kwargs)
546
-
547
- return wrapped
548
-
549
- return rope_decorator
550
-
551
-
552
- def _gather_tensors(input_, group=None):
553
- """Gather tensors and concatenate them along a dimension."""
554
- input_ = input_.contiguous()
555
- world_size = torch.distributed.get_world_size(group)
556
- if world_size == 1:
557
- return input_
558
- tensor_shapes = [
559
- torch.empty(len(input_.size()), dtype=torch.int64, device=input_.device) for _ in range(world_size)
560
- ]
561
- input_size = torch.tensor(input_.size(), dtype=torch.int64, device=input_.device)
562
- torch.distributed.all_gather(tensor_shapes, input_size, group=group)
563
- gathered_buffers = [
564
- torch.empty(tensor_shapes[i].tolist(), dtype=input_.dtype, device=input_.device) for i in range(world_size)
565
- ]
566
- torch.distributed.all_gather(gathered_buffers, input_, group=group)
567
- return gathered_buffers
568
-
569
-
570
- def _scatter_tensors(input_, group=None):
571
- """Scatter tensors."""
572
- world_size = torch.distributed.get_world_size(group)
573
- if world_size == 1:
574
- return input_
575
- rank = torch.distributed.get_rank(group)
576
- return input_[rank]
577
-
578
-
579
- class _GatherTensors(torch.autograd.Function):
580
- """All gather tensors among the ranks."""
581
-
582
- @staticmethod
583
- def symbolic(graph, input_, group):
584
- return _gather_tensors(input_, group)
585
-
586
- @staticmethod
587
- def forward(ctx, input_, group):
588
- ctx.group = group
589
- return torch.nested.as_nested_tensor(_gather_tensors(input_, group), layout=torch.jagged)
590
-
591
- @staticmethod
592
- def backward(ctx, grad_output):
593
- return _scatter_tensors(grad_output, ctx.group), None
594
-
595
-
596
- def all_gather_tensors(input_, size=None, dim=0, group=None):
597
- if torch.distributed.get_world_size(group) == 1:
598
- # no sequence parallelism
599
- return input_
600
- gathered_tensors = _GatherTensors.apply(input_, group)
601
-
602
- if size:
603
- split_gathered_tensors = []
604
- for s, gathered_tensor in zip(size, gathered_tensors):
605
- split_gathered_tensor = torch.split(gathered_tensor, s.tolist())
606
- split_gathered_tensors.append(split_gathered_tensor)
607
-
608
- gathered_tensors = [y for x in zip(*split_gathered_tensors) for y in x]
609
-
610
- return torch.cat(gathered_tensors, dim).contiguous()
611
-
612
-
613
- def get_sequence_data_parallel_world_size():
614
- return torch.distributed.get_world_size()
615
-
616
-
617
- def get_sequence_data_parallel_rank():
618
- return torch.distributed.get_rank()
619
-
620
-
621
- def get_sequence_data_parallel_group():
622
- return torch.distributed.group.WORLD
623
-
624
-
625
- if is_deepspeed_available():
626
- deepspeed_groups._get_sequence_data_parallel_world_size = get_sequence_data_parallel_world_size
627
- deepspeed_groups._get_sequence_data_parallel_rank = get_sequence_data_parallel_rank
628
- deepspeed_groups._get_sequence_data_parallel_group = get_sequence_data_parallel_group
629
-
630
-
631
- def _gather_tokens(input_, dim=0, group=None):
632
- """Gather tensors and concatenate them along a dimension"""
633
- input_ = input_.contiguous()
634
- world_size = torch.distributed.get_world_size(group)
635
- if world_size == 1:
636
- return input_
637
-
638
- gather_buffer = torch.empty(world_size * input_.numel(), dtype=input_.dtype, device=input_.device)
639
- torch.distributed.all_gather_into_tensor(gather_buffer, input_, group=group)
640
- if dim == 0:
641
- shape = list(input_.size())
642
- shape[0] = shape[0] * world_size
643
- output = gather_buffer.view(shape)
644
- else:
645
- tensor_list = [
646
- gather_buffer.narrow(0, input_.numel() * i, input_.numel()).view_as(input_) for i in range(world_size)
647
- ]
648
- # Note: torch.cat already creates a contiguous tensor.
649
- output = torch.cat(tensor_list, dim=dim).contiguous()
650
-
651
- return output
652
-
653
-
654
- def _drop_tokens(input_, dim=0, group=None):
655
- """Divide a tensor among the sequence parallel ranks"""
656
- world_size = torch.distributed.get_world_size(group)
657
- if world_size == 1:
658
- return input_
659
- this_rank = torch.distributed.get_rank(group)
660
- assert input_.shape[dim] % world_size == 0, (
661
- f"input dimension {dim} ({input_.shape[dim]}) is not divisible by sequence parallel world size ({world_size})"
662
- )
663
- chunk_size = input_.shape[dim] // world_size
664
-
665
- return torch.narrow(input_, dim, this_rank * chunk_size, chunk_size)
666
-
667
-
668
- class _DropTokens(torch.autograd.Function):
669
- "Divide tokens equally among the sequence parallel ranks"
670
-
671
- @staticmethod
672
- def symbolic(graph, input_, dim, group, grad_scale):
673
- return _drop_tokens(input_, dim, group)
674
-
675
- @staticmethod
676
- def forward(ctx, input_, dim, group, grad_scale):
677
- ctx.dim = dim
678
- ctx.group = group
679
- ctx.grad_scale = grad_scale
680
- return _drop_tokens(input_, dim, group)
681
-
682
- @staticmethod
683
- def backward(ctx, grad_output):
684
- grad_input = _gather_tokens(grad_output, ctx.dim, ctx.group)
685
- if ctx.grad_scale != 1:
686
- grad_input /= ctx.grad_scale
687
- return grad_input, None, None, None
688
-
689
-
690
- class _GatherTokens(torch.autograd.Function):
691
- "Gather tokens among the sequence parallel ranks"
692
-
693
- @staticmethod
694
- def symbolic(graph, input_, dim, group, grad_scale):
695
- return _gather_tokens(input_, dim, group)
696
-
697
- @staticmethod
698
- def forward(ctx, input_, dim, group, grad_scale):
699
- ctx.dim = dim
700
- ctx.group = group
701
- ctx.grad_scale = grad_scale
702
- return _gather_tokens(input_, dim, group)
703
-
704
- @staticmethod
705
- def backward(ctx, grad_output):
706
- grad_input = _drop_tokens(grad_output, ctx.dim, ctx.group)
707
- if ctx.grad_scale != 1:
708
- grad_input *= ctx.grad_scale
709
- return grad_input, None, None, None
710
-
711
-
712
- def drop_tokens(input_, dim=0, group=None, grad_scale=1):
713
- if torch.distributed.get_world_size(group) == 1:
714
- # no sequence parallelism
715
- return input_
716
- return _DropTokens.apply(input_, dim, group, grad_scale)
717
-
718
-
719
- def gather_tokens(input_, dim=0, group=None, grad_scale=1):
720
- if torch.distributed.get_world_size(group) == 1:
721
- # no sequence parallelism
722
- return input_
723
- return _GatherTokens.apply(input_, dim, group, grad_scale)
724
-
725
-
726
- def sequence_chunking_per_rank(sp_size, sp_rank, *args, dim=1):
727
- """
728
- Slice the inputs to create chuncks per the sequence parallel rank. This is used for the context parallel training.
729
-
730
- Args:
731
- sp_size (`int`):
732
- Sequence parallel size.
733
- sp_rank (`int`):
734
- Sequence parallel rank for the current process.
735
- dim (`int`):
736
- The dimension to slice
737
- """
738
- if sp_size == 1:
739
- return args[0] if len(args) == 1 else args
740
-
741
- seq_length = args[0].size(dim)
742
- for arg in args[1:]:
743
- assert arg.size(dim) == seq_length, (
744
- f"arg={arg} ({arg.shape[dim]}) does not have the same size as args[0] ({seq_length}) in dimension {dim}"
745
- )
746
- assert seq_length % sp_size == 0, (
747
- f"dimension {dim} ({args[0].shape[dim]}) is not divisible by sequence parallel world size ({sp_size})"
748
- )
749
-
750
- sub_seq_length = seq_length // sp_size
751
- sub_seq_start = sp_rank * sub_seq_length
752
-
753
- output = []
754
- for ind in args:
755
- ind = torch.narrow(ind, dim, sub_seq_start, sub_seq_length)
756
- output.append(ind)
757
-
758
- return tuple(output) if len(output) > 1 else output[0]
759
-
760
-
761
- @contextmanager
762
- def disable_deepspeed_ulysses():
763
- """Disable deepspeed ulysses (sequence parallelism) if it is enabled"""
764
- if is_deepspeed_ulysses_enabled():
765
- _old_get_sequence_parallel_world_size = deepspeed_groups._get_sequence_parallel_world_size
766
-
767
- def _get_sequence_parallel_world_size():
768
- return 1
769
-
770
- deepspeed_groups._get_sequence_parallel_world_size = _get_sequence_parallel_world_size
771
- try:
772
- yield
773
- finally:
774
- deepspeed_groups._get_sequence_parallel_world_size = _old_get_sequence_parallel_world_size
775
- else:
776
- context = contextlib.nullcontext
777
- with context():
778
- yield
 
higgs_audio/serve/serve_engine.py DELETED
@@ -1,474 +0,0 @@
1
- import asyncio
2
- import base64
3
- import torch
4
- import numpy as np
5
- from io import BytesIO
6
- from dataclasses import dataclass, field
7
- from typing import List, Optional, Union
8
- from copy import deepcopy
9
- from transformers import AutoTokenizer, AutoProcessor
10
- from transformers.cache_utils import StaticCache
11
- from transformers.generation.streamers import BaseStreamer
12
- from transformers.generation.stopping_criteria import StoppingCriteria
13
- from dataclasses import asdict
14
- from loguru import logger
15
- import threading
16
- import librosa
17
-
18
-
19
- from ..dataset.chatml_dataset import (
20
- ChatMLSample,
21
- ChatMLDatasetSample,
22
- prepare_chatml_sample,
23
- )
24
- from ..model import HiggsAudioModel
25
- from ..model.utils import revert_delay_pattern
26
- from ..data_collator.higgs_audio_collator import HiggsAudioSampleCollator
27
- from ..audio_processing.higgs_audio_tokenizer import load_higgs_audio_tokenizer
28
-
29
-
30
- def normalize_chinese_punctuation(text):
31
- """
32
- Convert Chinese (full-width) punctuation marks to English (half-width) equivalents.
33
- """
34
- # Mapping of Chinese punctuation to English punctuation
35
- chinese_to_english_punct = {
36
- ",": ",", # comma
37
- "。": ".", # period
38
- ":": ":", # colon
39
- ";": ";", # semicolon
40
- "?": "?", # question mark
41
- "!": "!", # exclamation mark
42
- "(": "(", # left parenthesis
43
- ")": ")", # right parenthesis
44
- "【": "[", # left square bracket
45
- "】": "]", # right square bracket
46
- "《": "<", # left angle quote
47
- "》": ">", # right angle quote
48
- "“": '"', # left double quotation
49
- "”": '"', # right double quotation
50
- "‘": "'", # left single quotation
51
- "’": "'", # right single quotation
52
- "、": ",", # enumeration comma
53
- "—": "-", # em dash
54
- "…": "...", # ellipsis
55
- "·": ".", # middle dot
56
- "「": '"', # left corner bracket
57
- "」": '"', # right corner bracket
58
- "『": '"', # left double corner bracket
59
- "』": '"', # right double corner bracket
60
- }
61
-
62
- # Replace each Chinese punctuation with its English counterpart
63
- for zh_punct, en_punct in chinese_to_english_punct.items():
64
- text = text.replace(zh_punct, en_punct)
65
-
66
- return text
67
-
68
-
69
- @dataclass
70
- class HiggsAudioStreamerDelta:
71
- """Represents a chunk of generated content, either text or audio tokens."""
72
-
73
- text: Optional[str] = None
74
- text_tokens: Optional[torch.Tensor] = None
75
- audio_tokens: Optional[torch.Tensor] = None
76
- finish_reason: Optional[str] = None
77
-
78
-
79
- class AsyncHiggsAudioStreamer(BaseStreamer):
80
- """
81
- Async streamer that handles both text and audio token generation from Higgs-Audio model.
82
- Stores chunks in a queue to be consumed by downstream applications.
83
-
84
- Parameters:
85
- tokenizer (`AutoTokenizer`):
86
- The tokenizer used to decode text tokens.
87
- skip_prompt (`bool`, *optional*, defaults to `False`):
88
- Whether to skip the prompt tokens in generation.
89
- timeout (`float`, *optional*):
90
- The timeout for the queue. If `None`, the queue will block indefinitely.
91
- decode_kwargs (`dict`, *optional*):
92
- Additional keyword arguments to pass to the tokenizer's `decode` method.
93
-
94
- Examples:
95
- ```python
96
- >>> from transformers import AutoTokenizer
97
- >>> from threading import Thread
98
- >>> import asyncio
99
-
100
- >>> tokenizer = AutoTokenizer.from_pretrained("path/to/higgs/tokenizer")
101
- >>> model = HiggsAudioModel.from_pretrained("path/to/higgs/model")
102
- >>> inputs = tokenizer(["Generate some text and audio:"], return_tensors="pt")
103
-
104
- >>> async def main():
105
- ... streamer = AsyncHiggsAudioStreamer(tokenizer)
106
- ... generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20)
107
- ... thread = Thread(target=model.generate, kwargs=generation_kwargs)
108
- ... thread.start()
109
- ...
110
- ... async for delta in streamer:
111
- ... if delta.text is not None:
112
- ... print("Text:", delta.text)
113
- ... if delta.audio_tokens is not None:
114
- ... print("Audio tokens shape:", delta.audio_tokens.shape)
115
- >>> asyncio.run(main())
116
- ```
117
- """
118
-
119
- def __init__(
120
- self,
121
- tokenizer: "AutoTokenizer",
122
- skip_prompt: bool = False,
123
- timeout: Optional[float] = None,
124
- audio_num_codebooks: int = 1,
125
- **decode_kwargs,
126
- ):
127
- self.tokenizer = tokenizer
128
- self.skip_prompt = skip_prompt
129
- self.timeout = timeout
130
- self.decode_kwargs = decode_kwargs
131
- self.audio_num_codebooks = audio_num_codebooks
132
-
133
- # Queue to store generated chunks
134
- self.queue = asyncio.Queue()
135
- self.stop_signal = None
136
-
137
- # Get running event loop
138
- self.loop = asyncio.get_running_loop()
139
- self.has_asyncio_timeout = hasattr(asyncio, "timeout")
140
-
141
- # State tracking
142
- self.next_tokens_are_prompt = True
143
-
144
- def put(self, value: torch.Tensor):
145
- """
146
- Receives tokens and processes them as either text or audio tokens.
147
- For text tokens, decodes and caches them until complete words are formed.
148
- For audio tokens, directly queues them.
149
- """
150
- if value.shape[0] > 1 and not self.next_tokens_are_prompt:
151
- # This is likely audio tokens (shape: [audio_num_codebooks])
152
- assert value.shape[0] == self.audio_num_codebooks, "Number of codebooks mismatch"
153
- delta = HiggsAudioStreamerDelta(audio_tokens=value)
154
- self.loop.call_soon_threadsafe(self.queue.put_nowait, delta)
155
- return
156
-
157
- # Skip prompt tokens if configured
158
- if self.skip_prompt and self.next_tokens_are_prompt:
159
- self.next_tokens_are_prompt = False
160
- return
161
-
162
- # Process as text tokens
163
- if len(value.shape) > 1:
164
- value = value[0]
165
-
166
- text = self.tokenizer.decode(value, **self.decode_kwargs)
167
- delta = HiggsAudioStreamerDelta(text=text, text_tokens=value)
168
- self.loop.call_soon_threadsafe(self.queue.put_nowait, delta)
169
-
170
- def end(self):
171
- """Flushes any remaining text tokens and signals the end of generation."""
172
- self.next_tokens_are_prompt = True
173
- self.loop.call_soon_threadsafe(self.queue.put_nowait, self.stop_signal)
174
-
175
- def __aiter__(self):
176
- return self
177
-
178
- async def __anext__(self):
179
- try:
180
- if self.has_asyncio_timeout:
181
- async with asyncio.timeout(self.timeout):
182
- value = await self.queue.get()
183
- else:
184
- value = await asyncio.wait_for(self.queue.get(), timeout=self.timeout)
185
- except asyncio.TimeoutError:
186
- raise TimeoutError()
187
- else:
188
- if value == self.stop_signal:
189
- raise StopAsyncIteration()
190
- else:
191
- return value
192
-
193
-
194
- class AsyncStoppingCriteria(StoppingCriteria):
195
- """
196
- Stopping criteria that checks for stop signal from a threading event.
197
-
198
- Args:
199
- stop_signal (threading.Event): Event that will receive stop signals
200
- """
201
-
202
- def __init__(self, stop_signal: threading.Event):
203
- self.stop_signal = stop_signal
204
-
205
- def __call__(self, input_ids, scores, **kwargs) -> bool:
206
- if self.stop_signal.is_set():
207
- logger.info(f"Stop signal received. Can be caused by client disconnection.")
208
- return True
209
- return False
210
-
211
-
212
- @dataclass
213
- class HiggsAudioResponse:
214
- audio: Optional[np.ndarray] = None
215
- generated_audio_tokens: Optional[np.ndarray] = None
216
- sampling_rate: Optional[int] = None
217
- generated_text: str = ""
218
- generated_text_tokens: np.ndarray = field(default_factory=np.ndarray)
219
- usage: Optional[dict] = None
220
-
221
-
222
- class HiggsAudioServeEngine:
223
- def __init__(
224
- self,
225
- model_name_or_path: str,
226
- audio_tokenizer_name_or_path: str,
227
- tokenizer_name_or_path: Optional[str] = None,
228
- device: str = "cuda",
229
- torch_dtype: Union[torch.dtype, str] = "auto",
230
- kv_cache_lengths: List[int] = [1024, 4096, 8192], # Multiple KV cache sizes
231
- ):
232
- """
233
- Initialize the HiggsAudioServeEngine, a serving wrapper for the HiggsAudioModel.
234
- The model, tokenizer, and audio tokenizer will be downloaded from the Hugging Face Hub if they are not local.
235
-
236
- Args:
237
- model_name_or_path (str):
238
- The name or path of the model to load.
239
- audio_tokenizer_name_or_path (str):
240
- The name or path of the audio tokenizer to load.
241
- tokenizer_name_or_path (str):
242
- The name or path of the tokenizer to load.
243
- device (str):
244
- The device to use for the model.
245
- kv_cache_lengths (List[int]):
246
- The lengths of the KV caches to use for the model. Used for cuda graph capture when device is cuda.
247
- torch_dtype (Union[torch.dtype, str]):
248
- The dtype to use for the model.
249
- """
250
- self.device = device
251
- self.model_name_or_path = model_name_or_path
252
- self.torch_dtype = torch_dtype
253
-
254
- # Initialize model and tokenizer
255
- self.model = HiggsAudioModel.from_pretrained(model_name_or_path, torch_dtype=torch_dtype).to(device)
256
- logger.info(f"Loaded model from {model_name_or_path}, dtype: {self.model.dtype}")
257
-
258
- if tokenizer_name_or_path is None:
259
- tokenizer_name_or_path = model_name_or_path
260
- logger.info(f"Loading tokenizer from {tokenizer_name_or_path}")
261
- self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
262
-
263
- logger.info(f"Initializing Higgs Audio Tokenizer")
264
- self.audio_tokenizer = load_higgs_audio_tokenizer(audio_tokenizer_name_or_path, device=device)
265
-
266
- self.audio_num_codebooks = self.model.config.audio_num_codebooks
267
- self.audio_codebook_size = self.model.config.audio_codebook_size
268
- self.audio_tokenizer_tps = self.audio_tokenizer.tps
269
- self.samples_per_token = int(self.audio_tokenizer.sampling_rate // self.audio_tokenizer_tps)
270
- self.hamming_window_len = 2 * self.audio_num_codebooks * self.samples_per_token
271
- # Set the audio special tokens
272
- self.model.set_audio_special_tokens(self.tokenizer)
273
-
274
- # Prepare KV caches for different lengths
275
- cache_config = deepcopy(self.model.config.text_config)
276
- cache_config.num_hidden_layers = self.model.config.text_config.num_hidden_layers
277
- if self.model.config.audio_dual_ffn_layers:
278
- cache_config.num_hidden_layers += len(self.model.config.audio_dual_ffn_layers)
279
- # A list of KV caches for different lengths
280
- self.kv_caches = {
281
- length: StaticCache(
282
- config=cache_config,
283
- max_batch_size=1,
284
- max_cache_len=length,
285
- device=self.model.device,
286
- dtype=self.model.dtype,
287
- )
288
- for length in sorted(kv_cache_lengths)
289
- }
290
-
291
- if self.model.config.encode_whisper_embed:
292
- logger.info(f"Loading whisper processor")
293
- whisper_processor = AutoProcessor.from_pretrained(
294
- "openai/whisper-large-v3-turbo",
295
- trust_remote=True,
296
- device=self.device,
297
- )
298
- else:
299
- whisper_processor = None
300
-
301
- # Reuse collator to prepare inference samples
302
- self.collator = HiggsAudioSampleCollator(
303
- whisper_processor=whisper_processor,
304
- encode_whisper_embed=self.model.config.encode_whisper_embed,
305
- audio_in_token_id=self.model.config.audio_in_token_idx,
306
- audio_out_token_id=self.model.config.audio_out_token_idx,
307
- audio_stream_bos_id=self.model.config.audio_stream_bos_id,
308
- audio_stream_eos_id=self.model.config.audio_stream_eos_id,
309
- pad_token_id=self.model.config.pad_token_id,
310
- return_audio_in_tokens=False,
311
- use_delay_pattern=self.model.config.use_delay_pattern,
312
- audio_num_codebooks=self.model.config.audio_num_codebooks,
313
- round_to=1,
314
- )
315
-
316
- # Lock to prevent multiple generations from happening at the same time
317
- self.generate_lock = threading.Lock()
318
-
319
- # Capture CUDA graphs for each KV cache length
320
- # if device == "cuda":
321
- # logger.info(f"Capturing CUDA graphs for each KV cache length")
322
- # self.model.capture_model(self.kv_caches.values())
323
-
324
- def _prepare_inputs(self, chat_ml_sample: ChatMLSample, force_audio_gen: bool = False):
325
- input_tokens, _, audio_contents, _ = prepare_chatml_sample(
326
- chat_ml_sample,
327
- self.tokenizer,
328
- )
329
-
330
- postfix = "<|start_header_id|>assistant<|end_header_id|>\n\n"
331
- if force_audio_gen:
332
- postfix += "<|audio_out_bos|>"
333
- postfix = self.tokenizer.encode(postfix, add_special_tokens=False)
334
- input_tokens.extend(postfix)
335
-
336
- # Configure the audio inputs
337
- audio_ids_l = []
338
- for audio_content in audio_contents:
339
- if audio_content.audio_url not in ["placeholder", ""]:
340
- raw_audio, _ = librosa.load(audio_content.audio_url, sr=self.audio_tokenizer.sampling_rate)
341
- elif audio_content.raw_audio is not None:
342
- raw_audio, _ = librosa.load(
343
- BytesIO(base64.b64decode(audio_content.raw_audio)),
344
- sr=self.audio_tokenizer.sampling_rate,
345
- )
346
- else:
347
- raw_audio = None
348
-
349
- if raw_audio is not None:
350
- audio_ids = self.audio_tokenizer.encode(raw_audio, self.audio_tokenizer.sampling_rate)
351
- audio_ids_l.append(audio_ids.squeeze(0).cpu())
352
-
353
- if len(audio_ids_l) > 0:
354
- audio_ids_start = torch.tensor(
355
- np.cumsum(np.array([0] + [audio_ids.shape[1] for audio_ids in audio_ids_l])),
356
- dtype=torch.long,
357
- device=self.device,
358
- )[0:-1]
359
- audio_ids_concat = torch.cat(audio_ids_l, dim=1)
360
- else:
361
- audio_ids_start = None
362
- audio_ids_concat = None
363
-
364
- sample = ChatMLDatasetSample(
365
- input_ids=torch.LongTensor(input_tokens),
366
- label_ids=None,
367
- audio_ids_concat=audio_ids_concat,
368
- audio_ids_start=audio_ids_start,
369
- audio_waveforms_concat=None,
370
- audio_waveforms_start=None,
371
- audio_sample_rate=None,
372
- audio_speaker_indices=None,
373
- )
374
- data = self.collator([sample])
375
- inputs = asdict(data)
376
- for k, v in inputs.items():
377
- if isinstance(v, torch.Tensor):
378
- inputs[k] = v.to(self.model.device)
379
-
380
- return inputs
381
-
382
- def _prepare_kv_caches(self):
383
- for kv_cache in self.kv_caches.values():
384
- kv_cache.reset()
385
-
386
- def generate(
387
- self,
388
- chat_ml_sample: ChatMLSample,
389
- max_new_tokens: int,
390
- temperature: float = 0.7,
391
- top_k: Optional[int] = None,
392
- top_p: float = 0.95,
393
- stop_strings: Optional[List[str]] = None,
394
- force_audio_gen: bool = False,
395
- ras_win_len: Optional[int] = None,
396
- ras_win_max_num_repeat: int = 2,
397
- ):
398
- """
399
- Generate audio from a chatml sample.
400
- Args:
401
- chat_ml_sample: A chatml sample.
402
- max_new_tokens: The maximum number of new tokens to generate.
403
- temperature: The temperature to use for the generation.
404
- top_p: The top p to use for the generation.
405
- Returns:
406
- A dictionary with the following keys:
407
- audio: The generated audio.
408
- sampling_rate: The sampling rate of the generated audio.
409
- """
410
- # Default stop strings
411
- if stop_strings is None:
412
- stop_strings = ["<|end_of_text|>", "<|eot_id|>"]
413
-
414
- with torch.no_grad(), self.generate_lock:
415
- inputs = self._prepare_inputs(chat_ml_sample, force_audio_gen=force_audio_gen)
416
- prompt_token_ids = inputs["input_ids"][0].cpu().numpy()
417
-
418
- self._prepare_kv_caches()
419
-
420
- outputs = self.model.generate(
421
- **inputs,
422
- max_new_tokens=max_new_tokens,
423
- use_cache=True,
424
- stop_strings=stop_strings,
425
- tokenizer=self.tokenizer,
426
- do_sample=False if temperature == 0.0 else True,
427
- temperature=temperature,
428
- top_k=top_k,
429
- top_p=top_p,
430
- past_key_values_buckets=self.kv_caches,
431
- ras_win_len=ras_win_len,
432
- ras_win_max_num_repeat=ras_win_max_num_repeat,
433
- )
434
-
435
- if len(outputs[1]) > 0:
436
- wv_list = []
437
- for output_audio in outputs[1]:
438
- vq_code = revert_delay_pattern(output_audio).clip(0, self.audio_codebook_size - 1)[:, 1:-1]
439
- wv_numpy = self.audio_tokenizer.decode(vq_code.unsqueeze(0))[0, 0]
440
- wv_list.append(wv_numpy)
441
- wv_numpy = np.concatenate(wv_list)
442
- else:
443
- wv_numpy = None
444
-
445
- # We only support one request at a time now
446
- generated_text_tokens = outputs[0][0].cpu().numpy()[len(prompt_token_ids) :]
447
- generated_text = self.tokenizer.decode(generated_text_tokens)
448
- generated_audio_tokens = outputs[1][0].cpu().numpy()
449
- return HiggsAudioResponse(
450
- audio=wv_numpy,
451
- generated_audio_tokens=generated_audio_tokens,
452
- sampling_rate=self.audio_tokenizer.sampling_rate,
453
- generated_text=generated_text,
454
- generated_text_tokens=generated_text_tokens,
455
- usage={
456
- "prompt_tokens": prompt_token_ids.shape[0],
457
- "completion_tokens": generated_text_tokens.shape[0] + generated_audio_tokens.shape[1],
458
- "total_tokens": (
459
- prompt_token_ids.shape[0] + generated_text_tokens.shape[0] + generated_audio_tokens.shape[1]
460
- ),
461
- "cached_tokens": 0,
462
- },
463
- )
464
-
465
- def text_normalize(self, text: str) -> str:
466
- """
467
- Normalize the text.
468
- """
469
- # Perform some basic normalization
470
- text = normalize_chinese_punctuation(text)
471
- # Handle parentheses
472
- text = text.replace("(", " ")
473
- text = text.replace(")", " ")
474
- return text
 
higgs_audio/serve/utils.py DELETED
@@ -1,254 +0,0 @@
1
- import uuid
2
- import base64
3
- import re
4
- import regex
5
- from typing import AsyncGenerator, Union
6
- import io
7
- from pydub import AudioSegment
8
- import torch
9
- import numpy as np
10
- from functools import lru_cache
11
-
12
- from ..audio_processing.higgs_audio_tokenizer import HiggsAudioTokenizer
13
-
14
-
15
- def random_uuid() -> str:
16
- return str(uuid.uuid4().hex)
17
-
18
-
19
- async def async_generator_wrap(first_element, gen: AsyncGenerator):
20
- """Wrap an async generator with the first element."""
21
- yield first_element
22
- async for item in gen:
23
- yield item
24
-
25
-
26
- @lru_cache(maxsize=50)
27
- def encode_base64_content_from_file(file_path: str) -> str:
28
- """Encode a content from a local file to base64 format."""
29
- # Read the MP3 file as binary and encode it directly to Base64
30
- with open(file_path, "rb") as audio_file:
31
- audio_base64 = base64.b64encode(audio_file.read()).decode("utf-8")
32
- return audio_base64
33
-
34
-
35
- def pcm16_to_target_format(
36
- np_audio: np.ndarray,
37
- sample_rate: int,
38
- bit_depth: int,
39
- channels: int,
40
- format: str,
41
- target_rate: int,
42
- ):
43
- wav_audio = AudioSegment(
44
- np_audio.tobytes(),
45
- frame_rate=sample_rate,
46
- sample_width=bit_depth // 8,
47
- channels=channels,
48
- )
49
- if target_rate is not None and target_rate != sample_rate:
50
- wav_audio = wav_audio.set_frame_rate(target_rate)
51
-
52
- # Convert WAV to MP3
53
- target_io = io.BytesIO()
54
- wav_audio.export(target_io, format=format)
55
- target_io.seek(0)
56
-
57
- return target_io
58
-
59
-
60
- chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]+")
61
-
62
-
63
- def contains_chinese(text: str):
64
- return bool(chinese_char_pattern.search(text))
65
-
66
-
67
- # remove blank between chinese character
68
- def replace_blank(text: str):
69
- out_str = []
70
- for i, c in enumerate(text):
71
- if c == " ":
72
- if (text[i + 1].isascii() and text[i + 1] != " ") and (text[i - 1].isascii() and text[i - 1] != " "):
73
- out_str.append(c)
74
- else:
75
- out_str.append(c)
76
- return "".join(out_str)
77
-
78
-
79
- def replace_corner_mark(text: str):
80
- text = text.replace("²", "平方")
81
- text = text.replace("³", "立方")
82
- return text
83
-
84
-
85
- # remove meaningless symbol
86
- def remove_bracket(text: str):
87
- text = text.replace("(", "").replace(")", "")
88
- text = text.replace("【", "").replace("】", "")
89
- text = text.replace("`", "").replace("`", "")
90
- text = text.replace("——", " ")
91
- return text
92
-
93
-
94
- # split paragrah logic:
95
- # 1. per sentence max len token_max_n, min len token_min_n, merge if last sentence len less than merge_len
96
- # 2. cal sentence len according to lang
97
- # 3. split sentence according to puncatation
98
- def split_paragraph(
99
- text: str,
100
- tokenize,
101
- lang="zh",
102
- token_max_n=80,
103
- token_min_n=60,
104
- merge_len=20,
105
- comma_split=False,
106
- ):
107
- def calc_utt_length(_text: str):
108
- if lang == "zh":
109
- return len(_text)
110
- else:
111
- return len(tokenize(_text))
112
-
113
- def should_merge(_text: str):
114
- if lang == "zh":
115
- return len(_text) < merge_len
116
- else:
117
- return len(tokenize(_text)) < merge_len
118
-
119
- if lang == "zh":
120
- pounc = ["。", "?", "!", ";", ":", "、", ".", "?", "!", ";"]
121
- else:
122
- pounc = [".", "?", "!", ";", ":"]
123
- if comma_split:
124
- pounc.extend([",", ","])
125
-
126
- if text[-1] not in pounc:
127
- if lang == "zh":
128
- text += "。"
129
- else:
130
- text += "."
131
-
132
- st = 0
133
- utts = []
134
- for i, c in enumerate(text):
135
- if c in pounc:
136
- if len(text[st:i]) > 0:
137
- utts.append(text[st:i] + c)
138
- if i + 1 < len(text) and text[i + 1] in ['"', "”"]:
139
- tmp = utts.pop(-1)
140
- utts.append(tmp + text[i + 1])
141
- st = i + 2
142
- else:
143
- st = i + 1
144
-
145
- final_utts = []
146
- cur_utt = ""
147
- for utt in utts:
148
- if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n:
149
- final_utts.append(cur_utt)
150
- cur_utt = ""
151
- cur_utt = cur_utt + utt
152
- if len(cur_utt) > 0:
153
- if should_merge(cur_utt) and len(final_utts) != 0:
154
- final_utts[-1] = final_utts[-1] + cur_utt
155
- else:
156
- final_utts.append(cur_utt)
157
-
158
- return final_utts
159
-
160
-
161
- def is_only_punctuation(text: str):
162
- # Regular expression: Match strings that consist only of punctuation marks or are empty.
163
- punctuation_pattern = r"^[\p{P}\p{S}]*$"
164
- return bool(regex.fullmatch(punctuation_pattern, text))
165
-
166
-
167
- # spell Arabic numerals
168
- def spell_out_number(text: str, inflect_parser):
169
- new_text = []
170
- st = None
171
- for i, c in enumerate(text):
172
- if not c.isdigit():
173
- if st is not None:
174
- num_str = inflect_parser.number_to_words(text[st:i])
175
- new_text.append(num_str)
176
- st = None
177
- new_text.append(c)
178
- else:
179
- if st is None:
180
- st = i
181
- if st is not None and st < len(text):
182
- num_str = inflect_parser.number_to_words(text[st:])
183
- new_text.append(num_str)
184
- return "".join(new_text)
185
-
186
-
187
- def remove_emoji(text: str):
188
- # Pattern to match emojis and their modifiers
189
- # - Standard emoji range
190
- # - Zero-width joiners (U+200D)
191
- # - Variation selectors (U+FE0F, U+FE0E)
192
- # - Skin tone modifiers (U+1F3FB to U+1F3FF)
193
- emoji_pattern = re.compile(
194
- r"["
195
- r"\U00010000-\U0010FFFF" # Standard emoji range
196
- r"\u200D" # Zero-width joiner
197
- r"\uFE0F\uFE0E" # Variation selectors
198
- r"\U0001F3FB-\U0001F3FF" # Skin tone modifiers
199
- r"]+",
200
- flags=re.UNICODE,
201
- )
202
- return emoji_pattern.sub(r"", text)
203
-
204
-
205
- def remove_repeated_punctuations(text, punctuations):
206
- if len(punctuations) == 0:
207
- return text
208
- pattern = f"[{re.escape(''.join(punctuations))}]" # Create regex pattern for given punctuations
209
- return re.sub(rf"({pattern})\1+", r"\1", text)
210
-
211
-
212
- def full_to_half_width(text: str) -> str:
213
- """Convert full-width punctuation to half-width in a given string."""
214
- full_width = "!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
215
- half_width = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
216
- trans_table = str.maketrans(full_width, half_width)
217
- return text.translate(trans_table)
218
-
219
-
220
- def split_interleaved_delayed_audios(
221
- audio_data: Union[list[list[int]], torch.Tensor],
222
- audio_tokenizer: HiggsAudioTokenizer,
223
- audio_stream_eos_id: int,
224
- ) -> list[tuple[list[list[int]], torch.Tensor]]:
225
- separator = [audio_stream_eos_id] * audio_tokenizer.num_codebooks
226
-
227
- # Convert separator to numpy array if audio_data is numpy array
228
- if isinstance(audio_data, torch.Tensor):
229
- audio_data = audio_data.transpose(1, 0)
230
- separator = torch.tensor(separator)
231
- # Find the indices where the rows equal the separator
232
- split_indices = torch.where(torch.all(audio_data == separator, dim=1))[0]
233
- start = 0
234
- groups = []
235
- for idx in split_indices:
236
- groups.append(audio_data[start:idx].transpose(1, 0))
237
- start = idx + 1
238
- if start < len(audio_data):
239
- groups.append(audio_data[start:].transpose(1, 0))
240
- else:
241
- groups = []
242
- current = []
243
- for row in audio_data:
244
- current.append(row)
245
-
246
- if row == separator:
247
- groups.append(current)
248
- current = []
249
-
250
- # Don't forget the last group if there's no trailing separator
251
- if current:
252
- groups.append(current)
253
-
254
- return groups
 
higgs_audio_utils.py DELETED
@@ -1,280 +0,0 @@
1
- from typing import Optional
2
-
3
- # Import HiggsAudio components
4
- from higgs_audio.serve.serve_engine import HiggsAudioServeEngine
5
- from higgs_audio.data_types import ChatMLSample, AudioContent, Message
6
-
7
- import base64
8
- from functools import lru_cache
9
- from loguru import logger
10
- import os
11
- import json
12
- import uuid
13
- import time
14
- import numpy as np
15
- import re
16
-
17
- def process_text_output(text_output: str):
18
- # remove all the continuous <|AUDIO_OUT|> tokens with a single <|AUDIO_OUT|>
19
- text_output = re.sub(r"(<\|AUDIO_OUT\|>)+", r"<|AUDIO_OUT|>", text_output)
20
- return text_output
21
-
22
-
23
- def check_return_audio(audio_wv: np.ndarray):
24
- # check if the audio returned is all silent
25
- if np.all(audio_wv == 0):
26
- logger.warning("Audio is silent, returning None")
27
-
28
-
29
- def load_voice_presets():
30
- """Load the voice presets from the voice_examples directory."""
31
- try:
32
- with open(
33
- os.path.join(os.path.dirname(__file__), "examples", "audios", "config.json"),
34
- "r",
35
- ) as f:
36
- voice_dict = json.load(f)
37
- voice_presets = {k: v for k, v in voice_dict.items()}
38
- voice_presets["EMPTY"] = "No reference voice"
39
- logger.info(f"Loaded voice presets: {list(voice_presets.keys())}")
40
- return voice_presets
41
- except FileNotFoundError:
42
- logger.warning("Voice examples config file not found. Using empty voice presets.")
43
- return {"EMPTY": "No reference voice"}
44
- except Exception as e:
45
- logger.error(f"Error loading voice presets: {e}")
46
- return {"EMPTY": "No reference voice"}
47
-
48
-
49
- SAMPLE_RATE = 24000
50
- DEFAULT_STOP_STRINGS = ["<|end_of_text|>", "<|eot_id|>"]
51
- VOICE_PRESETS = load_voice_presets()
52
-
53
-
54
- def initialize_engine(model_path, audio_tokenizer_path) -> bool:
55
- engine = HiggsAudioServeEngine(
56
- model_name_or_path=model_path,
57
- audio_tokenizer_name_or_path=audio_tokenizer_path,
58
- device="cuda",
59
- )
60
- return engine
61
-
62
- def get_voice_preset(voice_preset):
63
- """Get the voice path and text for a given voice preset."""
64
-
65
- preset_dir = os.path.join(os.path.dirname(__file__), "examples", "audios")
66
- voice_path = os.path.join(preset_dir, VOICE_PRESETS[voice_preset]["audio_file"])
67
-
68
- if not os.path.exists(voice_path):
69
- logger.warning(f"Voice preset file not found: {voice_path}")
70
- return None, "Voice preset not found"
71
-
72
- text = VOICE_PRESETS[voice_preset]["transcript"]
73
- return voice_path, text
74
-
75
-
76
- def normalize_chinese_punctuation(text):
77
- """
78
- Convert Chinese (full-width) punctuation marks to English (half-width) equivalents.
79
- """
80
- # Mapping of Chinese punctuation to English punctuation
81
- chinese_to_english_punct = {
82
- ",": ", ", # comma
83
- "。": ".", # period
84
- ":": ":", # colon
85
- ";": ";", # semicolon
86
- "?": "?", # question mark
87
- "!": "!", # exclamation mark
88
- "(": "(", # left parenthesis
89
- ")": ")", # right parenthesis
90
- "【": "[", # left square bracket
91
- "】": "]", # right square bracket
92
- "《": "<", # left angle quote
93
- "》": ">", # right angle quote
94
- "“": '"', # left double quotation
95
- "”": '"', # right double quotation
96
- "‘": "'", # left single quotation
97
- "’": "'", # right single quotation
98
- "、": ",", # enumeration comma
99
- "—": "-", # em dash
100
- "…": "...", # ellipsis
101
- "·": ".", # middle dot
102
- "「": '"', # left corner bracket
103
- "」": '"', # right corner bracket
104
- "『": '"', # left double corner bracket
105
- "』": '"', # right double corner bracket
106
- }
107
-
108
- # Replace each Chinese punctuation with its English counterpart
109
- for zh_punct, en_punct in chinese_to_english_punct.items():
110
- text = text.replace(zh_punct, en_punct)
111
-
112
- return text
113
-
114
-
115
- def normalize_text(transcript: str):
116
- transcript = normalize_chinese_punctuation(transcript)
117
- # Other normalizations (e.g., parentheses and other symbols. Will be improved in the future)
118
- transcript = transcript.replace("(", " ")
119
- transcript = transcript.replace(")", " ")
120
- transcript = transcript.replace("°F", " degrees Fahrenheit")
121
- transcript = transcript.replace("°C", " degrees Celsius")
122
-
123
- for tag, replacement in [
124
- ("[laugh]", "<SE>[Laughter]</SE>"),
125
- ("[humming start]", "<SE>[Humming]</SE>"),
126
- ("[humming end]", "<SE_e>[Humming]</SE_e>"),
127
- ("[music start]", "<SE_s>[Music]</SE_s>"),
128
- ("[music end]", "<SE_e>[Music]</SE_e>"),
129
- ("[music]", "<SE>[Music]</SE>"),
130
- ("[sing start]", "<SE_s>[Singing]</SE_s>"),
131
- ("[sing end]", "<SE_e>[Singing]</SE_e>"),
132
- ("[applause]", "<SE>[Applause]</SE>"),
133
- ("[cheering]", "<SE>[Cheering]</SE>"),
134
- ("[cough]", "<SE>[Cough]</SE>"),
135
- ]:
136
- transcript = transcript.replace(tag, replacement)
137
-
138
- lines = transcript.split("\n")
139
- transcript = "\n".join([" ".join(line.split()) for line in lines if line.strip()])
140
- transcript = transcript.strip()
141
-
142
- if not any([transcript.endswith(c) for c in [".", "!", "?", ",", ";", '"', "'", "</SE_e>", "</SE>"]]):
143
- transcript += "."
144
-
145
- return transcript
146
-
147
- @lru_cache(maxsize=20)
148
- def encode_audio_file(file_path):
149
- """Encode an audio file to base64."""
150
- with open(file_path, "rb") as audio_file:
151
- return base64.b64encode(audio_file.read()).decode("utf-8")
152
-
153
-
154
- def prepare_chatml_sample(
155
- voice_preset: str,
156
- text: str,
157
- reference_audio: Optional[str] = None,
158
- reference_text: Optional[str] = None,
159
- system_prompt: str = "",
160
- ):
161
- """Prepare a ChatMLSample for the HiggsAudioServeEngine."""
162
- messages = []
163
-
164
- # Add system message if provided
165
- if len(system_prompt) > 0:
166
- messages.append(Message(role="system", content=system_prompt))
167
-
168
- # Add reference audio if provided
169
- audio_base64 = None
170
- ref_text = ""
171
-
172
- if reference_audio:
173
- # Custom reference audio
174
- audio_base64 = encode_audio_file(reference_audio)
175
- ref_text = reference_text or ""
176
- elif voice_preset != "EMPTY":
177
- # Voice preset
178
- voice_path, ref_text = get_voice_preset(voice_preset)
179
- if voice_path is None:
180
- logger.warning(f"Voice preset {voice_preset} not found, skipping reference audio")
181
- else:
182
- audio_base64 = encode_audio_file(voice_path)
183
-
184
- # Only add reference audio if we have it
185
- if audio_base64 is not None:
186
- # Add user message with reference text
187
- messages.append(Message(role="user", content=ref_text))
188
-
189
- # Add assistant message with audio content
190
- audio_content = AudioContent(raw_audio=audio_base64, audio_url="")
191
- messages.append(Message(role="assistant", content=[audio_content]))
192
-
193
- # Add the main user message
194
- text = normalize_text(text)
195
- messages.append(Message(role="user", content=text))
196
-
197
- return ChatMLSample(messages=messages)
198
-
199
-
200
-
201
- def text_to_speech(
202
- engine,
203
- text,
204
- system_prompt="",
205
- voice_preset="EMPTY",
206
- reference_audio=None,
207
- reference_text=None,
208
- max_completion_tokens=1024,
209
- temperature=1.0,
210
- top_p=0.95,
211
- top_k=50,
212
- stop_strings=None,
213
- ras_win_len=7,
214
- ras_win_max_num_repeat=2,
215
- ):
216
- """
217
- Convert text to speech using HiggsAudioServeEngine.
218
-
219
- Args:
220
- text: The text to convert to speech
221
- voice_preset: The voice preset to use (or "EMPTY" for no preset)
222
- reference_audio: Optional path to reference audio file
223
- reference_text: Optional transcript of the reference audio
224
- max_completion_tokens: Maximum number of tokens to generate
225
- temperature: Sampling temperature for generation
226
- top_p: Top-p sampling parameter
227
- top_k: Top-k sampling parameter
228
- system_prompt: System prompt to guide the model
229
- stop_strings: Dataframe containing stop strings
230
- ras_win_len: Window length for repetition avoidance sampling
231
- ras_win_max_num_repeat: Maximum number of repetitions allowed in the window
232
-
233
- Returns:
234
- Tuple of (generated_text, (sample_rate, audio_data)) where audio_data is int16 numpy array
235
- """
236
-
237
- try:
238
- # Prepare ChatML sample
239
- chatml_sample = prepare_chatml_sample(voice_preset, text, reference_audio, reference_text, system_prompt)
240
-
241
- # Convert stop strings format
242
- if stop_strings is None:
243
- stop_list = DEFAULT_STOP_STRINGS
244
- else:
245
- stop_list = [s for s in stop_strings["stops"] if s.strip()]
246
-
247
- request_id = f"tts-playground-{str(uuid.uuid4())}"
248
-
249
- start_time = time.time()
250
-
251
- # Generate using the engine
252
- response = engine.generate(
253
- chat_ml_sample=chatml_sample,
254
- max_new_tokens=max_completion_tokens,
255
- temperature=temperature,
256
- top_k=top_k if top_k > 0 else None,
257
- top_p=top_p,
258
- stop_strings=stop_list,
259
- ras_win_len=ras_win_len if ras_win_len > 0 else None,
260
- ras_win_max_num_repeat=max(ras_win_len, ras_win_max_num_repeat),
261
- )
262
-
263
- generation_time = time.time() - start_time
264
-
265
- # Process the response
266
- text_output = process_text_output(response.generated_text)
267
-
268
- if response.audio is not None:
269
- # Convert to int16 for Gradio
270
- audio_data = (response.audio * 32767).astype(np.int16)
271
- check_return_audio(audio_data)
272
- return text_output, (response.sampling_rate, audio_data)
273
- else:
274
- logger.warning("No audio generated")
275
- return text_output, None
276
-
277
- except Exception as e:
278
- error_msg = f"Error generating speech: {e}"
279
- logger.error(error_msg)
280
- return f"❌ {error_msg}", None
 
requirements.txt CHANGED
@@ -16,16 +16,5 @@ ninja
  gradio_extendedimage @ https://github.com/OutofAi/gradio-extendedimage/releases/download/0.0.2/gradio_extendedimage-0.0.2-py3-none-any.whl
  gradio_extendedaudio @ https://github.com/OutofAi/gradio-extendedaudio/releases/download/0.0.5/gradio_extendedaudio-0.0.5-py3-none-any.whl
 
- dacite
- boto3==1.35.36
- s3fs
- json_repair
- pandas
- pydantic
- vector_quantize_pytorch
- loguru
- pydub
- ruff==0.12.2
- click
-
- descript-audio-codec
+ flash-attn-3 @ https://huggingface.co/alexnasa/flash-attn-3/resolve/main/128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl
+ onnxruntime
 
supertonic.py ADDED
@@ -0,0 +1,364 @@
+ import json
+ import os
+ import time
+ from contextlib import contextmanager
+ from typing import Optional
+ from unicodedata import normalize
+
+ import numpy as np
+ import onnxruntime as ort
+ import soundfile as sf
+ from huggingface_hub import snapshot_download
+
+
+ class UnicodeProcessor:
+     def __init__(self, unicode_indexer_path: str):
+         with open(unicode_indexer_path, "r") as f:
+             self.indexer = json.load(f)
+
+     def _preprocess_text(self, text: str) -> str:
+         # TODO: add more preprocessing
+         text = normalize("NFKD", text)
+         return text
+
+     def _get_text_mask(self, text_ids_lengths: np.ndarray) -> np.ndarray:
+         text_mask = length_to_mask(text_ids_lengths)
+         return text_mask
+
+     def _text_to_unicode_values(self, text: str) -> np.ndarray:
+         unicode_values = np.array(
+             [ord(char) for char in text], dtype=np.uint16
+         )  # 2 bytes
+         return unicode_values
+
+     def __call__(self, text_list: list[str]) -> tuple[np.ndarray, np.ndarray]:
+         text_list = [self._preprocess_text(t) for t in text_list]
+         text_ids_lengths = np.array([len(text) for text in text_list], dtype=np.int64)
+         text_ids = np.zeros((len(text_list), text_ids_lengths.max()), dtype=np.int64)
+         for i, text in enumerate(text_list):
+             unicode_vals = self._text_to_unicode_values(text)
+             text_ids[i, : len(unicode_vals)] = np.array(
+                 [self.indexer[val] for val in unicode_vals], dtype=np.int64
+             )
+         text_mask = self._get_text_mask(text_ids_lengths)
+         return text_ids, text_mask
+
+
+ class Style:
+     def __init__(self, style_ttl_onnx: np.ndarray, style_dp_onnx: np.ndarray):
+         self.ttl = style_ttl_onnx
+         self.dp = style_dp_onnx
+
+
+ class TextToSpeech:
+     def __init__(
+         self,
+         cfgs: dict,
+         text_processor: UnicodeProcessor,
+         dp_ort: ort.InferenceSession,
+         text_enc_ort: ort.InferenceSession,
+         vector_est_ort: ort.InferenceSession,
+         vocoder_ort: ort.InferenceSession,
+     ):
+         self.cfgs = cfgs
+         self.text_processor = text_processor
+         self.dp_ort = dp_ort
+         self.text_enc_ort = text_enc_ort
+         self.vector_est_ort = vector_est_ort
+         self.vocoder_ort = vocoder_ort
+         self.sample_rate = cfgs["ae"]["sample_rate"]
+         self.base_chunk_size = cfgs["ae"]["base_chunk_size"]
+         self.chunk_compress_factor = cfgs["ttl"]["chunk_compress_factor"]
+         self.ldim = cfgs["ttl"]["latent_dim"]
+
+     def sample_noisy_latent(
+         self, duration: np.ndarray
+     ) -> tuple[np.ndarray, np.ndarray]:
+         bsz = len(duration)
+         wav_len_max = duration.max() * self.sample_rate
+         wav_lengths = (duration * self.sample_rate).astype(np.int64)
+         chunk_size = self.base_chunk_size * self.chunk_compress_factor
+         latent_len = ((wav_len_max + chunk_size - 1) / chunk_size).astype(np.int32)
+         latent_dim = self.ldim * self.chunk_compress_factor
+         noisy_latent = np.random.randn(bsz, latent_dim, latent_len).astype(np.float32)
+         latent_mask = get_latent_mask(
+             wav_lengths, self.base_chunk_size, self.chunk_compress_factor
+         )
+         noisy_latent = noisy_latent * latent_mask
+         return noisy_latent, latent_mask
+
+     def _infer(
+         self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
+     ) -> tuple[np.ndarray, np.ndarray]:
+         assert (
+             len(text_list) == style.ttl.shape[0]
+         ), "Number of texts must match number of style vectors"
+         bsz = len(text_list)
+         text_ids, text_mask = self.text_processor(text_list)
+         dur_onnx, *_ = self.dp_ort.run(
+             None, {"text_ids": text_ids, "style_dp": style.dp, "text_mask": text_mask}
+         )
+         dur_onnx = dur_onnx / speed
+         text_emb_onnx, *_ = self.text_enc_ort.run(
+             None,
+             {"text_ids": text_ids, "style_ttl": style.ttl, "text_mask": text_mask},
+         )  # dur_onnx: [bsz]
+         xt, latent_mask = self.sample_noisy_latent(dur_onnx)
+         total_step_np = np.array([total_step] * bsz, dtype=np.float32)
+         for step in range(total_step):
+             current_step = np.array([step] * bsz, dtype=np.float32)
+             xt, *_ = self.vector_est_ort.run(
+                 None,
+                 {
+                     "noisy_latent": xt,
+                     "text_emb": text_emb_onnx,
+                     "style_ttl": style.ttl,
+                     "text_mask": text_mask,
+                     "latent_mask": latent_mask,
+                     "current_step": current_step,
+                     "total_step": total_step_np,
+                 },
+             )
+         wav, *_ = self.vocoder_ort.run(None, {"latent": xt})
+         return wav, dur_onnx
+
+     def __call__(
+         self,
+         text: str,
+         style: Style,
+         total_step: int,
+         speed: float = 1.05,
+         silence_duration: float = 0.3,
+     ) -> tuple[np.ndarray, np.ndarray]:
+         assert (
+             style.ttl.shape[0] == 1
+         ), "Single speaker text to speech only supports single style"
+         text_list = chunk_text(text)
+         wav_cat = None
+         dur_cat = None
+         for text in text_list:
+             wav, dur_onnx = self._infer([text], style, total_step, speed)
+             if wav_cat is None:
+                 wav_cat = wav
+                 dur_cat = dur_onnx
+             else:
+                 silence = np.zeros(
+                     (1, int(silence_duration * self.sample_rate)), dtype=np.float32
+                 )
+                 wav_cat = np.concatenate([wav_cat, silence, wav], axis=1)
+                 dur_cat += dur_onnx + silence_duration
+         return wav_cat, dur_cat
+
+     def batch(
+         self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
+     ) -> tuple[np.ndarray, np.ndarray]:
+         return self._infer(text_list, style, total_step, speed)
+
+
+ def length_to_mask(lengths: np.ndarray, max_len: Optional[int] = None) -> np.ndarray:
+     """
+     Convert lengths to binary mask.
+
+     Args:
+         lengths: (B,)
+         max_len: int
+
+     Returns:
+         mask: (B, 1, max_len)
+     """
+     max_len = max_len or lengths.max()
+     ids = np.arange(0, max_len)
+     mask = (ids < np.expand_dims(lengths, axis=1)).astype(np.float32)
+     return mask.reshape(-1, 1, max_len)
+
+
+ def get_latent_mask(
+     wav_lengths: np.ndarray, base_chunk_size: int, chunk_compress_factor: int
+ ) -> np.ndarray:
+     latent_size = base_chunk_size * chunk_compress_factor
+     latent_lengths = (wav_lengths + latent_size - 1) // latent_size
+     latent_mask = length_to_mask(latent_lengths)
+     return latent_mask
+
+
+ def load_onnx(
+     onnx_path: str, opts: ort.SessionOptions, providers: list[str]
+ ) -> ort.InferenceSession:
+     return ort.InferenceSession(onnx_path, sess_options=opts, providers=providers)
+
+
+ def load_onnx_all(
+     onnx_dir: str, opts: ort.SessionOptions, providers: list[str]
+ ) -> tuple[
+     ort.InferenceSession,
+     ort.InferenceSession,
+     ort.InferenceSession,
+     ort.InferenceSession,
+ ]:
+     dp_onnx_path = os.path.join(onnx_dir, "duration_predictor.onnx")
+     text_enc_onnx_path = os.path.join(onnx_dir, "text_encoder.onnx")
+     vector_est_onnx_path = os.path.join(onnx_dir, "vector_estimator.onnx")
+     vocoder_onnx_path = os.path.join(onnx_dir, "vocoder.onnx")
+
+     dp_ort = load_onnx(dp_onnx_path, opts, providers)
+     text_enc_ort = load_onnx(text_enc_onnx_path, opts, providers)
+     vector_est_ort = load_onnx(vector_est_onnx_path, opts, providers)
+     vocoder_ort = load_onnx(vocoder_onnx_path, opts, providers)
+     return dp_ort, text_enc_ort, vector_est_ort, vocoder_ort
+
+
+ def load_cfgs(onnx_dir: str) -> dict:
+     cfg_path = os.path.join(onnx_dir, "tts.json")
+     with open(cfg_path, "r") as f:
+         cfgs = json.load(f)
+     return cfgs
+
+
+ def load_text_processor(onnx_dir: str) -> UnicodeProcessor:
+     unicode_indexer_path = os.path.join(onnx_dir, "unicode_indexer.json")
+     text_processor = UnicodeProcessor(unicode_indexer_path)
+     return text_processor
+
+
+ def load_text_to_speech(onnx_dir: str, use_gpu: bool = False) -> TextToSpeech:
+     opts = ort.SessionOptions()
+     if use_gpu:
+         raise NotImplementedError("GPU mode is not fully tested")
+     else:
+         providers = ["CPUExecutionProvider"]
+         print("Using CPU for inference")
+     cfgs = load_cfgs(onnx_dir)
+     dp_ort, text_enc_ort, vector_est_ort, vocoder_ort = load_onnx_all(
+         onnx_dir, opts, providers
+     )
+     text_processor = load_text_processor(onnx_dir)
+     return TextToSpeech(
+         cfgs, text_processor, dp_ort, text_enc_ort, vector_est_ort, vocoder_ort
+     )
+
+
+ def load_voice_style(voice_style_paths: list[str], verbose: bool = False) -> Style:
+     bsz = len(voice_style_paths)
+
+     # Read first file to get dimensions
+     with open(voice_style_paths[0], "r") as f:
+         first_style = json.load(f)
+     ttl_dims = first_style["style_ttl"]["dims"]
+     dp_dims = first_style["style_dp"]["dims"]
+
+     # Pre-allocate arrays with full batch size
+     ttl_style = np.zeros([bsz, ttl_dims[1], ttl_dims[2]], dtype=np.float32)
+     dp_style = np.zeros([bsz, dp_dims[1], dp_dims[2]], dtype=np.float32)
+
+     # Fill in the data
+     for i, voice_style_path in enumerate(voice_style_paths):
+         with open(voice_style_path, "r") as f:
+             voice_style = json.load(f)
+
+         ttl_data = np.array(
+             voice_style["style_ttl"]["data"], dtype=np.float32
+         ).flatten()
+         ttl_style[i] = ttl_data.reshape(ttl_dims[1], ttl_dims[2])
+
+         dp_data = np.array(voice_style["style_dp"]["data"], dtype=np.float32).flatten()
+         dp_style[i] = dp_data.reshape(dp_dims[1], dp_dims[2])
+
+     if verbose:
+         print(f"Loaded {bsz} voice styles")
+     return Style(ttl_style, dp_style)
+
+
+ @contextmanager
+ def timer(name: str):
+     start = time.time()
+     print(f"{name}...")
+     yield
+     print(f" -> {name} completed in {time.time() - start:.2f} sec")
+
+
+ def sanitize_filename(text: str, max_len: int) -> str:
+     """Sanitize filename by replacing non-alphanumeric characters with underscores"""
+     import re
+
+     prefix = text[:max_len]
+     return re.sub(r"[^a-zA-Z0-9]", "_", prefix)
+
+
+ def chunk_text(text: str, max_len: int = 300) -> list[str]:
+     """
+     Split text into chunks by paragraphs and sentences.
+
+     Args:
+         text: Input text to chunk
+         max_len: Maximum length of each chunk (default: 300)
+
+     Returns:
+         List of text chunks
+     """
+     import re
+
+     # Split by paragraph (two or more newlines)
+     paragraphs = [p.strip() for p in re.split(r"\n\s*\n+", text.strip()) if p.strip()]
+
+     chunks = []
+
+     for paragraph in paragraphs:
+         paragraph = paragraph.strip()
+         if not paragraph:
+             continue
+
+         # Split by sentence boundaries (period, question mark, exclamation mark followed by space)
+         # But exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F.
+         pattern = r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)(?<!Ph\.D\.)(?<!etc\.)(?<!e\.g\.)(?<!i\.e\.)(?<!vs\.)(?<!Inc\.)(?<!Ltd\.)(?<!Co\.)(?<!Corp\.)(?<!St\.)(?<!Ave\.)(?<!Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+"
+         sentences = re.split(pattern, paragraph)
+
+         current_chunk = ""
+
+         for sentence in sentences:
+             if len(current_chunk) + len(sentence) + 1 <= max_len:
+                 current_chunk += (" " if current_chunk else "") + sentence
+             else:
+                 if current_chunk:
+                     chunks.append(current_chunk.strip())
+                 current_chunk = sentence
+
+         if current_chunk:
+             chunks.append(current_chunk.strip())
+
+     return chunks
+
+ model_dir = snapshot_download("Supertone/supertonic")
+ onnx_dir = f"{model_dir}/onnx"
+ text_to_speech = load_text_to_speech(onnx_dir, False)
+
+ def generate_speech(text_list, save_dir, voice_style="M1", total_step=5, speed=1.05, n_test=1, batch=None):
+
+     saved_files_list = []
+
+     voice_style_paths = [f"{model_dir}/voice_styles/{voice_style}.json"] * len(text_list)
+
+     assert len(voice_style_paths) == len(
+         text_list
+     ), f"Number of voice styles ({len(voice_style_paths)}) must match number of texts ({len(text_list)})"
+     bsz = len(voice_style_paths)
+
+     style = load_voice_style(voice_style_paths, verbose=True)
+
+     for n in range(n_test):
+         print(f"\n[{n+1}/{n_test}] Starting synthesis...")
+         with timer("Generating speech from text"):
+             if batch:
+                 wav, duration = text_to_speech.batch(text_list, style, total_step, speed)
+             else:
+                 wav, duration = text_to_speech(text_list[0], style, total_step, speed)
+         if not os.path.exists(save_dir):
+             os.makedirs(save_dir)
+         for b in range(bsz):
+             fname = f"{sanitize_filename(text_list[b], 20)}_{n+1}.wav"
+             w = wav[b, : int(text_to_speech.sample_rate * duration[b].item())]  # [T_trim]
+             sf.write(os.path.join(save_dir, fname), w, text_to_speech.sample_rate)
+             saved_files_list.append(f"{save_dir}/{fname}")
+             # print(f"Saved: {save_dir}/{fname}")
+     print("\n=== Synthesis completed successfully! ===")
+
+     return saved_files_list
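
For reference, a minimal way a caller such as app.py might exercise the new supertonic.py module is sketched below. This is a sketch under assumptions, not code from this commit: the output directory, sample sentence, and the printed path are placeholders, while "M1" and total_step=5 simply reuse the module's own defaults.

# Hypothetical usage of the CPU-only Supertonic pipeline added above.
# Importing supertonic runs the module-level snapshot_download and ONNX session setup.
from supertonic import generate_speech

wav_paths = generate_speech(
    text_list=["Hello from the CPU-only Supertonic pipeline."],
    save_dir="outputs",   # placeholder output directory
    voice_style="M1",     # the module's default voice style
    total_step=5,         # vector-estimator steps (module default)
    speed=1.05,
)
print(wav_paths)  # e.g. ["outputs/Hello_from_the_CPU_o_1.wav"]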