Spaces: Running on Zero

Commit 37a9836 · Parent(s): 6e4576a
add code

This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitignore +179 -0
- LICENSE +21 -0
- README.md +92 -14
- app.py +191 -0
- config.py +12 -0
- core/__init__.py +0 -0
- core/bark/__init__.py +5 -0
- core/bark/constants.py +18 -0
- core/bark/custom_context.py +79 -0
- core/bark/encodec.py +63 -0
- core/bark/generate_audio.py +117 -0
- core/bark/generate_audio_semantic_dataset.py +122 -0
- core/bark/generate_coarse.py +385 -0
- core/bark/generate_fine.py +210 -0
- core/bark/generate_semantic.py +361 -0
- core/bark/voice_clone.py +104 -0
- core/data_model/__init__.py +1 -0
- core/data_model/bark.py +337 -0
- core/memory/__init__.py +5 -0
- core/memory/common.py +187 -0
- core/memory/model_manager.py +289 -0
- core/memory/models.py +169 -0
- core/model/__init__.py +1 -0
- core/model/bark.py +425 -0
- core/model/hubert.py +237 -0
- core/trainer/__init__.py +1 -0
- core/trainer/custom_hubert_trainer.py +555 -0
- core/utils/__init__.py +7 -0
- core/utils/audio.py +104 -0
- core/utils/huggingface.py +169 -0
- core/utils/read_write_files.py +46 -0
- core/utils/text.py +13 -0
- event_handlers.py +436 -0
- generate_audio_semantic_dataset.py +155 -0
- prompts/de_speaker_0.npz +0 -0
- prompts/de_speaker_1.npz +0 -0
- prompts/de_speaker_2.npz +0 -0
- prompts/de_speaker_3.npz +0 -0
- prompts/de_speaker_4.npz +0 -0
- prompts/de_speaker_5.npz +0 -0
- prompts/de_speaker_6.npz +0 -0
- prompts/de_speaker_7.npz +0 -0
- prompts/de_speaker_8.npz +0 -0
- prompts/de_speaker_9.npz +0 -0
- prompts/en_speaker_0.npz +0 -0
- prompts/en_speaker_1.npz +0 -0
- prompts/en_speaker_2.npz +0 -0
- prompts/en_speaker_3.npz +0 -0
- prompts/en_speaker_4.npz +0 -0
- prompts/en_speaker_5.npz +0 -0
.gitignore
ADDED
@@ -0,0 +1,179 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc
bark_prompts/
generated_audio/

models/
.DS_Store
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Hao Huynh Nhat

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,14 +1,92 @@
(The previous README consisted of 14 blank lines, now replaced with the content below.)

# Generate Audio from text and clone voice with BARK

You can generate audio from text with a natural-sounding voice and clone any voice (the clone is not perfect).

The code has been tested on Python 3.12 and may also work on other versions.

Example generated audio files are in the /assets/audio folder.

## Features

- **Text-to-Audio Generation:** Generate speech from text using the BARK model (supports 'small' and 'large' variants).
- **Parameter Control:** Adjust semantic, coarse, and fine temperature settings for generation diversity. Set a generation seed for reproducibility.
- **Device Selection:** Run inference on available devices (CPU, CUDA, MPS).
- **Standard Voice Prompts:** Utilize built-in BARK voice prompts (`.npz` files) located in the `bark_prompts` directory.
- **Custom Voice Prompt Creation (Voice Cloning):**
  - Upload your own audio file (.wav, .mp3).
  - Generate a BARK-compatible semantic prompt (`.npz` file) using a custom-trained HuBERT model.
  - The generated prompt appears in the "Select Voice Prompt" dropdown for immediate use.
- **Audio Management:** View, play, and delete generated audio files directly within the interface.
- **Training Scripts:** Includes scripts to generate the necessary dataset (`generate_audio_semantic_dataset.py`) and train the custom HuBERT model (`train_hubert.py`).

## Custom Voice Cloning Model

The core of the custom voice prompt generation relies on a fine-tuned HuBERT model.

- **Model:** `sleeper371/hubert-for-bark-semantic` on Hugging Face ([Link](https://huggingface.co/sleeper371/hubert-for-bark-semantic))
- **Architecture:** This model uses a HuBERT base feature extractor followed by a Transformer decoder head.
- **Training:** It was trained on over 4700 sentence pairs, mapping audio waveforms to the semantic tokens generated by BARK's semantic model. The training used a cross-entropy loss objective.
- **Dataset:** The training dataset is available at `sleeper371/bark-wave-semantic` on Hugging Face ([Link](https://huggingface.co/datasets/sleeper371/bark-wave-semantic)).
- **Comparison:** This approach is inspired by projects like [gitmylo/bark-data-gen](https://github.com/gitmylo/bark-data-gen), but differs in the head architecture (that project used an LSTM head, while this one uses a Transformer decoder head).

## Setup and Installation

Follow these steps to set up the environment and run the application.

1. **Clone the Repository:**

2. **Create a Virtual Environment:**
   It's highly recommended to use a virtual environment to manage dependencies.

   ```bash
   # For Linux/macOS
   python3 -m venv venv
   source venv/bin/activate

   # For Windows
   python -m venv venv
   .\venv\Scripts\activate
   ```

3. **Install Requirements:**
   Make sure you have a `requirements.txt` file in the repository root containing all necessary packages (e.g., `gradio`, `torch`, `transformers`, `soundfile`, etc.).

   ```bash
   pip install -r requirements.txt
   ```

## Running the Application

Once the setup is complete, run the Gradio application:

```bash
python app.py
```

This will launch the Gradio interface, typically accessible at http://127.0.0.1:7860 in your web browser. The console output will provide the exact URL.

## Training Your Own Custom HuBERT Model

If you want to train your own HuBERT model for voice cloning:

1. Generate the dataset:

   - Use the generate_audio_semantic_dataset.py script.

2. Train the model:

   - Use the train_hubert.py script.
   - This script takes the generated dataset (audio paths and semantic token paths) to fine-tune a HuBERT model with a Transformer decoder head.
   - Configure training parameters (batch size, learning rate, epochs, output directory) within the script or via command-line arguments (if implemented).

## License

MIT

## Acknowledgements

- Suno AI, who trained the BARK models
- gitmylo, who inspired me to use HuBERT to predict semantic tokens from audio
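Note (not part of the repository files): the README above describes the voice-cloning model as a HuBERT base feature extractor followed by a Transformer decoder head, trained with cross-entropy on waveform-to-semantic-token pairs. The sketch below is only a rough, hypothetical illustration of that shape; the class name, hyperparameters, and checkpoint are assumptions, and the actual implementation lives in `core/model/hubert.py` and `core/trainer/custom_hubert_trainer.py` (not shown in this view).

```python
# Hedged sketch: HuBERT encoder + Transformer decoder head predicting BARK
# semantic tokens (vocab size 10_003). All hyperparameters are guesses.
import torch
import torch.nn as nn
from transformers import HubertModel


class HubertForBarkSemanticSketch(nn.Module):  # hypothetical name
    def __init__(self, vocab_size: int = 10_003, d_model: int = 768,
                 n_layers: int = 4, n_heads: int = 8):
        super().__init__()
        # HuBERT base acts as the audio feature extractor (hidden size 768)
        self.encoder = HubertModel.from_pretrained("facebook/hubert-base-ls960")
        self.token_emb = nn.Embedding(vocab_size, d_model)
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model, nhead=n_heads, batch_first=True
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=n_layers)
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, waveform: torch.Tensor, target_tokens: torch.Tensor) -> torch.Tensor:
        # waveform: (batch, samples) at 16 kHz; target_tokens: (batch, T_sem)
        memory = self.encoder(waveform).last_hidden_state      # (batch, T_enc, 768)
        tgt = self.token_emb(target_tokens)                    # (batch, T_sem, 768)
        causal_mask = nn.Transformer.generate_square_subsequent_mask(
            tgt.size(1)
        ).to(tgt.device)
        hidden = self.decoder(tgt, memory, tgt_mask=causal_mask)
        return self.lm_head(hidden)                            # logits for cross-entropy loss
```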
app.py
ADDED
@@ -0,0 +1,191 @@
import gradio as gr
from config import *
from event_handlers import *


# --- Gradio UI Definition ---
# theme = gr.themes.Default(primary_hue=gr.themes.colors.blue).set()
theme = gr.themes.Ocean(primary_hue=gr.themes.colors.blue).set()

with gr.Blocks(
    theme=theme,
    title="grAudio",
    css=".gradio-container { max-width: 95% !important; }",
) as app:

    # --- Global State ---
    initial_audio_list = load_existing_audio()
    audio_list_state = gr.State(value=initial_audio_list)
    newly_generated_state = gr.State([])
    # State to store the index of the selected row in the DataFrame
    selected_index_state = gr.State(-1)  # -1 means nothing selected

    # --- UI Layout ---
    gr.Markdown("# Generate Audio from text")
    with gr.Row(equal_height=False):
        # --- Column 1: Configuration (Left) ---
        with gr.Column(scale=2, min_width=350):
            gr.Markdown("### Generation Configuration")
            with gr.Accordion("Batch size & Temperatures", open=True):
                batch_size_number = gr.Number(
                    value=1,
                    label="Seed",
                    minimum=0,
                    step=1,
                    scale=1,
                )
                semantic_temp_slider = gr.Slider(
                    0.1, 1.0, value=0.7, step=0.1, label="Semantic Temp"
                )
                coarse_temp_slider = gr.Slider(
                    0.1, 1.0, value=0.7, step=0.1, label="Coarse Temp"
                )
                fine_temp_slider = gr.Slider(
                    0.1, 1.0, value=0.7, step=0.1, label="Fine Temp"
                )
            with gr.Accordion("Model, Devices", open=True):
                model_type_dropdown = gr.Dropdown(
                    choices=["small", "large"], value="small", label="Model Type"
                )

                available_devices, best_device = get_available_torch_devices()
                device_dropdown = gr.Dropdown(
                    choices=available_devices, value=best_device, label="Device"
                )
            with gr.Accordion("Voice Prompt", open=True):
                prompt_dropdown = gr.Dropdown(
                    choices=get_available_prompts(),
                    label="Select Voice Prompt",
                    info="Optional",
                    multiselect=False,
                    allow_custom_value=False,
                )
                refresh_prompts_btn = gr.Button(
                    "Refresh Prompts", variant="secondary", size="sm"
                )
            with gr.Accordion("Create New Voice Prompt", open=False):
                prompt_audio_upload = gr.File(
                    value=None,
                    file_count="single",
                    label="Upload Audio (.wav, .mp3)",
                    file_types=["audio"],
                    type="filepath",
                )
                create_prompt_btn = gr.Button("Create Prompt", variant="secondary")

        # --- Column 2: Text Input & Generate Button (Middle) ---
        with gr.Column(scale=4, min_width=600):
            gr.Markdown("### Text Input")
            text_input_block = gr.Textbox(
                lines=30,
                placeholder="If your text includes multiple long sentences, select a voice prompt to have consistent speech.\nDo not use long sentences; split them into multiple sentences, each shorter than 15 seconds.",
                label="Text Prompts",
            )
            generate_btn = gr.Button("Generate", variant="primary")
        # --- Column 3: Generated Audio Display (Right) - SIMPLIFIED ---
        with gr.Column(scale=2, min_width=250):
            gr.Markdown("### Generated Audio")
            # DataFrame to display the list
            audio_dataframe = gr.DataFrame(
                headers=["File", "Prompt", "Duration (s)"],
                datatype=["str", "str", "str"],
                interactive=True,  # Allow row selection
                row_count=(10, "dynamic"),  # Show ~10 rows, scroll if more
                col_count=(3, "fixed"),
                # value=format_audio_list_for_dataframe(initial_audio_list)  # Set initial value via app.load
            )
            # Single audio player for the selected item
            selected_audio_player = gr.Audio(
                label="Selected Audio",
                type="filepath",
                interactive=False,  # Only for playback
            )
            # Single delete button
            delete_selected_btn = gr.Button("Delete Selected Audio", variant="stop")

    # --- Event Handling ---

    # 1. Refresh Prompts Button
    refresh_prompts_btn.click(
        fn=update_available_prompts, inputs=None, outputs=[prompt_dropdown]
    )

    # 2. Create Prompt Button
    create_prompt_btn.click(
        fn=create_audio_prompt,
        inputs=[prompt_audio_upload, device_dropdown],
        outputs=[prompt_dropdown],
    )

    # 3. Generate Button -> Calls backend -> Outputs to temporary state
    generate_btn.click(
        fn=generate_batch_audio,
        inputs=[
            text_input_block,
            semantic_temp_slider,
            coarse_temp_slider,
            fine_temp_slider,
            batch_size_number,
            model_type_dropdown,
            device_dropdown,
            prompt_dropdown,
        ],
        outputs=[newly_generated_state],
    )

    # 4. Temporary State Change -> Updates the main audio list state
    newly_generated_state.change(
        fn=update_audio_list,
        inputs=[newly_generated_state, audio_list_state],
        outputs=[audio_list_state],
        show_progress="hidden",
    )

    # 5. Main Audio List State Change -> Updates the DataFrame display
    # Also clears selection when the list updates.
    audio_list_state.change(
        fn=format_audio_list_for_dataframe,
        inputs=[audio_list_state],
        outputs=[audio_dataframe],
        show_progress="hidden",
    ).then(  # Chain: after updating dataframe, clear selection player and index
        fn=lambda: (None, -1),  # Function returning values to clear outputs
        inputs=None,
        outputs=[selected_audio_player, selected_index_state],
        show_progress="hidden",
        queue=False,
    )

    # 6. DataFrame Row Selection -> Updates the selected index and audio player
    audio_dataframe.select(
        fn=handle_row_selection,
        inputs=[audio_list_state],  # Pass the full list state to find the filepath
        outputs=[
            selected_audio_player,
            selected_index_state,
        ],
        show_progress="hidden",
    )

    # 7. Delete Selected Button Click -> Calls delete handler
    delete_selected_btn.click(
        fn=handle_delete_selected,
        inputs=[selected_index_state, audio_list_state],  # Pass index and list
        outputs=[
            audio_list_state,  # Update the main list state
            selected_index_state,  # Clear the selected index
            selected_audio_player,  # Clear the audio player
        ],
        show_progress="hidden",
    )

    # 8. Initial Load: Populate the DataFrame
    app.load(
        fn=format_audio_list_for_dataframe,
        inputs=[audio_list_state],  # Use the initial state value
        outputs=[audio_dataframe],  # Render initial data into the DataFrame
    )


if __name__ == "__main__":
    app.launch(debug=True, share=False)
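Note (not part of the repository files): event_handlers.py appears in the file list above but its body is not included in this 50-file view. As a hedged sketch of how the wiring in app.py maps to handler signatures (one positional argument per component in `inputs`, one return value per component in `outputs`), something along these lines would satisfy the interface; the real implementations may differ.

```python
# Hypothetical shapes for two of the handlers wired in app.py above.
# Each handler receives one value per component in `inputs` and returns
# one value per component in `outputs`.
from typing import List, Optional, Tuple


def update_audio_list(new_items: List[dict], current_list: List[dict]) -> List[dict]:
    # Append freshly generated entries to the running audio list state.
    return current_list + new_items


def handle_delete_selected(
    selected_index: int, audio_list: List[dict]
) -> Tuple[List[dict], int, Optional[str]]:
    # Drop the selected row, reset the selection index, and clear the player.
    if 0 <= selected_index < len(audio_list):
        audio_list = audio_list[:selected_index] + audio_list[selected_index + 1:]
    return audio_list, -1, None
```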
config.py
ADDED
@@ -0,0 +1,12 @@
import os

# --- Configuration ---
PROMPT_DIR = "./prompts"
GENERATED_AUDIO_DIR = "./generated_audio"
os.makedirs(PROMPT_DIR, exist_ok=True)
os.makedirs(GENERATED_AUDIO_DIR, exist_ok=True)

# Constants for audio generation
DEFAULT_AUDIO_SAMPLE_RATE = 24000
DEFAULT_DURATION = 3
DEFAULT_FREQ = 440
core/__init__.py
ADDED
File without changes
core/bark/__init__.py
ADDED
@@ -0,0 +1,5 @@
from core.bark.generate_audio import *

from core.bark.encodec import *

from core.bark.voice_clone import *
core/bark/constants.py
ADDED
@@ -0,0 +1,18 @@
# original BARK semantic vocab size
SEMANTIC_VOCAB_SIZE = 10_000
# HuBERT model output vocab size
HUBERT_OUTPUT_VOCAB_SIZE = 10_003
CODEBOOK_SIZE = 1024
N_COARSE_CODEBOOKS = 2
COARSE_RATE_HZ = 75
COARSE_SEMANTIC_PAD_TOKEN = 12_048
COARSE_INFER_TOKEN = 12_050

# for the BERT model to get semantic tokens from raw texts
TEXT_ENCODING_OFFSET = 10_048
SEMANTIC_PAD_TOKEN = 10_000
TEXT_PAD_TOKEN = 129_595
SEMANTIC_INFER_TOKEN = 129_599
SEMANTIC_RATE_HZ = 49.9

N_FINE_CODEBOOKS = 8
core/bark/custom_context.py
ADDED
@@ -0,0 +1,79 @@
import contextlib
import torch
import funcy


"""
Custom context managers for PyTorch inference operations.

This module provides context managers for controlling:
- CUDA benchmarking settings
- Inference mode and gradient calculation
- Automatic mixed precision (AMP) casting

The main context manager `inference_mode()` combines all these settings
for optimal inference performance.
"""


class InferenceContext:
    """
    Context manager for controlling CUDA benchmarking settings.

    Args:
        benchmark (bool): Whether to enable cudnn benchmarking. Defaults to False
            since input lengths may vary in inference scenarios.

    This context manager saves and restores the original cudnn.benchmark setting
    when entering/exiting the context.
    """

    def __init__(self, benchmark=False):
        # we can't expect inputs to be the same length, so disable benchmarking by default
        self._chosen_cudnn_benchmark = benchmark
        self._cudnn_benchmark = None

    def __enter__(self):
        self._cudnn_benchmark = torch.backends.cudnn.benchmark
        torch.backends.cudnn.benchmark = self._chosen_cudnn_benchmark

    def __exit__(self, exc_type, exc_value, exc_traceback):
        torch.backends.cudnn.benchmark = self._cudnn_benchmark


if (
    torch.cuda.is_available()
    and hasattr(torch.cuda, "amp")
    and hasattr(torch.cuda.amp, "autocast")
    and hasattr(torch.cuda, "is_bf16_supported")
    and torch.cuda.is_bf16_supported()
):
    autocast = funcy.partial(
        torch.amp.autocast, dtype=torch.bfloat16, device_type="cuda"
    )
    """Context manager for automatic mixed precision (AMP) using bfloat16 where supported."""
else:

    @contextlib.contextmanager
    def autocast():
        """No-op autocast context manager when bfloat16 is not supported."""
        yield


@contextlib.contextmanager
def inference_mode():
    """
    Combined context manager for optimal inference performance.

    Combines:
    - CUDA benchmarking control
    - PyTorch inference mode
    - Disabled gradient calculation
    - Automatic mixed precision casting (where supported)

    Usage:
        with inference_mode():
            # inference operations here
    """
    with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
        yield
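Note (not part of the repository files): a short usage sketch of the `inference_mode()` context above, wrapping an arbitrary forward pass; the linear layer is only a stand-in for one of the BARK GPT models.

```python
# Minimal usage sketch for core.bark.custom_context.inference_mode.
import torch
from core.bark.custom_context import inference_mode

model = torch.nn.Linear(16, 4)   # stand-in for a real model
x = torch.randn(2, 16)

with inference_mode():
    # Runs with cudnn benchmarking disabled, gradients off, inference mode on,
    # and bfloat16 autocast when a bf16-capable CUDA device is available.
    y = model(x)

print(y.shape)  # torch.Size([2, 4])
```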
core/bark/encodec.py
ADDED
@@ -0,0 +1,63 @@
import torch
import numpy as np

from encodec import EncodecModel
from encodec.utils import convert_audio
from core.memory import model_manager, ModelEnum, env
from core.bark.custom_context import inference_mode


def encodec_decode_fine_tokens_to_audio(fine_tokens: torch.Tensor) -> np.ndarray:
    """
    Decode the given fine tokens using EnCodec's decoder.

    Expects fine_tokens of shape [codebook_size, timestep], concretely [8, 75 * duration_in_sec].

    Returns:
        np.ndarray of shape (B, C, T), with C = 1 for mono audio.
    """
    model_info = ModelEnum.ENCODEC24k.value

    model_wrapper = model_manager.get_model(model_info)
    model: EncodecModel = model_wrapper.model

    device = next(model.parameters()).device

    input_tensor = fine_tokens.transpose(0, 1).to(device)

    emb = model.quantizer.decode(input_tensor)

    output: torch.Tensor = model.decoder(emb)
    audio_arr = output.detach().cpu().numpy()

    del input_tensor, emb, output

    return audio_arr


def encodec_encode_audio(
    audio_sample: torch.Tensor, audio_sample_rate: int
) -> torch.Tensor:
    """
    Encode the given audio sample using the EnCodec model.

    audio_sample expected shape: (channels, samples).

    Returns codes as a tensor of shape [n_q, T], where n_q is typically 8 and T is the
    compressed time step dimension (75 steps per second for the 24 kHz model).
    """
    model_wrapper = model_manager.get_model(ModelEnum.ENCODEC24k.value)
    model: EncodecModel = model_wrapper.model

    device = next(model.parameters()).device

    wav = convert_audio(
        audio_sample, audio_sample_rate, model.sample_rate, model.channels
    )
    wav = wav.unsqueeze(0).float().to(device)

    # Extract discrete codes from EnCodec
    with inference_mode():
        encoded_frames = model.encode(wav)

    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)  # [B, n_q, T]

    return codes[0, :, :]
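Note (not part of the repository files): the shape contract that encodec_encode_audio and encodec_decode_fine_tokens_to_audio rely on can be reproduced with the encodec library directly. This sketch bypasses the project's model_manager and loads the 24 kHz model itself; the 6 kbps bandwidth (8 codebooks) is an assumption.

```python
# Sketch of the EnCodec shape contract used above, without the model_manager.
import torch
from encodec import EncodecModel

model = EncodecModel.encodec_model_24khz()
model.set_target_bandwidth(6.0)  # 6 kbps -> 8 codebooks at 75 Hz (assumed setting)

wav = torch.randn(1, 1, 24000)  # (batch, channels, samples): one second of audio
with torch.no_grad():
    encoded_frames = model.encode(wav)
    codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1)
    print(codes.shape)  # torch.Size([1, 8, 75]) -> [B, n_q, T], 75 steps per second

    # Decoding goes back through the quantizer and decoder, mirroring
    # encodec_decode_fine_tokens_to_audio (quantizer.decode expects [n_q, B, T]).
    emb = model.quantizer.decode(codes.transpose(0, 1))
    audio = model.decoder(emb)
print(audio.shape)  # roughly torch.Size([1, 1, 24000])
```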
core/bark/generate_audio.py
ADDED
@@ -0,0 +1,117 @@
import sys
import logging
from typing_extensions import Union, List
import numpy as np
import torch
from dataclasses import asdict

from core.bark.generate_semantic import generate_semantic_tokens_from_text
from core.bark.generate_coarse import generate_coarse_tokens_from_semantic
from core.bark.generate_fine import generate_fine_tokens_from_coarse


from core.data_model.bark import BarkPrompt, BarkGenerationConfig
from core.bark.encodec import encodec_decode_fine_tokens_to_audio
from core.bark.constants import SEMANTIC_PAD_TOKEN, SEMANTIC_RATE_HZ

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)


def generate_audio(
    texts: List[str],
    prompt: Union[BarkPrompt, None] = None,
    generation_config: BarkGenerationConfig = None,
    silent: bool = False,
) -> List[np.ndarray]:
    """
    Generate audio from text with an optional audio prompt.
    Args:
        texts (List[str]): Input texts to generate audio for. Must be non-empty.
        prompt (Union[BarkPrompt, None]): optional BarkPrompt (loaded from a .npz file) used as the audio prompt
        generation_config (BarkGenerationConfig): configuration used to generate audio
        silent (bool): if True, suppress progress bars

    """
    if prompt is not None:
        semantic_prompt = prompt.semantic_prompt if prompt is not None else None
        # if len(semantic_prompt.shape) == 2:
        #     semantic_prompt = semantic_prompt[0, :]
        assert (
            len(semantic_prompt.shape) == 1
        ), "expecting semantic prompt as a 1D array"
    else:
        semantic_prompt = None

    if generation_config is None:
        logger.info("using BARK default generation config")
        generation_config = BarkGenerationConfig()

    semantic_tokens = generate_semantic_tokens_from_text(
        texts,
        semantic_prompt,
        **asdict(generation_config),
        silent=silent,
    )

    # because we generate audio in batches, all audios in one batch have the length
    # of the longest audio. We need to remove the random section of each shorter audio
    # after it has ended.

    # coarse token generation
    coarse_tokens = generate_coarse_tokens_from_semantic(
        semantic_tokens, prompt, **asdict(generation_config), silent=silent
    )

    # fine token generation
    fine_tokens = generate_fine_tokens_from_coarse(
        coarse_tokens=coarse_tokens,
        history_prompt=prompt,
        temperature=generation_config.generate_fine_temperature,
        use_small_model=generation_config.use_small_model,
        silent=silent,
    )

    # decoding the codes
    audio_wave = encodec_decode_fine_tokens_to_audio(fine_tokens)
    assert (
        len(audio_wave.shape) == 3
    ), f"expecting audio tensor of shape (B, C, T), received {audio_wave.shape}"

    audio_wave = audio_wave.squeeze(1)  # squeeze the channel dimension
    res = remove_padded_segment_from_audio(audio_wave, semantic_tokens.cpu().numpy())
    return res


def remove_padded_segment_from_audio(
    audio_wave: np.ndarray, semantic_tokens: np.ndarray, audio_sample_rate: int = 24000
) -> List[np.ndarray]:
    # Because the semantic token tensor's time step dimension matches the longest audio
    # in the batch, all shorter audios would contain random sound after their end.
    # We cut each shorter audio at the position where its semantic sequence hits the
    # pad token to avoid random sound in the generated results.
    # SEMANTIC_PAD_TOKEN is also the end-of-sentence token.
    # This function assumes audio_wave has shape (batch, T).
    assert (
        len(audio_wave.shape) == 2
    ), f"expecting ndarray of shape (B, T), received {audio_wave.shape}"
    mask = semantic_tokens == SEMANTIC_PAD_TOKEN
    semantic_eos_indices = np.argmax(mask.astype(np.int32), axis=1)  # Shape [batch]
    wave_eos_indices: np.ndarray = semantic_eos_indices * (
        audio_sample_rate / SEMANTIC_RATE_HZ
    )
    wave_eos_indices = wave_eos_indices.astype(np.int32)
    res = []
    for wave_index in range(audio_wave.shape[0]):
        if wave_eos_indices[wave_index] == 0:
            # zero means this audio is the longest one in the batch and there is no need to cut the padded segment
            res.append(audio_wave[wave_index])
            continue
        start_padding_index = wave_eos_indices[wave_index]
        res.append(audio_wave[wave_index, :start_padding_index])

    return res
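Note (not part of the repository files): the index arithmetic in remove_padded_segment_from_audio maps a position in the semantic-token sequence to a position in the waveform via the ratio of the sample rate to the semantic token rate. A small worked example with made-up token values:

```python
# Worked example of the trimming math in remove_padded_segment_from_audio.
import numpy as np

SEMANTIC_PAD_TOKEN = 10_000
SEMANTIC_RATE_HZ = 49.9
sample_rate = 24_000

# Two sequences in a batch; the second hits the pad/EOS token at step 100.
semantic_tokens = np.full((2, 250), 42)
semantic_tokens[1, 100:] = SEMANTIC_PAD_TOKEN

mask = semantic_tokens == SEMANTIC_PAD_TOKEN
eos_idx = np.argmax(mask.astype(np.int32), axis=1)        # [0, 100]; 0 means "no padding found"
wave_eos = (eos_idx * (sample_rate / SEMANTIC_RATE_HZ)).astype(np.int32)
print(wave_eos)  # [0, 48096] -> the second waveform is cut at ~2.0 seconds
```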
core/bark/generate_audio_semantic_dataset.py
ADDED
@@ -0,0 +1,122 @@
from typing import List
import numpy as np
from tqdm import tqdm
from dataclasses import asdict
from core.bark.generate_semantic import generate_semantic_tokens_from_text
from core.bark.generate_coarse import generate_coarse_tokens_from_semantic
from core.bark.generate_fine import generate_fine_tokens_from_coarse
from core.bark.encodec import encodec_decode_fine_tokens_to_audio
from core.bark.generate_audio import remove_padded_segment_from_audio
from core.data_model import WavSemantic, WavSemanticDataset, BarkGenerationConfig
from core.bark.constants import SEMANTIC_PAD_TOKEN


def generate_wav_semantic_dataset(
    text_file_path: str,
    generation_config: BarkGenerationConfig,
    batch_size: int = 16,
    silent: bool = False,
    save_path: str = "./dataset",
    save_data_as_raw_audio: bool = True,
) -> None:
    """
    Generate a dataset of (wav, semantic_tokens) pairs for training a model to predict semantic tokens from audio.

    Args:
        text_file_path: path to the text file whose lines will be used to generate audio data
        generation_config: the config used to generate data, including whether the `small` or `large` BARK coarse and fine model variants are used
        batch_size: batch size used when generating data
        silent: if True, suppress the progress bar
        save_path: path to save the generated dataset
        save_data_as_raw_audio: if True, waves will be saved as raw audio; otherwise they will be saved as a compressed .npz file
    """
    texts = read_text_file(text_file_path)
    assert len(texts) > 0, "empty text data"

    mini_batches = [texts[i : i + batch_size] for i in range(0, len(texts), batch_size)]
    progress_bar = tqdm(
        total=len(mini_batches), disable=silent, desc="Generating wav-semantic dataset"
    )
    for batch in mini_batches:
        semantic_tokens = generate_semantic_tokens_from_text(
            texts=batch, semantic_prompt=None, silent=True, **asdict(generation_config)
        )

        coarse = generate_coarse_tokens_from_semantic(
            semantic_tokens=semantic_tokens,
            history_prompt=None,
            silent=True,
            **asdict(generation_config)
        )

        fine = generate_fine_tokens_from_coarse(
            coarse_tokens=coarse,
            history_prompt=None,
            temperature=generation_config.generate_fine_temperature,
            use_small_model=generation_config.use_small_model,
            silent=True,
        )

        # generate audio waves from the fine tokens
        waves = encodec_decode_fine_tokens_to_audio(fine)
        # remove the channel dimension
        waves = waves.squeeze(1)

        waves = remove_padded_segment_from_audio(waves, semantic_tokens.cpu().numpy())

        save_semantic_wave_data(
            batch,
            waves,
            semantic_tokens.detach().cpu().numpy(),
            24000,
            generation_config,
            save_path,
            save_data_as_raw_audio,
        )

        progress_bar.update(1)
        del semantic_tokens, coarse, fine, waves


def save_semantic_wave_data(
    texts: List[str],
    waves: List[np.ndarray],
    semantic_tokens: np.ndarray,
    sample_rate: int,
    generation_config: BarkGenerationConfig,
    save_path: str,
    save_raw_audio: bool,
) -> None:
    """
    Save the given data as a WavSemantic dataset.
    """
    examples = []
    assert (
        len(texts) == len(waves) == semantic_tokens.shape[0]
    ), "unexpected array length"

    model_type = "small" if generation_config.use_small_model else "large"

    # remove the padding tokens at the end of the semantic sequences
    mask = semantic_tokens == SEMANTIC_PAD_TOKEN
    semantic_padding_indices = np.argmax(mask.astype(np.int32), axis=1)

    for i, (text, padding_index) in enumerate(zip(texts, semantic_padding_indices)):
        if padding_index == 0:
            padding_index = len(semantic_tokens[i])
        example = WavSemantic(text, waves[i], semantic_tokens[i, :padding_index])
        examples.append(example)

    dataset = WavSemanticDataset(sample_rate, generation_config, model_type, examples)

    dataset.save(save_path, save_raw_audio)


def read_text_file(path: str) -> List[str]:
    with open(path, "r") as file:
        lines = file.readlines()
        # Remove newline characters
        lines = [line.strip() for line in lines]
    return lines
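Note (not part of the repository files): a hedged usage sketch for generate_wav_semantic_dataset. The exact fields of BarkGenerationConfig live in core/data_model/bark.py, which is not shown in this view, so the default constructor and the input file name are assumptions.

```python
# Hypothetical driver for the dataset generation above.
from core.data_model import BarkGenerationConfig
from core.bark.generate_audio_semantic_dataset import generate_wav_semantic_dataset

if __name__ == "__main__":
    config = BarkGenerationConfig()  # assumed to have sensible defaults
    generate_wav_semantic_dataset(
        text_file_path="sentences.txt",   # one sentence per line (hypothetical file)
        generation_config=config,
        batch_size=8,
        silent=False,
        save_path="./dataset",
        save_data_as_raw_audio=True,
    )
```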
core/bark/generate_coarse.py
ADDED
@@ -0,0 +1,385 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from tqdm import tqdm
|
5 |
+
from typing_extensions import Optional, Union, Tuple
|
6 |
+
|
7 |
+
from core.bark.constants import *
|
8 |
+
from core.model.bark import GPT
|
9 |
+
from core.data_model.bark import BarkPrompt
|
10 |
+
from core.bark.custom_context import inference_mode
|
11 |
+
|
12 |
+
from core.memory import model_manager, ModelEnum, env
|
13 |
+
|
14 |
+
# number of coarse tokens per one semantic token for one second
|
15 |
+
num_coarse_per_semantic = (COARSE_RATE_HZ / SEMANTIC_RATE_HZ) * N_COARSE_CODEBOOKS
|
16 |
+
|
17 |
+
|
18 |
+
def generate_coarse_tokens_from_semantic(
|
19 |
+
semantic_tokens: torch.Tensor,
|
20 |
+
history_prompt: Union[BarkPrompt, None] = None,
|
21 |
+
generate_coarse_temperature: Union[float, None] = 0.6,
|
22 |
+
coarse_top_k: Union[int, None] = None,
|
23 |
+
coarse_top_p: Union[float, None] = None,
|
24 |
+
silent: bool = False,
|
25 |
+
max_coarse_history: int = 630,
|
26 |
+
sliding_window_length: int = 60,
|
27 |
+
use_kv_caching: bool = True,
|
28 |
+
use_small_model: bool = False,
|
29 |
+
**kwargs,
|
30 |
+
) -> torch.Tensor:
|
31 |
+
# Validate inputs
|
32 |
+
_validate_semantic_tokens(semantic_tokens)
|
33 |
+
_validate_history_prompt(history_prompt)
|
34 |
+
|
35 |
+
assert (
|
36 |
+
60 <= max_coarse_history <= 630
|
37 |
+
), "max_coarse_history must be between 60 and 630"
|
38 |
+
assert (
|
39 |
+
max_coarse_history + sliding_window_length <= 1024 - 256
|
40 |
+
), "Context exceeds model limit"
|
41 |
+
|
42 |
+
# align the number of semantic history token with the given max_coarse_history
|
43 |
+
max_semantic_history = int(max_coarse_history / num_coarse_per_semantic)
|
44 |
+
|
45 |
+
# align the length of the provided semantic and coarse history
|
46 |
+
semantic_history, coarse_history = _process_history_prompt(
|
47 |
+
history_prompt, max_semantic_history, num_coarse_per_semantic
|
48 |
+
)
|
49 |
+
|
50 |
+
# Load coarse model
|
51 |
+
coarse_model_info = (
|
52 |
+
ModelEnum.BARK_COARSE_SMALL.value
|
53 |
+
if use_small_model
|
54 |
+
else ModelEnum.BARK_COARSE.value
|
55 |
+
)
|
56 |
+
model_wrapper = model_manager.get_model(coarse_model_info)
|
57 |
+
model: GPT = model_wrapper.model
|
58 |
+
assert isinstance(model, GPT), "unexpected model type"
|
59 |
+
|
60 |
+
# total_steps is the number of coarse tokens the model need to predict
|
61 |
+
total_steps = int(
|
62 |
+
np.floor(semantic_tokens.size(1) * num_coarse_per_semantic / N_COARSE_CODEBOOKS)
|
63 |
+
* N_COARSE_CODEBOOKS
|
64 |
+
)
|
65 |
+
assert (
|
66 |
+
total_steps > 0 and total_steps % N_COARSE_CODEBOOKS == 0
|
67 |
+
), "Invalid step count"
|
68 |
+
|
69 |
+
batch, T = semantic_tokens.size()
|
70 |
+
# expand the semantic history at the batch dimension to match with the semantic_tokens tensor's batch size
|
71 |
+
# for the concatenation
|
72 |
+
semantic_history = semantic_history[None].expand((batch, -1))
|
73 |
+
full_semantic = torch.hstack([semantic_history, semantic_tokens]).to(torch.int32)
|
74 |
+
base_semantic_index = semantic_history.size(1)
|
75 |
+
|
76 |
+
# Generate coarse tokens
|
77 |
+
with inference_mode():
|
78 |
+
generated_coarse = _generate_coarse_with_sliding_window(
|
79 |
+
model,
|
80 |
+
full_semantic,
|
81 |
+
coarse_history,
|
82 |
+
total_steps,
|
83 |
+
base_semantic_index,
|
84 |
+
max_semantic_history,
|
85 |
+
num_coarse_per_semantic,
|
86 |
+
generate_coarse_temperature,
|
87 |
+
coarse_top_k,
|
88 |
+
coarse_top_p,
|
89 |
+
silent,
|
90 |
+
max_coarse_history,
|
91 |
+
sliding_window_length,
|
92 |
+
use_kv_caching,
|
93 |
+
)
|
94 |
+
|
95 |
+
# remove the history prompt from the generated tokens
|
96 |
+
generated_coarse = generated_coarse[:, coarse_history.size(0) :]
|
97 |
+
assert generated_coarse.size(1) == total_steps, "Generated length mismatch"
|
98 |
+
|
99 |
+
# Reshape and adjust coarse codes
|
100 |
+
B, L = generated_coarse.shape
|
101 |
+
# Broadcasting subtracts from all elements
|
102 |
+
coarse_output = (
|
103 |
+
generated_coarse.reshape(B, -1, N_COARSE_CODEBOOKS).transpose(1, 2)
|
104 |
+
- SEMANTIC_VOCAB_SIZE
|
105 |
+
)
|
106 |
+
|
107 |
+
for codebook_idx in range(1, N_COARSE_CODEBOOKS):
|
108 |
+
coarse_output[:, codebook_idx, :] -= codebook_idx * CODEBOOK_SIZE
|
109 |
+
|
110 |
+
return coarse_output
|
111 |
+
|
112 |
+
|
113 |
+
def _validate_semantic_tokens(semantic_tokens: torch.Tensor) -> None:
|
114 |
+
"""
|
115 |
+
Validate the input semantic tokens tensor.
|
116 |
+
|
117 |
+
Args:
|
118 |
+
semantic_tokens: Tensor of semantic tokens (1D).
|
119 |
+
|
120 |
+
Raises:
|
121 |
+
AssertionError: If the tensor does not meet expected criteria.
|
122 |
+
"""
|
123 |
+
assert isinstance(
|
124 |
+
semantic_tokens, torch.Tensor
|
125 |
+
), "Semantic tokens must be a torch.Tensor"
|
126 |
+
assert semantic_tokens.dim() == 2, "Semantic tokens must be 2D"
|
127 |
+
assert semantic_tokens.size(1) > 0, "Semantic tokens tensor cannot be empty"
|
128 |
+
assert semantic_tokens.min() >= 0, "Semantic tokens must be non-negative"
|
129 |
+
assert (
|
130 |
+
semantic_tokens.max() <= SEMANTIC_VOCAB_SIZE
|
131 |
+
), "Semantic tokens exceed vocab size"
|
132 |
+
|
133 |
+
|
134 |
+
def _validate_history_prompt(history_prompt: Union[BarkPrompt, None]) -> None:
|
135 |
+
"""
|
136 |
+
Validate the history prompt if provided.
|
137 |
+
|
138 |
+
Args:
|
139 |
+
history_prompt: BarkPrompt object or None.
|
140 |
+
|
141 |
+
Raises:
|
142 |
+
AssertionError: If the prompt does not meet expected criteria.
|
143 |
+
"""
|
144 |
+
if history_prompt is None:
|
145 |
+
return
|
146 |
+
|
147 |
+
assert isinstance(
|
148 |
+
history_prompt, BarkPrompt
|
149 |
+
), "History prompt must be a BarkPrompt object"
|
150 |
+
semantic = history_prompt.semantic_prompt
|
151 |
+
coarse = history_prompt.coarse_prompt
|
152 |
+
|
153 |
+
assert (
|
154 |
+
isinstance(semantic, torch.Tensor) and semantic.dim() == 1
|
155 |
+
), "Semantic prompt must be 1D tensor"
|
156 |
+
assert (
|
157 |
+
semantic.size(0) > 0
|
158 |
+
and semantic.min() >= 0
|
159 |
+
and semantic.max() <= SEMANTIC_VOCAB_SIZE - 1
|
160 |
+
)
|
161 |
+
assert (
|
162 |
+
isinstance(coarse, torch.Tensor) and coarse.dim() == 2
|
163 |
+
), "Coarse prompt must be 2D tensor"
|
164 |
+
assert (
|
165 |
+
coarse.shape[0] == N_COARSE_CODEBOOKS
|
166 |
+
), "Coarse prompt must have correct number of codebooks"
|
167 |
+
assert coarse.min() >= 0 and coarse.max() <= CODEBOOK_SIZE - 1
|
168 |
+
|
169 |
+
|
170 |
+
def _process_history_prompt(
|
171 |
+
history_prompt: Union[BarkPrompt, None],
|
172 |
+
max_semantic_history: int,
|
173 |
+
coarse_to_semantic_ratio: float,
|
174 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
175 |
+
"""
|
176 |
+
Process the history prompt into semantic and coarse history tensors.
|
177 |
+
Trim on the left (keep the right most tokens)
|
178 |
+
Args:
|
179 |
+
history_prompt: BarkPrompt object or None.
|
180 |
+
max_semantic_history: Maximum number of semantic history tokens.
|
181 |
+
coarse_to_semantic_ratio: Ratio of coarse to semantic token rates.
|
182 |
+
|
183 |
+
Returns:
|
184 |
+
Tuple[semantic_history, coarse_history]: Processed history tensors.
|
185 |
+
"""
|
186 |
+
if history_prompt is None:
|
187 |
+
return torch.tensor(
|
188 |
+
[], dtype=torch.int32, device=torch.device(env.DEVICE)
|
189 |
+
), torch.tensor([], dtype=torch.int32, device=torch.device(env.DEVICE))
|
190 |
+
|
191 |
+
semantic_history = history_prompt.semantic_prompt
|
192 |
+
coarse_history = history_prompt.coarse_prompt
|
193 |
+
|
194 |
+
# Add offset then "ravel("F")" flatten
|
195 |
+
coarse_history = _add_codebook_offset(coarse_history, CODEBOOK_SIZE)
|
196 |
+
coarse_history_flat = coarse_history.T.flatten() + SEMANTIC_VOCAB_SIZE
|
197 |
+
|
198 |
+
# Trim histories to fit max length
|
199 |
+
n_semantic_hist = min(
|
200 |
+
max_semantic_history,
|
201 |
+
semantic_history.size(0) - semantic_history.size(0) % 2, # Ensure even length
|
202 |
+
int(coarse_history_flat.size(0) // coarse_to_semantic_ratio),
|
203 |
+
)
|
204 |
+
n_coarse_hist = int(round(n_semantic_hist * coarse_to_semantic_ratio))
|
205 |
+
|
206 |
+
semantic_history = semantic_history[-n_semantic_hist:].to(torch.int32)
|
207 |
+
coarse_history_flat = coarse_history_flat[-n_coarse_hist:].to(torch.int32)
|
208 |
+
coarse_history_flat = coarse_history_flat[:-2] # Original time alignment hack
|
209 |
+
|
210 |
+
return semantic_history, coarse_history_flat
|
211 |
+
|
212 |
+
|
213 |
+
def _add_codebook_offset(x: torch.Tensor, offset: int) -> torch.Tensor:
|
214 |
+
"""
|
215 |
+
x shape (n_codebook, T)
|
216 |
+
n_codebook start from 0 to n, from the second codebook row on we add offset * row_num
|
217 |
+
"""
|
218 |
+
for n in range(1, x.shape[0]):
|
219 |
+
x[n, :] += offset * n
|
220 |
+
return x
|
221 |
+
|
222 |
+
|
223 |
+
def _sample_coarse_token(
|
224 |
+
logits: torch.Tensor,
|
225 |
+
temperature: Union[float, None],
|
226 |
+
top_k: Optional[int],
|
227 |
+
top_p: Optional[float],
|
228 |
+
logit_start_idx: int,
|
229 |
+
) -> torch.Tensor:
|
230 |
+
"""
|
231 |
+
Sample a coarse token from model logits with filtering.
|
232 |
+
|
233 |
+
Args:
|
234 |
+
logits: Model output logits (shape [batch, seq, vocab]).
|
235 |
+
temperature: Sampling temperature for randomness.
|
236 |
+
top_k: Number of top logits to consider, if specified.
|
237 |
+
top_p: Nucleus sampling threshold, if specified.
|
238 |
+
logit_start_idx: Starting index for coarse token logits.
|
239 |
+
|
240 |
+
Returns:
|
241 |
+
torch.Tensor: Sampled token with offset applied (shape [1]).
|
242 |
+
"""
|
243 |
+
relevant_logits = logits[:, 0, logit_start_idx : logit_start_idx + CODEBOOK_SIZE]
|
244 |
+
|
245 |
+
if temperature is None:
|
246 |
+
probs = F.softmax(relevant_logits, dim=-1)
|
247 |
+
next_token = torch.argmax(probs, dim=-1, keepdim=True).to(torch.int32)
|
248 |
+
else:
|
249 |
+
if top_p is not None: # this branch is untested
|
250 |
+
# Optimize with NumPy for top-p filtering,
|
251 |
+
original_device = relevant_logits.device
|
252 |
+
logits_np = relevant_logits.detach().cpu().numpy().astype(np.float32)
|
253 |
+
sorted_indices = np.argsort(logits_np)[::-1]
|
254 |
+
sorted_logits = logits_np[sorted_indices]
|
255 |
+
cumulative_probs = np.cumsum(
|
256 |
+
F.softmax(torch.from_numpy(sorted_logits), dim=-1).numpy()
|
257 |
+
)
|
258 |
+
indices_to_remove = cumulative_probs > top_p
|
259 |
+
indices_to_remove[1:] = indices_to_remove[:-1].copy()
|
260 |
+
indices_to_remove[0] = False
|
261 |
+
logits_np[sorted_indices[indices_to_remove]] = -np.inf
|
262 |
+
relevant_logits = torch.from_numpy(logits_np).to(original_device)
|
263 |
+
|
264 |
+
if top_k is not None:
|
265 |
+
top_values, _ = torch.topk(
|
266 |
+
relevant_logits, min(top_k, relevant_logits.size(-1))
|
267 |
+
)
|
268 |
+
relevant_logits[relevant_logits < top_values[:, [-1]]] = -float("Inf")
|
269 |
+
|
270 |
+
probs = F.softmax(relevant_logits / temperature, dim=-1)
|
271 |
+
next_token = torch.multinomial(probs, num_samples=1).to(torch.int32)
|
272 |
+
return next_token + logit_start_idx
|
273 |
+
|
274 |
+
|
275 |
+
def _generate_coarse_with_sliding_window(
|
276 |
+
model: GPT,
|
277 |
+
full_semantic: torch.Tensor,
|
278 |
+
coarse_history: torch.Tensor,
|
279 |
+
total_steps: int,
|
280 |
+
base_semantic_index: int,
|
281 |
+
max_semantic_history: int,
|
282 |
+
coarse_per_semantic: float,
|
283 |
+
temperature: float,
|
284 |
+
top_k: Optional[int],
|
285 |
+
top_p: Optional[float],
|
286 |
+
silent: bool,
|
287 |
+
max_coarse_history: int,
|
288 |
+
sliding_window_length: int,
|
289 |
+
use_kv_caching: bool,
|
290 |
+
) -> torch.Tensor:
|
291 |
+
"""
|
292 |
+
Generate coarse tokens using a sliding window approach.
|
293 |
+
|
294 |
+
Args:
|
295 |
+
model: GPT model for coarse token generation.
|
296 |
+
full_semantic: 2D tensor of Concatenated semantic history and input tokens.
|
297 |
+
coarse_history: 1D tensor, Initial coarse history tokens.
|
298 |
+
total_steps: Total number of coarse tokens to generate.
|
299 |
+
base_semantic_index: Start index of input semantic tokens.
|
300 |
+
max_semantic_history: Maximum semantic history length.
|
301 |
+
coarse_per_semantic: Coarse-to-semantic token ratio.
|
302 |
+
temperature: Sampling temperature.
|
303 |
+
top_k: Top-k filtering parameter.
|
304 |
+
top_p: Top-p filtering parameter.
|
305 |
+
silent: Suppresses progress bar if True.
|
306 |
+
max_coarse_history: Maximum coarse history length.
|
307 |
+
sliding_window_length: Tokens per window.
|
308 |
+
use_kv_caching: Enables KV caching.
|
309 |
+
|
310 |
+
Returns:
|
311 |
+
torch.Tensor: Generated coarse tokens (1D).
|
312 |
+
"""
|
313 |
+
device = next(model.parameters()).device
|
314 |
+
semantic_tensor = full_semantic.to(device) # Add batch dimension
|
315 |
+
coarse_tensor = (
|
316 |
+
coarse_history[None].expand((semantic_tensor.shape[0], -1)).to(device)
|
317 |
+
)
|
318 |
+
|
319 |
+
window_count = int(np.ceil(total_steps / sliding_window_length))
|
320 |
+
progress_bar = tqdm(
|
321 |
+
total=window_count, disable=silent, desc="Generating coarse tokens"
|
322 |
+
)
|
323 |
+
step_counter = 0 # equivalent to the number of coarse tokens generated so far
|
324 |
+
|
325 |
+
for _ in range(window_count):
|
326 |
+
current_semantic_idx = base_semantic_index + int(
|
327 |
+
round(step_counter / coarse_per_semantic)
|
328 |
+
)
|
329 |
+
|
330 |
+
window_start = max(0, current_semantic_idx - max_semantic_history)
|
331 |
+
semantic_window = semantic_tensor[:, window_start : window_start + 256]
|
332 |
+
semantic_window = F.pad(
|
333 |
+
semantic_window,
|
334 |
+
(0, 256 - semantic_window.shape[-1]),
|
335 |
+
"constant",
|
336 |
+
COARSE_SEMANTIC_PAD_TOKEN,
|
337 |
+
)
|
338 |
+
|
339 |
+
input_tensor = torch.hstack(
|
340 |
+
[
|
341 |
+
semantic_window,
|
342 |
+
torch.tensor([COARSE_INFER_TOKEN], device=device)[None].expand(
|
343 |
+
(semantic_window.shape[0], -1)
|
344 |
+
),
|
345 |
+
coarse_tensor[:, -max_coarse_history:],
|
346 |
+
]
|
347 |
+
)
|
348 |
+
|
349 |
+
kv_cache = None
|
350 |
+
for _ in range(sliding_window_length):
|
351 |
+
if step_counter >= total_steps:
|
352 |
+
break
|
353 |
+
|
354 |
+
is_first_codebook = step_counter % N_COARSE_CODEBOOKS == 0
|
355 |
+
logit_start_idx = (
|
356 |
+
SEMANTIC_VOCAB_SIZE + (1 - int(is_first_codebook)) * CODEBOOK_SIZE
|
357 |
+
)
|
358 |
+
|
359 |
+
model_input = (
|
360 |
+
input_tensor[:, [-1]]
|
361 |
+
if use_kv_caching and kv_cache is not None
|
362 |
+
else input_tensor
|
363 |
+
)
|
364 |
+
logits, kv_cache = model(
|
365 |
+
model_input, use_cache=use_kv_caching, past_kv=kv_cache
|
366 |
+
)
|
367 |
+
next_token = _sample_coarse_token(
|
368 |
+
logits,
|
369 |
+
temperature,
|
370 |
+
top_k,
|
371 |
+
top_p,
|
372 |
+
logit_start_idx,
|
373 |
+
)
|
374 |
+
|
375 |
+
coarse_tensor = torch.cat((coarse_tensor, next_token), dim=1)
|
376 |
+
input_tensor = torch.cat((input_tensor, next_token), dim=1)
|
377 |
+
|
378 |
+
step_counter += 1
|
379 |
+
del logits, next_token
|
380 |
+
|
381 |
+
del input_tensor
|
382 |
+
progress_bar.update(1)
|
383 |
+
|
384 |
+
progress_bar.close()
|
385 |
+
return coarse_tensor
|
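To make the window bookkeeping above concrete, here is a hedged sketch (with made-up numbers rather than the repo's real coarse-to-semantic frame-rate ratio) of how each sliding window re-derives its semantic anchor from the number of coarse tokens generated so far:

import numpy as np

coarse_per_semantic = 3.0        # illustrative ratio, not the real constant
sliding_window_length = 60
max_semantic_history = 256
total_steps = 300

step_counter = 0
for _ in range(int(np.ceil(total_steps / sliding_window_length))):
    # each window re-anchors on the semantic position implied by the coarse tokens generated so far
    semantic_idx = int(round(step_counter / coarse_per_semantic))
    window_start = max(0, semantic_idx - max_semantic_history)
    print(f"coarse step {step_counter:3d} -> semantic window [{window_start}, {window_start + 256})")
    step_counter = min(step_counter + sliding_window_length, total_steps)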
core/bark/generate_fine.py
ADDED
@@ -0,0 +1,210 @@
1 |
+
from typing import Union
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from tqdm import tqdm
|
5 |
+
import torch.nn.functional as F
|
6 |
+
|
7 |
+
from core.data_model.bark import BarkPrompt
|
8 |
+
from core.bark.custom_context import inference_mode
|
9 |
+
from core.model import FineGPT
|
10 |
+
from core.memory import ModelEnum, model_manager
|
11 |
+
from core.bark.constants import *
|
12 |
+
|
13 |
+
|
14 |
+
def generate_fine_tokens_from_coarse(
|
15 |
+
coarse_tokens: torch.Tensor,
|
16 |
+
history_prompt: Union[BarkPrompt, None] = None,
|
17 |
+
temperature: float = 0.5,
|
18 |
+
use_small_model: bool = True,
|
19 |
+
silent: bool = False,
|
20 |
+
) -> torch.Tensor:
|
21 |
+
"""
|
22 |
+
Generate fine-grained audio codes from coarse audio codes using the BARK fine model.
|
23 |
+
|
24 |
+
This function takes coarse tokens (representing a partial set of audio codebooks) and
|
25 |
+
autoregressively predicts the remaining fine tokens, optionally conditioning on a history
|
26 |
+
prompt. The process involves sliding a context window over the sequence, predicting 512
|
27 |
+
timesteps at a time based on a 1024-timestep input.
|
28 |
+
|
29 |
+
Prompt tokens are trimmed on the left (keeping the rightmost tokens).
|
30 |
+
|
31 |
+
Args:
|
32 |
+
coarse_tokens (torch.Tensor): Coarse audio codes with shape (batch, n_coarse, sequence_length),
|
33 |
+
where n_coarse <= N_FINE_CODEBOOKS - 1 and values are in [0, CODEBOOK_SIZE - 1].
|
34 |
+
history_prompt (BarkPrompt, optional): Historical fine tokens for conditioning, or None.
|
35 |
+
temperature (float): Sampling temperature for fine token prediction; if None, uses argmax.
|
36 |
+
silent (bool): If True, suppresses progress bar output.
|
37 |
+
|
38 |
+
Returns:
|
39 |
+
torch.Tensor: Fine audio codes with shape (N_FINE_CODEBOOKS, sequence_length),
|
40 |
+
matching the input sequence_length.
|
41 |
+
|
42 |
+
Raises:
|
43 |
+
AssertionError: If input validation fails for coarse_tokens or history_prompt.
|
44 |
+
"""
|
45 |
+
# Validate inputs
|
46 |
+
_validate_coarse_tokens(coarse_tokens=coarse_tokens)
|
47 |
+
history_fine_tokens = _validate_and_load_history(history_prompt=history_prompt)
|
48 |
+
batch, n_coarse, sequence_length = coarse_tokens.shape
|
49 |
+
|
50 |
+
# Load the fine model
|
51 |
+
model_info = (
|
52 |
+
ModelEnum.BARK_FINE_SMALL.value
|
53 |
+
if use_small_model
|
54 |
+
else ModelEnum.BARK_FINE.value
|
55 |
+
)
|
56 |
+
model_wrapper = model_manager.get_model(model_info)
|
57 |
+
model: FineGPT = model_wrapper.model
|
58 |
+
assert isinstance(model, FineGPT), "Expected FineGPT model type"
|
59 |
+
device = next(model.parameters()).device
|
60 |
+
coarse_tokens = coarse_tokens.to(device)
|
61 |
+
# stack coarse tokens with padding for remaining codebooks across the codebook dimension
|
62 |
+
# e.g. original coarse_tokens shape (B, 2, T); after concatenation: (B, 8, T), where N_FINE_CODEBOOKS = 8
|
63 |
+
pad_tensor = torch.full(
|
64 |
+
(batch, N_FINE_CODEBOOKS - n_coarse, sequence_length),
|
65 |
+
CODEBOOK_SIZE,
|
66 |
+
dtype=torch.int32,
|
67 |
+
device=device,
|
68 |
+
)
|
69 |
+
|
70 |
+
input_tensor = torch.cat((coarse_tokens, pad_tensor), dim=1)
|
71 |
+
|
72 |
+
# Prepend history if provided. Maximum history time step is 512
|
73 |
+
# this is a horizontal prepend on the left of the previous padded input tensor
|
74 |
+
# output tensor: (8, history_timestep + coarse_timestep), history_timestep <= 512
|
75 |
+
n_history = 0
|
76 |
+
if history_fine_tokens is not None:
|
77 |
+
history_fine_tokens = history_fine_tokens.expand((batch, N_FINE_CODEBOOKS, -1))
|
78 |
+
history_limit = min(history_fine_tokens.shape[-1], 512)
|
79 |
+
history_slice = history_fine_tokens[:, :, -history_limit:].to(
|
80 |
+
device, dtype=torch.int32
|
81 |
+
)
|
82 |
+
input_tensor = torch.cat((history_slice, input_tensor), dim=-1)
|
83 |
+
n_history = history_limit  # number of history timesteps prepended from the prompt
|
84 |
+
|
85 |
+
# right-pad if total_length (history timesteps + coarse timesteps) is less than the model context (1024)
|
86 |
+
total_length = input_tensor.shape[-1]
|
87 |
+
padding_needed = max(0, 1024 - total_length)
|
88 |
+
if padding_needed > 0:
|
89 |
+
padding = torch.full(
|
90 |
+
(batch, N_FINE_CODEBOOKS, padding_needed),
|
91 |
+
CODEBOOK_SIZE,
|
92 |
+
dtype=torch.int32,
|
93 |
+
device=device,
|
94 |
+
)
|
95 |
+
input_tensor = torch.cat((input_tensor, padding), dim=2)
|
96 |
+
total_length = input_tensor.shape[-1]
|
97 |
+
|
98 |
+
# Calculate number of prediction loops
|
99 |
+
context_window = 1024 # Model's input context size
|
100 |
+
prediction_step = 512 # Number of new timesteps predicted per loop
|
101 |
+
remaining_length = max(0, sequence_length - (context_window - n_history))
|
102 |
+
extra_loops = (remaining_length + prediction_step - 1) // prediction_step
|
103 |
+
n_loops = 1 + extra_loops # Total loops: initial + extra
|
104 |
+
|
105 |
+
# Process sequence in sliding windows
|
106 |
+
input_tensor = input_tensor.transpose(
|
107 |
+
-2, -1
|
108 |
+
)  # Shape: (batch, total_length, N_FINE_CODEBOOKS)
|
109 |
+
with inference_mode():
|
110 |
+
for loop_idx in tqdm(
|
111 |
+
range(n_loops), disable=silent, desc="Generating fine tokens"
|
112 |
+
):
|
113 |
+
# Define window boundaries
|
114 |
+
# the last loop, by using window_start = (total_length - context_window),
|
115 |
+
# the input will be: input_tensor[:, -1024:, :], the last context_window timestep of the input
|
116 |
+
window_start = min(
|
117 |
+
loop_idx * prediction_step, total_length - context_window
|
118 |
+
)
|
119 |
+
|
120 |
+
fill_start = min(
|
121 |
+
n_history + loop_idx * prediction_step, total_length - prediction_step
|
122 |
+
)
|
123 |
+
fill_offset = fill_start - window_start
|
124 |
+
window_end = window_start + context_window
|
125 |
+
|
126 |
+
# Extract input window
|
127 |
+
# Shape: (batch, 1024, N_FINE_CODEBOOKS)
|
128 |
+
input_window = input_tensor[:, window_start:window_end, :]
|
129 |
+
|
130 |
+
# Predict fine codebooks autoregressively
|
131 |
+
for codebook_idx in range(n_coarse, N_FINE_CODEBOOKS):
|
132 |
+
# Shape: (batch, 1024, vocab_size)
|
133 |
+
logits = model(codebook_idx, input_window)
|
134 |
+
if temperature is None:
|
135 |
+
preds = torch.argmax(
|
136 |
+
logits[:, fill_offset:, :CODEBOOK_SIZE], dim=-1
|
137 |
+
)
|
138 |
+
else:
|
139 |
+
scaled_logits = logits[:, :, :CODEBOOK_SIZE] / temperature
|
140 |
+
probs = F.softmax(scaled_logits, dim=-1)
|
141 |
+
probs = probs[:, fill_offset:, :]
|
142 |
+
# Reshape to [B * N, C] for multinomial
|
143 |
+
B, N, C = probs.shape  # B=batch, N=window steps after fill_offset, C=CODEBOOK_SIZE
|
144 |
+
probs_2d = probs.reshape(-1, C) # Shape: [2 * N, 1024]
|
145 |
+
|
146 |
+
# Perform multinomial sampling
|
147 |
+
# Shape: [2 * N, 1]
|
148 |
+
preds = torch.multinomial(probs_2d, num_samples=1)
|
149 |
+
|
150 |
+
# Reshape back to [2, N] after squeezing
|
151 |
+
preds = preds.squeeze(-1).reshape(B, N)
|
152 |
+
|
153 |
+
input_window[:, fill_offset:, codebook_idx] = preds.to(torch.int32)
|
154 |
+
|
155 |
+
# Update main tensor with predictions
|
156 |
+
fill_length = min(prediction_step, total_length - fill_start)
|
157 |
+
input_tensor[:, fill_start : fill_start + fill_length, codebook_idx] = (
|
158 |
+
input_window[
|
159 |
+
:, fill_offset : fill_offset + fill_length, codebook_idx
|
160 |
+
]
|
161 |
+
)
|
162 |
+
|
163 |
+
# Extract final result, removing history and padding
|
164 |
+
# Shape: (N_FINE_CODEBOOKS, sequence_length)
|
165 |
+
fine_tokens = input_tensor.transpose(-1, -2)[
|
166 |
+
:, :, n_history : n_history + sequence_length
|
167 |
+
]
|
168 |
+
|
169 |
+
# Verify output shape matches input sequence length
|
170 |
+
assert fine_tokens.shape[-1] == sequence_length, "Output length mismatch"
|
171 |
+
|
172 |
+
return fine_tokens
|
173 |
+
|
174 |
+
|
175 |
+
def _validate_coarse_tokens(coarse_tokens: torch.Tensor) -> None:
|
176 |
+
"""Validate coarse token tensor properties."""
|
177 |
+
assert isinstance(
|
178 |
+
coarse_tokens, torch.Tensor
|
179 |
+
), "coarse_tokens must be a torch.Tensor"
|
180 |
+
assert len(coarse_tokens.shape) == 3, "coarse_tokens must be 3D"
|
181 |
+
assert (
|
182 |
+
1 <= coarse_tokens.shape[1] <= N_FINE_CODEBOOKS - 1
|
183 |
+
), "Invalid number of coarse codebooks"
|
184 |
+
assert coarse_tokens.shape[-1] > 0, "Sequence length must be positive"
|
185 |
+
assert (
|
186 |
+
coarse_tokens.min() >= 0 and coarse_tokens.max() <= CODEBOOK_SIZE
|
187 |
+
), "Token values out of range"
|
188 |
+
|
189 |
+
|
190 |
+
def _validate_and_load_history(
|
191 |
+
history_prompt: Union[BarkPrompt, None],
|
192 |
+
) -> Union[torch.Tensor, None]:
|
193 |
+
"""Validate and load history prompt if provided."""
|
194 |
+
if history_prompt is None:
|
195 |
+
return None
|
196 |
+
|
197 |
+
history_fine_tokens = history_prompt.fine_prompt
|
198 |
+
assert isinstance(
|
199 |
+
history_fine_tokens, torch.Tensor
|
200 |
+
), "history_prompt.fine_prompt must be a torch.Tensor"
|
201 |
+
assert len(history_fine_tokens.shape) == 2, "History must be 2D"
|
202 |
+
assert (
|
203 |
+
history_fine_tokens.shape[0] == N_FINE_CODEBOOKS
|
204 |
+
), "History must have all fine codebooks"
|
205 |
+
assert history_fine_tokens.shape[1] > 0, "History must not be empty"
|
206 |
+
assert (
|
207 |
+
history_fine_tokens.min() >= 0
|
208 |
+
and history_fine_tokens.max() <= CODEBOOK_SIZE - 1
|
209 |
+
), "History values out of range"
|
210 |
+
return history_fine_tokens
|
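A sketch of the fine-model window bookkeeping used above, assuming a 1024-step context and 512 newly filled steps per pass; n_history and sequence_length are made-up example values, not outputs of the repo:

context_window, prediction_step = 1024, 512
n_history, sequence_length = 256, 1800
total_length = n_history + sequence_length          # already >= 1024 here, so no right-padding

remaining = max(0, sequence_length - (context_window - n_history))
n_loops = 1 + (remaining + prediction_step - 1) // prediction_step

for loop_idx in range(n_loops):
    window_start = min(loop_idx * prediction_step, total_length - context_window)
    fill_start = min(n_history + loop_idx * prediction_step, total_length - prediction_step)
    fill_offset = fill_start - window_start
    print(loop_idx, window_start, fill_start, fill_offset)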
core/bark/generate_semantic.py
ADDED
@@ -0,0 +1,361 @@
1 |
+
from typing import List, Tuple, Optional, Union
|
2 |
+
import re
|
3 |
+
from tqdm import tqdm
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
import torch.nn.functional as F
|
7 |
+
|
8 |
+
from transformers import BertTokenizer
|
9 |
+
|
10 |
+
from core.memory import model_manager, ModelEnum, env
|
11 |
+
from core.bark.custom_context import inference_mode
|
12 |
+
from core.bark.constants import *
|
13 |
+
from core.model import GPT
|
14 |
+
|
15 |
+
SEMANTIC_EOS_TOKEN = 10_000
|
16 |
+
|
17 |
+
|
18 |
+
def generate_semantic_tokens_from_text(
|
19 |
+
texts: List[str],
|
20 |
+
semantic_prompt: Union[torch.Tensor, None] = None,
|
21 |
+
temperature: Union[float, None] = 0.7,
|
22 |
+
semantic_top_k: Union[int, None] = None,
|
23 |
+
semantic_top_p: Union[int, None] = None,
|
24 |
+
min_eos_p: float = 0.2,
|
25 |
+
max_gen_duration_second: Union[float, None] = None,
|
26 |
+
allow_early_stop: bool = True,
|
27 |
+
use_kv_caching: bool = True,
|
28 |
+
use_small_model: bool = True,
|
29 |
+
silent: Union[bool, None] = False,
|
30 |
+
max_token_ids_per_sentence: int = 256,
|
31 |
+
**kwargs,
|
32 |
+
) -> torch.Tensor:
|
33 |
+
# trim whitespace and collapse repeated whitespace characters
|
34 |
+
texts = _preprocess_texts(texts)
|
35 |
+
assert all([len(text) > 0 for text in texts]), f"invalid input text {texts}"
|
36 |
+
|
37 |
+
if semantic_prompt is None:
|
38 |
+
semantic_prompt = torch.tensor([])
|
39 |
+
else:
|
40 |
+
assert isinstance(
|
41 |
+
semantic_prompt, torch.Tensor
|
42 |
+
), f"expecting semantic_prompt of type torch.Tensor, received {type(semantic_prompt)}"
|
43 |
+
assert semantic_prompt.dim() == 1, "expect 1D tensor as semantic_prompt"
|
44 |
+
|
45 |
+
# load the GPT-style model that generate semantic token from text
|
46 |
+
# and the BERT tokenizer to memory
|
47 |
+
text_model_info = (
|
48 |
+
ModelEnum.BARK_TEXT_SMALL.value
|
49 |
+
if use_small_model
|
50 |
+
else ModelEnum.BARK_TEXT.value
|
51 |
+
)
|
52 |
+
|
53 |
+
text_model = model_manager.get_model(text_model_info)
|
54 |
+
assert text_model.model is not None, "text model is None"
|
55 |
+
assert text_model.preprocessor is not None, "tokenizer for the text model is None"
|
56 |
+
|
57 |
+
assert isinstance(
|
58 |
+
text_model.model, GPT
|
59 |
+
), f"expecting model of type GPT, got {type(text_model.model)}"
|
60 |
+
|
61 |
+
assert isinstance(
|
62 |
+
text_model.preprocessor, BertTokenizer
|
63 |
+
), f"expecting preprocessor of type BertTokenizer, got {type(text_model.preprocessor)}"
|
64 |
+
|
65 |
+
model: GPT = text_model.model
|
66 |
+
tokenizer: BertTokenizer = text_model.preprocessor
|
67 |
+
device = next(model.parameters()).device
|
68 |
+
|
69 |
+
# tokenize the given text using the BERT tokenizer
|
70 |
+
token_ids = [tokenizer.encode(text, add_special_tokens=False) for text in texts]
|
71 |
+
|
72 |
+
# for each token_ids of each sentence, append an encoding offset token
|
73 |
+
token_ids = [np.array(sentence) + TEXT_ENCODING_OFFSET for sentence in token_ids]
|
74 |
+
|
75 |
+
# encoded text must have length 256, following the original implementation
|
76 |
+
# pad on the right if the sentence's token_ids are shorter, trim on the left (keeping the rightmost tokens) if longer than 256 tokens
|
77 |
+
token_ids = [
|
78 |
+
trim_or_pad_array(sentence, TEXT_PAD_TOKEN, max_token_ids_per_sentence)
|
79 |
+
for sentence in token_ids
|
80 |
+
]
|
81 |
+
|
82 |
+
token_ids_tensor = torch.vstack(token_ids).to(dtype=torch.int32, device=device)
|
83 |
+
|
84 |
+
# when the token_ids list has one element (batch size = 1), the stacking above may produce a 1D tensor
|
85 |
+
# we need to check and make it 2D
|
86 |
+
if len(token_ids_tensor.shape) == 1:
|
87 |
+
token_ids_tensor = token_ids_tensor.unsqueeze(0)
|
88 |
+
# the semantic prompt also needs to be an array of 256 discrete tokens
|
89 |
+
semantic_prompt = trim_or_pad_array(semantic_prompt, SEMANTIC_PAD_TOKEN, 256)
|
90 |
+
|
91 |
+
# need to replicate the semantic_prompt array to match the shape of the token_ids for concatenation
|
92 |
+
semantic_prompt = (
|
93 |
+
semantic_prompt.unsqueeze(0).expand((token_ids_tensor.shape[0], -1)).to(device)
|
94 |
+
)
|
95 |
+
|
96 |
+
# final input is the concatenation of the token_ids and the semantic tokens array
|
97 |
+
input_tensor = torch.cat(
|
98 |
+
[
|
99 |
+
token_ids_tensor, # shape (batch_size, T)
|
100 |
+
semantic_prompt,
|
101 |
+
torch.tensor([SEMANTIC_INFER_TOKEN], device=device)
|
102 |
+
.unsqueeze(0)
|
103 |
+
.expand((token_ids_tensor.shape[0], -1)),
|
104 |
+
],
|
105 |
+
dim=1,
|
106 |
+
).to(torch.int64)
|
107 |
+
|
108 |
+
# 256 token_ids, 256 prompt tokens, 1 semantic_infer token as the last column
|
109 |
+
assert (
|
110 |
+
input_tensor.shape[1] == 256 + 256 + 1
|
111 |
+
), f"expecting tensor shape [batch, 513], received {input_tensor.shape}"
|
112 |
+
|
113 |
+
with inference_mode():
|
114 |
+
output: torch.Tensor = _generate_semantic(
|
115 |
+
model=model,
|
116 |
+
x=input_tensor,
|
117 |
+
temperature=temperature,
|
118 |
+
top_k=semantic_top_k,
|
119 |
+
top_p=semantic_top_p,
|
120 |
+
min_eos_p=min_eos_p,
|
121 |
+
max_gen_duration_s=max_gen_duration_second,
|
122 |
+
allow_early_stop=allow_early_stop,
|
123 |
+
use_kv_caching=use_kv_caching,
|
124 |
+
silent=silent,
|
125 |
+
)
|
126 |
+
|
127 |
+
validate_semantic_token_output(output)
|
128 |
+
return output
|
129 |
+
|
130 |
+
|
131 |
+
def _generate_semantic(
|
132 |
+
model: GPT,
|
133 |
+
x: torch.Tensor,
|
134 |
+
temperature: float = 0.7,
|
135 |
+
top_k: Optional[int] = None,
|
136 |
+
top_p: Optional[float] = None,
|
137 |
+
min_eos_p: float = 0.2,
|
138 |
+
max_gen_duration_s: Optional[float] = None,
|
139 |
+
allow_early_stop: bool = True,
|
140 |
+
use_kv_caching: bool = False,
|
141 |
+
silent: bool = False,
|
142 |
+
) -> torch.Tensor:
|
143 |
+
# Maximum number of tokens to generate
|
144 |
+
max_steps = 2048
|
145 |
+
|
146 |
+
# Initialize progress bar for user feedback (custom due to unpredictable stopping)
|
147 |
+
progress_bar = tqdm(
|
148 |
+
total=max_steps, disable=silent, desc="Generating semantic tokens"
|
149 |
+
)
|
150 |
+
last_progress = 0
|
151 |
+
|
152 |
+
# Key-value cache for attention optimization
|
153 |
+
kv_cache = None
|
154 |
+
|
155 |
+
# Autoregressive generation loop
|
156 |
+
for step in range(max_steps):
|
157 |
+
# Determine input based on KV caching
|
158 |
+
if use_kv_caching and kv_cache is not None:
|
159 |
+
# Use only the last token with cached attention states
|
160 |
+
x_input = x[:, [-1]] # Shape [1, 1]
|
161 |
+
else:
|
162 |
+
# Use full sequence (recomputes attention each time)
|
163 |
+
x_input = x # Shape [1, seq_len]
|
164 |
+
|
165 |
+
# Forward pass through the model
|
166 |
+
logits, kv_cache = model(
|
167 |
+
x_input,
|
168 |
+
merge_context=True, # Merges text and semantic history context
|
169 |
+
past_kv=kv_cache, # Previous attention states
|
170 |
+
use_cache=use_kv_caching, # Enables caching if requested
|
171 |
+
)
|
172 |
+
|
173 |
+
# Sample the next token and check for early stopping
|
174 |
+
next_token, should_stop = _sample_next_token(
|
175 |
+
logits=logits,
|
176 |
+
temperature=temperature,
|
177 |
+
top_k=top_k,
|
178 |
+
top_p=top_p,
|
179 |
+
semantic_eos_token=SEMANTIC_EOS_TOKEN,
|
180 |
+
allow_early_stop=allow_early_stop,
|
181 |
+
min_eos_p=min_eos_p,
|
182 |
+
)
|
183 |
+
|
184 |
+
# Check stopping conditions
|
185 |
+
# only stop if all generations in the batch reached the stopping condition
|
186 |
+
if torch.all(should_stop):
|
187 |
+
progress_bar.update(step - last_progress + 1)
|
188 |
+
break
|
189 |
+
|
190 |
+
if step == max_steps - 1:
|
191 |
+
progress_bar.update()
|
192 |
+
break
|
193 |
+
|
194 |
+
# Append the new token to the sequence
|
195 |
+
x = torch.cat((x, next_token), dim=1)
|
196 |
+
|
197 |
+
# Update duration and progress
|
198 |
+
# total_duration_s += duration_per_step
|
199 |
+
if step > last_progress:
|
200 |
+
progress_bar.update(step - last_progress)
|
201 |
+
last_progress = step
|
202 |
+
|
203 |
+
# Clean up tensors to manage memory
|
204 |
+
del logits, next_token
|
205 |
+
|
206 |
+
# Finalize progress bar
|
207 |
+
progress_bar.total = step + 1
|
208 |
+
progress_bar.close()
|
209 |
+
|
210 |
+
# Extract generated tokens (skip initial 513 context tokens)
|
211 |
+
output = x[:, 256 + 256 + 1 :].detach()
|
212 |
+
|
213 |
+
return output
|
214 |
+
|
215 |
+
|
216 |
+
def _sample_next_token(
|
217 |
+
logits: torch.Tensor,  # shape [batch, seq, vocab_size]
|
218 |
+
temperature: float,
|
219 |
+
top_k: Optional[int],
|
220 |
+
top_p: Optional[float],
|
221 |
+
semantic_eos_token: int,
|
222 |
+
allow_early_stop: bool,
|
223 |
+
min_eos_p: Optional[float],
|
224 |
+
) -> Tuple[torch.Tensor, torch.BoolTensor]:
|
225 |
+
"""
|
226 |
+
Sample the next token from logits with optional top-k, top-p filtering and early stopping.
|
227 |
+
|
228 |
+
Args:
|
229 |
+
logits: Tensor of shape [batch, seq, vocab_size] containing model predictions.
|
230 |
+
temperature: Controls randomness of sampling (lower = more deterministic).
|
231 |
+
top_k: If set, keeps only the top-k logits.
|
232 |
+
top_p: If set, applies nucleus (top-p) filtering.
|
233 |
+
semantic_eos_token: Token ID representing end-of-sequence (equal to the semantic vocab size).
|
234 |
+
allow_early_stop: Whether to check for EOS token or probability threshold.
|
235 |
+
min_eos_p: Minimum probability for EOS to trigger early stop.
|
236 |
+
eos_token: Token ID representing end-of-sequence.
|
237 |
+
|
238 |
+
Returns:
|
239 |
+
Tuple[next_token, should_stop]:
|
240 |
+
- next_token: Sampled token (shape [batch, 1]).
|
241 |
+
- should_stop: Whether to stop generation (EOS detected).
|
242 |
+
"""
|
243 |
+
# Extract logits for the last position in the sequence
|
244 |
+
relevant_logits = logits[:, -1, :semantic_eos_token]
|
245 |
+
|
246 |
+
# Append EOS logit if early stopping is allowed
|
247 |
+
if allow_early_stop:
|
248 |
+
eos_logit = logits[:, -1, [semantic_eos_token]]
|
249 |
+
relevant_logits = torch.hstack((relevant_logits, eos_logit))
|
250 |
+
|
251 |
+
# select the token with the highest probability
|
252 |
+
if temperature is None:
|
253 |
+
# next_token shape (B, 1)
|
254 |
+
probs = F.softmax(relevant_logits, dim=-1)
|
255 |
+
next_token = torch.argmax(probs, dim=-1, keepdim=True)
|
256 |
+
# when the model predicts token_id 206, it tends to keep predicting that same token_id under argmax
|
257 |
+
# we will intentionally avoid that token_id here
|
258 |
+
if torch.any(next_token == 206):
|
259 |
+
next_token = anything_but(probs, 206)
|
260 |
+
|
261 |
+
# do some maneuvers to introduce diversity in the sampling of the next token
|
262 |
+
else:
|
263 |
+
# Apply top-p (nucleus) filtering for diversity
|
264 |
+
if top_p is not None: # this if branch is untested
|
265 |
+
# Convert to NumPy for faster sorting (optimization from original)
|
266 |
+
original_device = relevant_logits.device
|
267 |
+
logits_np = relevant_logits.detach().cpu().type(torch.float32).numpy()
|
268 |
+
sorted_indices = np.argsort(logits_np)[::-1] # Descending order
|
269 |
+
sorted_logits = logits_np[sorted_indices]
|
270 |
+
cumulative_probs = np.cumsum(
|
271 |
+
F.softmax(torch.from_numpy(sorted_logits), dim=-1).numpy()
|
272 |
+
)
|
273 |
+
indices_to_remove = cumulative_probs > top_p
|
274 |
+
# Shift to keep at least one
|
275 |
+
indices_to_remove[1:] = indices_to_remove[:-1].copy()
|
276 |
+
indices_to_remove[0] = False # Ensure top token stays
|
277 |
+
logits_np[sorted_indices[indices_to_remove]] = -np.inf
|
278 |
+
relevant_logits = torch.from_numpy(logits_np).to(original_device)
|
279 |
+
|
280 |
+
# Apply top-k filtering for diversity
|
281 |
+
if top_k is not None:
|
282 |
+
top_values, _ = torch.topk(
|
283 |
+
relevant_logits, min(top_k, relevant_logits.size(-1))
|
284 |
+
)
|
285 |
+
# compare the whole logit tensor to its k_th largest value, batch wise
|
286 |
+
relevant_logits[relevant_logits < top_values[:, [-1]]] = -float("Inf")
|
287 |
+
|
288 |
+
# Compute probabilities with temperature scaling
|
289 |
+
probs = F.softmax(relevant_logits / temperature, dim=-1)
|
290 |
+
|
291 |
+
# Sample the next token
|
292 |
+
next_token = torch.multinomial(probs, num_samples=1).to(torch.int32)
|
293 |
+
|
294 |
+
# Check for early stopping conditions for each sequence in the batch
|
295 |
+
if allow_early_stop:
|
296 |
+
# EOS token is vocab_size when appended
|
297 |
+
is_eos_token = (next_token == semantic_eos_token).flatten()
|
298 |
+
eos_prob_high = min_eos_p is not None and probs[:, -1] >= min_eos_p
|
299 |
+
should_stop = torch.logical_or(is_eos_token, eos_prob_high)
|
300 |
+
|
301 |
+
# when batch dimension is 1, next_token is a 1D array, need to make it 2D
|
302 |
+
if len(next_token.shape) == 1:
|
303 |
+
next_token = next_token.unsqueeze(0)
|
304 |
+
return next_token, should_stop
|
305 |
+
|
306 |
+
|
307 |
+
# select the second largest probability token if the argmax is the avoided token
|
308 |
+
# otherwise select the argmax token
|
309 |
+
def anything_but(probs: torch.Tensor, avoid_id: int) -> torch.Tensor:
|
310 |
+
# probs shape (B, C)
|
311 |
+
# return tensor shape (B, 1)
|
312 |
+
values, indices = torch.topk(probs, 2, dim=-1)
|
313 |
+
selected = []
|
314 |
+
# loop over the batch dimension
|
315 |
+
for b in range(probs.shape[0]):
|
316 |
+
if indices[b, 0] == avoid_id:
|
317 |
+
selected.append(indices[b, 1])
|
318 |
+
continue
|
319 |
+
selected.append(indices[b, 0])
|
320 |
+
return torch.tensor(selected, dtype=torch.int32, device=probs.device).unsqueeze(1)
|
321 |
+
|
322 |
+
|
323 |
+
def validate_semantic_token_output(output: torch.Tensor) -> None:
|
324 |
+
assert torch.all(
|
325 |
+
(0 <= output) & (output <= SEMANTIC_VOCAB_SIZE)
|
326 |
+
), "unexpected output tokens"
|
327 |
+
|
328 |
+
|
329 |
+
# preprocess the texts for the generate_text_semantic model
|
330 |
+
def _preprocess_texts(texts: List[str]) -> List[str]:
|
331 |
+
return [re.sub(r"\s+", " ", text).strip() for text in texts]
|
332 |
+
|
333 |
+
|
334 |
+
def trim_or_pad_array(
|
335 |
+
array: Union[np.ndarray, torch.Tensor], pad_token: int, max_length: int = 256
|
336 |
+
) -> torch.Tensor:
|
337 |
+
"""
|
338 |
+
Trim on the left (keep the rightmost tokens), pad on the right.
|
339 |
+
"""
|
340 |
+
# Convert np.ndarray to torch.Tensor if necessary
|
341 |
+
if isinstance(array, np.ndarray):
|
342 |
+
tensor = torch.from_numpy(array).to(device=torch.device(env.DEVICE))
|
343 |
+
else: # Already a torch.Tensor
|
344 |
+
tensor = array
|
345 |
+
|
346 |
+
# Get the current length
|
347 |
+
current_length = tensor.shape[0]
|
348 |
+
|
349 |
+
if current_length > max_length:
|
350 |
+
# Trim from the end (last max_length elements)
|
351 |
+
return tensor[-max_length:]
|
352 |
+
|
353 |
+
elif current_length < max_length:
|
354 |
+
# Pad on the right with pad_token up to max_length
|
355 |
+
padding = (0, max_length - current_length)
|
356 |
+
return torch.nn.functional.pad(
|
357 |
+
tensor, padding, mode="constant", value=pad_token
|
358 |
+
)
|
359 |
+
|
360 |
+
# If length equals max_length, just return as is
|
361 |
+
return tensor
|
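For reference, a hedged sketch of the 513-token context that generate_semantic_tokens_from_text assembles (256 text ids, 256 semantic-prompt ids, one infer token); the token ids used here are placeholders rather than the repo's constants:

import torch
import torch.nn.functional as F

# Placeholder ids for the sketch; the real values live in core/bark/constants.py
TEXT_PAD_TOKEN, SEMANTIC_PAD_TOKEN, SEMANTIC_INFER_TOKEN = 129_595, 10_000, 129_599

text_ids = torch.arange(10)                                            # a short "sentence"
text_ids = F.pad(text_ids, (0, 256 - text_ids.shape[0]), value=TEXT_PAD_TOKEN)
prompt = torch.full((256,), SEMANTIC_PAD_TOKEN)                        # empty semantic history
ctx = torch.cat([text_ids, prompt, torch.tensor([SEMANTIC_INFER_TOKEN])])
assert ctx.shape[0] == 256 + 256 + 1                                   # same layout asserted above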
core/bark/voice_clone.py
ADDED
@@ -0,0 +1,104 @@
1 |
+
import torch
|
2 |
+
import torchaudio
import numpy as np
|
3 |
+
from typing import Optional
|
4 |
+
|
5 |
+
from core.utils import read_audio_file
|
6 |
+
from core.bark import encodec_encode_audio
|
7 |
+
|
8 |
+
from core.model.hubert import HuBERTForBarkSemantic
|
9 |
+
from core.memory import model_manager, ModelEnum
|
10 |
+
from core.bark.custom_context import InferenceContext
|
11 |
+
from core.data_model import *
|
12 |
+
|
13 |
+
|
14 |
+
HUBERT_SAMPLE_RATE = 16000
|
15 |
+
|
16 |
+
|
17 |
+
def generate_semantic_tokens_from_hubert(
|
18 |
+
waves: torch.Tensor,
|
19 |
+
audio_sample_rate: int,
|
20 |
+
temperature: float,
|
21 |
+
eos_p: float,
|
22 |
+
max_length: int,
|
23 |
+
device: Optional[torch.device],
|
24 |
+
inference_dtype: torch.dtype = torch.float32,
|
25 |
+
) -> torch.Tensor:
|
26 |
+
"""
|
27 |
+
Generate semantic tokens from audio using the HuBERT model.
|
28 |
+
|
29 |
+
Args:
|
30 |
+
waves: 2D tensor of raw audio samples (shape: [B, T], where T is the number of samples)
|
31 |
+
audio_sample_rate: Sample rate of the input audio (resampled to 16 kHz for HuBERT when different)
|
32 |
+
temperature, eos_p, max_length: Sampling controls forwarded to the HuBERT model's generate method
|
33 |
+
device: Torch device to run the model on (defaults to CUDA if available, else CPU)
|
34 |
+
max_length: Maximum length of semantic tokens to return (optional, for truncation)
|
35 |
+
|
36 |
+
Returns:
|
37 |
+
torch.Tensor: 1D tensor of semantic tokens (e.g., shape [N], where N is the sequence length)
|
38 |
+
|
39 |
+
Raises:
|
40 |
+
RuntimeError: If HuBERT model loading or processing fails
|
41 |
+
"""
|
42 |
+
assert (
|
43 |
+
len(waves.shape) == 2
|
44 |
+
), f"expecting a tensor of shape [B, T], got {waves.shape}"
|
45 |
+
waves = waves.to(device)
|
46 |
+
|
47 |
+
# HuBERT expects audio at 16 kHz; resample if necessary
|
48 |
+
if audio_sample_rate != HUBERT_SAMPLE_RATE:
|
49 |
+
resampler = torchaudio.transforms.Resample(
|
50 |
+
orig_freq=audio_sample_rate, new_freq=HUBERT_SAMPLE_RATE
|
51 |
+
).to(device)
|
52 |
+
waves = resampler(waves)
|
53 |
+
|
54 |
+
model = model_manager.get_model(ModelEnum.HuBERTBaseForBarkSemantic.value).model
|
55 |
+
|
56 |
+
assert isinstance(
|
57 |
+
model, HuBERTForBarkSemantic
|
58 |
+
), f"expecting HuBERTForBarkSemantic model type, received {type(model)}"
|
59 |
+
|
60 |
+
waves = waves.to(dtype=inference_dtype)
|
61 |
+
model = model.to(dtype=inference_dtype)
|
62 |
+
|
63 |
+
with InferenceContext():
|
64 |
+
predictions: torch.Tensor = model.generate(
|
65 |
+
wav_input=waves, temperature=temperature, eos_p=eos_p, max_length=max_length
|
66 |
+
)
|
67 |
+
|
68 |
+
return predictions
|
69 |
+
|
70 |
+
|
71 |
+
def create_bark_prompt(
|
72 |
+
audio_file: AudioFile, temperature: float, eos_p: float, device: torch.device
|
73 |
+
) -> BarkPrompt:
|
74 |
+
"""
|
75 |
+
Turn raw audio into valid BARK prompt. When given a raw audio file, use this function
|
76 |
+
to generate a valid BARK prompt
|
77 |
+
"""
|
78 |
+
# Read the audio
|
79 |
+
raw_audio = read_audio_file(
|
80 |
+
path=audio_file.audio_file_path,
|
81 |
+
target_sample_rate=HUBERT_SAMPLE_RATE,
|
82 |
+
channels=1,
|
83 |
+
max_duration=15,
|
84 |
+
)
|
85 |
+
|
86 |
+
audio_tensor = torch.tensor(raw_audio.astype(np.float32), device=device)
|
87 |
+
# Generate semantic tokens from audio using HuBERT
|
88 |
+
semantic_tokens: torch.Tensor = generate_semantic_tokens_from_hubert(
|
89 |
+
waves=audio_tensor.unsqueeze(0),
|
90 |
+
audio_sample_rate=16000,
|
91 |
+
temperature=temperature,
|
92 |
+
eos_p=eos_p,
|
93 |
+
max_length=600,
|
94 |
+
device=device,
|
95 |
+
)
|
96 |
+
|
97 |
+
# Generate codebook tokens using EnCodec
|
98 |
+
codes = encodec_encode_audio(
|
99 |
+
audio_sample=torch.from_numpy(raw_audio[None]),
|
100 |
+
audio_sample_rate=HUBERT_SAMPLE_RATE,
|
101 |
+
)
|
102 |
+
|
103 |
+
# Assuming codes has shape [num_codebooks, T], typically 8 codebooks for 24kHz
|
104 |
+
return BarkPrompt(semantic_tokens, codes[:2, :], codes[:, :])
|
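The last line of create_bark_prompt encodes the convention that the coarse history is just the first two EnCodec codebooks while the fine history keeps all eight; a tiny sketch with random codes standing in for real EnCodec output (the sizes are illustrative):

import torch

codes = torch.randint(0, 1024, (8, 750), dtype=torch.int32)   # (n_codebooks, T)
coarse_prompt, fine_prompt = codes[:2, :], codes
print(coarse_prompt.shape, fine_prompt.shape)                 # torch.Size([2, 750]) torch.Size([8, 750])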
core/data_model/__init__.py
ADDED
@@ -0,0 +1 @@
1 |
+
from core.data_model.bark import *
|
core/data_model/bark.py
ADDED
@@ -0,0 +1,337 @@
1 |
+
import os
|
2 |
+
import json
|
3 |
+
from pathlib import Path
|
4 |
+
import torch
|
5 |
+
|
6 |
+
from dataclasses import dataclass, asdict, fields
|
7 |
+
import numpy as np
|
8 |
+
from enum import Enum
|
9 |
+
from pydantic import BaseModel, Field
|
10 |
+
from typing import Optional, Union, List, Literal
|
11 |
+
from datetime import datetime
|
12 |
+
from core.utils import save_audio_file, read_audio_file
|
13 |
+
|
14 |
+
|
15 |
+
@dataclass
|
16 |
+
class BarkGenerationConfig:
|
17 |
+
semantic_top_k: Union[int, None] = 1000 # a tenth of the semantic vocab size
|
18 |
+
coarse_top_k: Union[int, None] = 100 # a tenth of the coarse codebook size
|
19 |
+
semantic_top_p: Union[int, None] = None
|
20 |
+
coarse_top_p: Union[int, None] = None
|
21 |
+
min_eos_p: float = 0.5
|
22 |
+
max_gen_duration_second: Union[float, None] = None
|
23 |
+
allow_early_stop: bool = True
|
24 |
+
use_kv_caching: bool = True
|
25 |
+
max_coarse_history: int = 630
|
26 |
+
sliding_window_length: int = 60
|
27 |
+
max_token_per_example: int = 256
|
28 |
+
# set to None to use argmax sampling
|
29 |
+
temperature: float = 0.6
|
30 |
+
generate_coarse_temperature: float = 0.6
|
31 |
+
# set this to None if you want to use argmax to generate fine token
|
32 |
+
generate_fine_temperature: float = 0.6
|
33 |
+
use_small_model: bool = True
|
34 |
+
|
35 |
+
def __init__(self, **kwargs):
|
36 |
+
# Get field names from dataclass
|
37 |
+
valid_fields = {f.name for f in fields(self)}
|
38 |
+
# Set only known fields
|
39 |
+
for key, value in kwargs.items():
|
40 |
+
if key in valid_fields:
|
41 |
+
setattr(self, key, value)
|
42 |
+
|
43 |
+
def to_dict(self) -> dict:
|
44 |
+
return asdict(self)
|
45 |
+
|
46 |
+
@classmethod
|
47 |
+
def from_dict(cls, data: dict) -> "BarkGenerationConfig":
|
48 |
+
return cls(**data)
|
49 |
+
|
50 |
+
|
51 |
+
@dataclass
|
52 |
+
class BarkPrompt:
|
53 |
+
"""
|
54 |
+
semantic_prompt shape: (T)
|
55 |
+
coarse_prompt shape: (2, T)
|
56 |
+
fine_prompt shape: (8, T)
|
57 |
+
these T values differ because each token type has a different rate of tokens per second
|
58 |
+
"""
|
59 |
+
|
60 |
+
semantic_prompt: torch.Tensor
|
61 |
+
coarse_prompt: torch.Tensor
|
62 |
+
fine_prompt: torch.Tensor
|
63 |
+
|
64 |
+
def save_prompt(self, file_path: str) -> bool:
|
65 |
+
"""
|
66 |
+
Save all 3 prompts to disk as JSON. Return True if success, False if error
|
67 |
+
"""
|
68 |
+
# Ensure the directory exists
|
69 |
+
directory = os.path.dirname(file_path)
|
70 |
+
if directory: # If there's a directory component
|
71 |
+
os.makedirs(directory, exist_ok=True)
|
72 |
+
|
73 |
+
data = {
|
74 |
+
"semantic_prompt": self.semantic_prompt.detach().cpu().tolist(),
|
75 |
+
"coarse_prompt": self.coarse_prompt.detach().cpu().tolist(),
|
76 |
+
"fine_prompt": self.fine_prompt.detach().cpu().tolist(),
|
77 |
+
}
|
78 |
+
|
79 |
+
if not file_path.endswith(".json"):
|
80 |
+
file_path += ".json"
|
81 |
+
|
82 |
+
try:
|
83 |
+
with open(file_path, "w", encoding="utf-8") as f:
|
84 |
+
json.dump(data, f)
|
85 |
+
return True
|
86 |
+
except Exception:
|
87 |
+
return False
|
88 |
+
|
89 |
+
@classmethod
|
90 |
+
def load_prompt(cls, file_path: str, device: torch.device) -> "BarkPrompt":
|
91 |
+
"""
|
92 |
+
Load a prompt from disk. File to load can be either a .json or .npz file
|
93 |
+
"""
|
94 |
+
try:
|
95 |
+
if file_path.endswith(".json"):
|
96 |
+
with open(file_path, "r", encoding="utf-8") as f:
|
97 |
+
prompt = json.load(f)
|
98 |
+
|
99 |
+
assert (
|
100 |
+
"semantic_prompt" in prompt
|
101 |
+
and "coarse_prompt" in prompt
|
102 |
+
and "fine_prompt" in prompt
|
103 |
+
), f"invalid prompt data {prompt}"
|
104 |
+
|
105 |
+
semantic_prompt = torch.tensor(prompt["semantic_prompt"])
|
106 |
+
coarse_prompt = torch.tensor(prompt["coarse_prompt"])
|
107 |
+
fine_prompt = torch.tensor(prompt["fine_prompt"])
|
108 |
+
|
109 |
+
elif file_path.endswith(".npz"):
|
110 |
+
with np.load(file_path) as data:
|
111 |
+
assert (
|
112 |
+
"semantic_prompt" in data
|
113 |
+
and "coarse_prompt" in data
|
114 |
+
and "fine_prompt" in data
|
115 |
+
), f"invalid prompt data in NPZ file"
|
116 |
+
|
117 |
+
semantic_prompt = torch.from_numpy(data["semantic_prompt"])
|
118 |
+
coarse_prompt = torch.from_numpy(data["coarse_prompt"])
|
119 |
+
fine_prompt = torch.from_numpy(data["fine_prompt"])
|
120 |
+
|
121 |
+
else:
|
122 |
+
raise ValueError("Unsupported file format. Use .json or .npz")
|
123 |
+
|
124 |
+
# Convert to device and dtype after loading
|
125 |
+
semantic_prompt = semantic_prompt.to(device=device, dtype=torch.int32)
|
126 |
+
coarse_prompt = coarse_prompt.to(device=device, dtype=torch.int32)
|
127 |
+
fine_prompt = fine_prompt.to(device=device, dtype=torch.int32)
|
128 |
+
|
129 |
+
# Shape checks remain the same
|
130 |
+
if len(semantic_prompt.shape) == 2:
|
131 |
+
semantic_prompt = semantic_prompt[0, :]
|
132 |
+
assert (
|
133 |
+
len(semantic_prompt.shape) == 1
|
134 |
+
), "expecting semantic_prompt as a 1D array"
|
135 |
+
|
136 |
+
assert (
|
137 |
+
coarse_prompt.shape[0] == 2
|
138 |
+
), "expecting coarse_prompt has 2 code book dimension"
|
139 |
+
|
140 |
+
assert (
|
141 |
+
fine_prompt.shape[0] == 8
|
142 |
+
), "expecting fine_prompt has 8 code book dimension"
|
143 |
+
|
144 |
+
return cls(semantic_prompt, coarse_prompt, fine_prompt)
|
145 |
+
|
146 |
+
except Exception as e:
|
147 |
+
raise ValueError(f"Failed to load file: {str(e)}")
|
148 |
+
|
149 |
+
|
150 |
+
class AudioFile(BaseModel):
|
151 |
+
"""Model for validating raw audio prompt inputs."""
|
152 |
+
|
153 |
+
audio_file_path: str = Field(..., description="Path to the audio file")
|
154 |
+
max_duration: int = Field(
|
155 |
+
..., ge=1, description="Maximum duration of the audio in seconds"
|
156 |
+
)
|
157 |
+
|
158 |
+
def get_default_prompt_name(self) -> str:
|
159 |
+
audio_file_name = Path(self.audio_file_path).name
|
160 |
+
return f"{audio_file_name}_{datetime.now().strftime('%Y_%m_%d_%H_%M')}"
|
161 |
+
|
162 |
+
|
163 |
+
class TextToAudioInput(BaseModel):
|
164 |
+
"""Model for validating inputs to the text-to-audio generation function."""
|
165 |
+
|
166 |
+
texts: List[str] = Field(
|
167 |
+
..., min_items=1, description="List of text strings to convert to audio"
|
168 |
+
)
|
169 |
+
audio_prompt: Optional[Union[AudioFile, str]] = Field(
|
170 |
+
None, description="Optional audio prompt (raw or file path)"
|
171 |
+
)
|
172 |
+
sample_rate: int = Field(
|
173 |
+
default=24000, ge=1, description="Sample rate for generated audio"
|
174 |
+
)
|
175 |
+
device: Optional[str] = Field(
|
176 |
+
None, description="Device to use for generation (e.g., 'cuda', 'cpu')"
|
177 |
+
)
|
178 |
+
save_path: str = Field(
|
179 |
+
default="./artifact", description="Directory to save generated audio files"
|
180 |
+
)
|
181 |
+
|
182 |
+
|
183 |
+
class TextToAudioModel(Enum):
|
184 |
+
BARK = "BARK"
|
185 |
+
|
186 |
+
|
187 |
+
@dataclass
|
188 |
+
class WavSemantic:
|
189 |
+
"""
|
190 |
+
An example of a pair (wav, semantic) for training a model to predict semantic from audio
|
191 |
+
"""
|
192 |
+
|
193 |
+
text: str
|
194 |
+
wav: np.ndarray
|
195 |
+
semantic: np.ndarray
|
196 |
+
|
197 |
+
|
198 |
+
@dataclass
|
199 |
+
class WavSemanticDataset:
|
200 |
+
sample_rate: int
|
201 |
+
semantic_generation_config: BarkGenerationConfig
|
202 |
+
bark_model_type: Literal["small", "large"]
|
203 |
+
data: List[WavSemantic]
|
204 |
+
|
205 |
+
def save(self, save_path: str, save_raw_audio: bool) -> None:
|
206 |
+
"""
|
207 |
+
Save this WavSemanticDataset instance to disk at the specified path with compression.
|
208 |
+
|
209 |
+
Args:
|
210 |
+
save_path: Directory path where the dataset will be saved (default: './data').
|
211 |
+
"""
|
212 |
+
# Ensure the save directory exists
|
213 |
+
save_dir = Path(save_path)
|
214 |
+
save_dir.mkdir(parents=True, exist_ok=True)
|
215 |
+
|
216 |
+
# this allows incremental saving, e.g. saving each newly generated batch of data
|
217 |
+
if not os.path.exists(save_dir / "metadata.json"):
|
218 |
+
# Prepare metadata dictionary using instance attributes
|
219 |
+
metadata = {
|
220 |
+
"sample_rate": self.sample_rate,
|
221 |
+
"semantic_generation_config": self.semantic_generation_config.to_dict(),
|
222 |
+
"bark_model_type": self.bark_model_type,
|
223 |
+
}
|
224 |
+
|
225 |
+
# Save metadata as JSON
|
226 |
+
with open(save_dir / "metadata.json", "w") as f:
|
227 |
+
json.dump(metadata, f, indent=2)
|
228 |
+
|
229 |
+
next_index = self._get_latest_saved_file_index(save_path) + 1
|
230 |
+
# Save each WavSemantic sample
|
231 |
+
for i, sample in enumerate(self.data):
|
232 |
+
sample_dir = save_dir / f"sample_{i+next_index}"
|
233 |
+
sample_dir.mkdir(exist_ok=True)
|
234 |
+
|
235 |
+
# Save text
|
236 |
+
with open(sample_dir / "text.txt", "w") as f:
|
237 |
+
f.write(sample.text)
|
238 |
+
|
239 |
+
# Save wav and semantic in a single compressed .npz file
|
240 |
+
if save_raw_audio:
|
241 |
+
save_audio_file(
|
242 |
+
sample.wav, self.sample_rate, str(sample_dir / "audio.wav")
|
243 |
+
)
|
244 |
+
with open(sample_dir / "semantic.json", "w") as f:
|
245 |
+
json.dump(sample.semantic.tolist(), f)
|
246 |
+
else:
|
247 |
+
np.savez_compressed(
|
248 |
+
sample_dir / "data.npz", wav=sample.wav, semantic=sample.semantic
|
249 |
+
)
|
250 |
+
|
251 |
+
@staticmethod
|
252 |
+
def _get_latest_saved_file_index(dataset_path: str) -> int:
|
253 |
+
file_names = os.listdir(dataset_path)
|
254 |
+
file_names.remove("metadata.json")
|
255 |
+
if len(file_names) == 0:
|
256 |
+
return -1
|
257 |
+
|
258 |
+
indices = [
|
259 |
+
int(file_name.split("_")[-1].split(".")[0]) for file_name in file_names
|
260 |
+
]
|
261 |
+
|
262 |
+
return max(indices)
|
263 |
+
|
264 |
+
@classmethod
|
265 |
+
def load(cls, load_path: str, num_samples: int = 5000) -> "WavSemanticDataset":
|
266 |
+
"""
|
267 |
+
Load a WavSemanticDataset from disk at the specified path.
|
268 |
+
|
269 |
+
Args:
|
270 |
+
load_path: Directory path where the dataset is saved.
|
271 |
+
num_samples: maximum number of samples to load from the folder
|
272 |
+
Returns:
|
273 |
+
A new WavSemanticDataset instance loaded from disk.
|
274 |
+
"""
|
275 |
+
load_dir = Path(load_path)
|
276 |
+
if not load_dir.exists():
|
277 |
+
raise FileNotFoundError(f"Directory {load_path} does not exist")
|
278 |
+
|
279 |
+
filenames = os.listdir(load_dir)
|
280 |
+
if len(filenames) == 1:
|
281 |
+
# when there is a folder inside the load_path folder, step into it
|
282 |
+
load_dir = load_dir / filenames[0]
|
283 |
+
filenames = os.listdir(load_dir)
|
284 |
+
|
285 |
+
# Load metadata
|
286 |
+
with open(load_dir / "metadata.json", "r") as f:
|
287 |
+
metadata = json.load(f)
|
288 |
+
|
289 |
+
# Reconstruct semantic_generation_config
|
290 |
+
config = BarkGenerationConfig.from_dict(metadata["semantic_generation_config"])
|
291 |
+
|
292 |
+
# Load each WavSemantic sample
|
293 |
+
data = []
|
294 |
+
for i, filename in enumerate(filenames):
|
295 |
+
if not "sample" in filename:
|
296 |
+
continue
|
297 |
+
sample_dir = load_dir / filename
|
298 |
+
|
299 |
+
# Load text
|
300 |
+
with open(sample_dir / "text.txt", "r") as f:
|
301 |
+
text = f.read()
|
302 |
+
|
303 |
+
# Load compressed wav and semantic from .npz file
|
304 |
+
if os.path.isfile(sample_dir / "data.npz"):
|
305 |
+
with np.load(sample_dir / "data.npz") as npz_data:
|
306 |
+
wav = npz_data["wav"]
|
307 |
+
semantic = npz_data["semantic"]
|
308 |
+
# assuming audio wave file was stored separately from the semantic file
|
309 |
+
else:
|
310 |
+
# assuming "audio.wav" and "semantic.npz" exist in the folder
|
311 |
+
wav = read_audio_file(
|
312 |
+
sample_dir / "audio.wav", metadata["sample_rate"], 1, False, None
|
313 |
+
)
|
314 |
+
if os.path.isfile(sample_dir / "semantic.npz"):
|
315 |
+
with np.load(sample_dir / "semantic.npz") as npz_data:
|
316 |
+
semantic = npz_data["semantic"]
|
317 |
+
elif os.path.isfile(sample_dir / "semantic.json"):
|
318 |
+
with open(sample_dir / "semantic.json") as f:
|
319 |
+
semantic = np.array(json.load(f))
|
320 |
+
|
321 |
+
data.append(WavSemantic(text=text, wav=wav, semantic=semantic))
|
322 |
+
if i > num_samples:
|
323 |
+
break
|
324 |
+
|
325 |
+
# Reconstruct and return the dataset
|
326 |
+
return cls(
|
327 |
+
sample_rate=metadata["sample_rate"],
|
328 |
+
semantic_generation_config=config,
|
329 |
+
bark_model_type=metadata["bark_model_type"],
|
330 |
+
data=data,
|
331 |
+
)
|
332 |
+
|
333 |
+
def __getitem__(self, idx: int) -> WavSemantic:
|
334 |
+
return self.data[idx]
|
335 |
+
|
336 |
+
def __len__(self) -> int:
|
337 |
+
return len(self.data)
|
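A hedged round-trip sketch of the BarkPrompt save/load helpers defined above, assuming the package is importable from the repo root; the tensor sizes and output path are illustrative only:

import torch
from core.data_model.bark import BarkPrompt

semantic = torch.randint(0, 10_000, (300,), dtype=torch.int32)   # (T,)
coarse = torch.randint(0, 1024, (2, 450), dtype=torch.int32)     # (2, T')
fine = torch.randint(0, 1024, (8, 450), dtype=torch.int32)       # (8, T')

prompt = BarkPrompt(semantic, coarse, fine)
assert prompt.save_prompt("./artifact/demo_prompt.json")
restored = BarkPrompt.load_prompt("./artifact/demo_prompt.json", torch.device("cpu"))
print(restored.semantic_prompt.shape, restored.coarse_prompt.shape, restored.fine_prompt.shape)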
core/memory/__init__.py
ADDED
@@ -0,0 +1,5 @@
1 |
+
from core.memory.model_manager import *
|
2 |
+
|
3 |
+
from core.memory.model_manager import *
|
4 |
+
|
5 |
+
from core.memory.common import *
|
core/memory/common.py
ADDED
@@ -0,0 +1,187 @@
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
from enum import Enum
|
5 |
+
from typing import ClassVar, Dict, Any
|
6 |
+
import torch
|
7 |
+
from huggingface_hub import hf_hub_download
|
8 |
+
|
9 |
+
# Configure logging with a default level (will be updated by EnvVars)
|
10 |
+
logging.basicConfig(level=logging.INFO)
|
11 |
+
logger = logging.getLogger(__name__)
|
12 |
+
|
13 |
+
|
14 |
+
class LogLevel(Enum):
|
15 |
+
"""Enumeration of valid logging levels."""
|
16 |
+
|
17 |
+
DEBUG = "DEBUG"
|
18 |
+
INFO = "INFO"
|
19 |
+
WARNING = "WARNING"
|
20 |
+
ERROR = "ERROR"
|
21 |
+
CRITICAL = "CRITICAL"
|
22 |
+
|
23 |
+
|
24 |
+
def grab_best_device(use_gpu: bool, enable_mps: bool) -> str:
|
25 |
+
"""
|
26 |
+
Determine the best available device for PyTorch operations.
|
27 |
+
|
28 |
+
Args:
|
29 |
+
use_gpu (bool): Whether to prioritize GPU/MPS over CPU.
|
30 |
+
enable_mps (bool): Whether to allow MPS (Metal Performance Shaders) on Apple Silicon.
|
31 |
+
|
32 |
+
Returns:
|
33 |
+
str: Device identifier ("cuda", "mps", or "cpu").
|
34 |
+
"""
|
35 |
+
if use_gpu and torch.cuda.is_available():
|
36 |
+
device = "cuda"
|
37 |
+
logger.debug("Selected CUDA device (GPU available)")
|
38 |
+
elif use_gpu and enable_mps and torch.backends.mps.is_available():
|
39 |
+
device = "mps"
|
40 |
+
logger.debug("Selected MPS device (Apple Silicon GPU available)")
|
41 |
+
else:
|
42 |
+
device = "cpu"
|
43 |
+
logger.debug("Selected CPU device (no GPU/MPS available or disabled)")
|
44 |
+
return device
|
45 |
+
|
46 |
+
|
47 |
+
class EnvVars:
|
48 |
+
"""
|
49 |
+
Class to manage and expose environment variables with type safety and runtime configurability.
|
50 |
+
|
51 |
+
Loads variables from a .env file or system environment, applies defaults if not found, and allows updates
|
52 |
+
at runtime. Variables are stored as instance attributes rather than polluting the global namespace.
|
53 |
+
"""
|
54 |
+
|
55 |
+
# Default values for environment variables
|
56 |
+
_DEFAULTS: ClassVar[Dict[str, Any]] = {
|
57 |
+
"GLOBAL_ENABLE_MPS": True, # Enable PyTorch's Metal Performance Shaders on Apple Silicon
|
58 |
+
"AUDIO_SAMPLE_RATE": 24000, # Default sample rate for audio processing (in Hz)
|
59 |
+
"SUNO_USE_SMALL_MODELS": True, # Use smaller Bark models if True
|
60 |
+
"CACHE_DIR": "./models",
|
61 |
+
"LOG_LEVEL": LogLevel.INFO, # Default logging level
|
62 |
+
"USE_GPU": True, # Whether to prioritize GPU/MPS over CPU
|
63 |
+
}
|
64 |
+
|
65 |
+
def __init__(self) -> None:
|
66 |
+
"""Initialize the EnvVars instance and load variables."""
|
67 |
+
self._vars: Dict[str, Any] = {}
|
68 |
+
self._load_env_vars()
|
69 |
+
self._update_attributes()
|
70 |
+
|
71 |
+
def _load_env_vars(self) -> None:
|
72 |
+
"""Load environment variables from .env file or system, falling back to defaults."""
|
73 |
+
load_dotenv() # Load .env file into os.environ
|
74 |
+
for var_name, default_value in self._DEFAULTS.items():
|
75 |
+
value = os.getenv(var_name)
|
76 |
+
if value is None:
|
77 |
+
logger.info(
|
78 |
+
f"{var_name} not found in environment, using default: {default_value}"
|
79 |
+
)
|
80 |
+
self._vars[var_name] = default_value
|
81 |
+
else:
|
82 |
+
# Convert value to the appropriate type based on default
|
83 |
+
if isinstance(default_value, bool):
|
84 |
+
self._vars[var_name] = value.lower() in ("true", "1", "t")
|
85 |
+
elif isinstance(default_value, int):
|
86 |
+
self._vars[var_name] = int(value)
|
87 |
+
elif isinstance(default_value, float):
|
88 |
+
self._vars[var_name] = float(value)
|
89 |
+
elif isinstance(default_value, LogLevel):
|
90 |
+
self._vars[var_name] = LogLevel(value.upper())
|
91 |
+
else:
|
92 |
+
self._vars[var_name] = value
|
93 |
+
logger.info(
|
94 |
+
f"{var_name} loaded from environment: {self._vars[var_name]}"
|
95 |
+
)
|
96 |
+
|
97 |
+
def _update_attributes(self) -> None:
|
98 |
+
"""Update instance attributes and apply settings (e.g., logging level, device)."""
|
99 |
+
# Set instance attributes
|
100 |
+
self.GLOBAL_ENABLE_MPS: bool = self._vars["GLOBAL_ENABLE_MPS"]
|
101 |
+
self.AUDIO_SAMPLE_RATE: int = self._vars["AUDIO_SAMPLE_RATE"]
|
102 |
+
self.SUNO_USE_SMALL_MODELS: bool = self._vars["SUNO_USE_SMALL_MODELS"]
|
103 |
+
self.CACHE_DIR: str = self._vars["CACHE_DIR"]
|
104 |
+
self.LOG_LEVEL: LogLevel = self._vars["LOG_LEVEL"]
|
105 |
+
self.USE_GPU: bool = self._vars["USE_GPU"]
|
106 |
+
self.DEVICE: str = grab_best_device(self.USE_GPU, self.GLOBAL_ENABLE_MPS)
|
107 |
+
logging.getLogger().setLevel(self.LOG_LEVEL.value)
|
108 |
+
|
109 |
+
def update(self, var_name: str, value: Any) -> None:
|
110 |
+
"""
|
111 |
+
Update an environment variable at runtime and reapply settings.
|
112 |
+
|
113 |
+
Args:
|
114 |
+
var_name (str): Name of the variable to update (must be in _DEFAULTS).
|
115 |
+
value (Any): New value for the variable.
|
116 |
+
|
117 |
+
Raises:
|
118 |
+
KeyError: If var_name is not a recognized environment variable.
|
119 |
+
"""
|
120 |
+
if var_name not in self._DEFAULTS:
|
121 |
+
raise KeyError(f"Unknown environment variable: {var_name}")
|
122 |
+
|
123 |
+
# Convert value to the appropriate type based on default
|
124 |
+
default_type = type(self._DEFAULTS[var_name])
|
125 |
+
if default_type is bool:
|
126 |
+
self._vars[var_name] = bool(
|
127 |
+
value.lower() in ("true", "1", "t") if isinstance(value, str) else value
|
128 |
+
)
|
129 |
+
elif default_type is int:
|
130 |
+
self._vars[var_name] = int(value)
|
131 |
+
elif default_type is float:
|
132 |
+
self._vars[var_name] = float(value)
|
133 |
+
elif default_type is LogLevel:
|
134 |
+
self._vars[var_name] = LogLevel(
|
135 |
+
value.upper() if isinstance(value, str) else value
|
136 |
+
)
|
137 |
+
else:
|
138 |
+
self._vars[var_name] = value
|
139 |
+
|
140 |
+
logger.info(f"Updated {var_name} to {self._vars[var_name]}")
|
141 |
+
self._update_attributes()
|
142 |
+
|
143 |
+
|
144 |
+
# Create global instance to access environment variables
|
145 |
+
env = EnvVars()
|
146 |
+
|
147 |
+
|
148 |
+
def get_cached_or_download_model_from_hf(
|
149 |
+
repo_id: str, file_name: str, cache_dir: str = env.CACHE_DIR
|
150 |
+
) -> str:
|
151 |
+
"""
|
152 |
+
Download a model from Hugging Face Hub if not already cached.
|
153 |
+
|
154 |
+
Args:
|
155 |
+
repo_id (str): The repository ID on Hugging Face Hub (e.g., 'suno/bark').
|
156 |
+
file_name (str): The name of the model file to download (e.g., 'text.pt').
|
157 |
+
cache_dir (str): Directory to store cached models (defaults to env.CACHE_DIR).
|
158 |
+
|
159 |
+
Returns:
|
160 |
+
str: The full path to the downloaded or cached model file.
|
161 |
+
|
162 |
+
Raises:
|
163 |
+
OSError: If the cache directory cannot be created.
|
164 |
+
RuntimeError: If the download from Hugging Face fails.
|
165 |
+
"""
|
166 |
+
# Ensure cache directory exists
|
167 |
+
try:
|
168 |
+
os.makedirs(cache_dir, exist_ok=True)
|
169 |
+
except OSError as e:
|
170 |
+
logger.error(f"Failed to create cache directory {cache_dir}: {str(e)}")
|
171 |
+
raise
|
172 |
+
|
173 |
+
# Check if file is already cached
|
174 |
+
cached_path = os.path.join(cache_dir, file_name)
|
175 |
+
if os.path.exists(cached_path):
|
176 |
+
logger.debug(f"Model found in cache: {cached_path}")
|
177 |
+
return cached_path
|
178 |
+
|
179 |
+
# Download from Hugging Face if not cached
|
180 |
+
logger.info(f"Downloading model {repo_id}/{file_name} to {cache_dir}")
|
181 |
+
try:
|
182 |
+
hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=cache_dir)
|
183 |
+
logger.debug(f"Model downloaded successfully to {cached_path}")
|
184 |
+
return cached_path
|
185 |
+
except Exception as e:
|
186 |
+
logger.error(f"Failed to download model {repo_id}/{file_name}: {str(e)}")
|
187 |
+
raise RuntimeError(f"Failed to download model {repo_id}/{file_name}: {str(e)}")
|
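A hedged usage sketch of the caching helper above; network access is assumed, and the repo id and file name simply mirror the examples given in its docstring:

from core.memory.common import get_cached_or_download_model_from_hf

# First call downloads into ./models; later calls return the cached path.
path = get_cached_or_download_model_from_hf(
    repo_id="suno/bark", file_name="text.pt", cache_dir="./models"
)
print(path)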
core/memory/model_manager.py
ADDED
@@ -0,0 +1,289 @@
1 |
+
import psutil
|
2 |
+
import logging
|
3 |
+
from typing import Dict, Optional, Callable, Any, Literal
|
4 |
+
from collections import OrderedDict
|
5 |
+
from threading import Lock
|
6 |
+
import torch
|
7 |
+
from transformers import BertTokenizer
|
8 |
+
from encodec import EncodecModel
|
9 |
+
|
10 |
+
from core.memory.common import get_cached_or_download_model_from_hf, env
|
11 |
+
from core.model.bark import GPTConfig, FineGPTConfig, GPT, FineGPT
|
12 |
+
from core.memory.models import *
|
13 |
+
|
14 |
+
# Configure logging for this module
|
15 |
+
logger = logging.getLogger(__name__)
|
16 |
+
|
17 |
+
|
18 |
+
def clear_cuda_cache() -> None:
|
19 |
+
"""
|
20 |
+
Clear the CUDA memory cache if GPU is available.
|
21 |
+
|
22 |
+
Raises:
|
23 |
+
RuntimeError: If CUDA operations fail unexpectedly.
|
24 |
+
"""
|
25 |
+
if torch.cuda.is_available():
|
26 |
+
try:
|
27 |
+
torch.cuda.empty_cache()
|
28 |
+
torch.cuda.synchronize()
|
29 |
+
logger.debug("CUDA cache cleared successfully")
|
30 |
+
except RuntimeError as e:
|
31 |
+
logger.error(f"Failed to clear CUDA cache: {str(e)}")
|
32 |
+
raise RuntimeError(f"CUDA cache clear failed: {str(e)}")
|
33 |
+
|
34 |
+
|
35 |
+
class ModelManager:
|
36 |
+
"""
|
37 |
+
Manager class for loading, caching, and unloading PyTorch models with memory management.
|
38 |
+
|
39 |
+
Prioritizes GPU memory when available, with an optional `offload_to_cpu` flag to use CPU RAM instead.
|
40 |
+
Uses an LRU (Least Recently Used) cache to keep only the most recently used models in memory.
|
41 |
+
Automatically unloads models when memory usage (GPU or CPU, depending on config) exceeds a threshold
|
42 |
+
or the maximum number of cached models is reached.
|
43 |
+
"""
|
44 |
+
|
45 |
+
def __init__(self, max_models: int = 10, offload_to_cpu: bool = False):
|
46 |
+
"""
|
47 |
+
Initialize the model manager.
|
48 |
+
|
49 |
+
Args:
|
50 |
+
max_models (int): Maximum number of models to keep in memory before unloading (default: 10)
|
51 |
+
offload_to_cpu (bool): If True, use CPU RAM instead of GPU memory (default: False)
|
52 |
+
"""
|
53 |
+
self._models: OrderedDict = OrderedDict() # LRU cache for loaded models
|
54 |
+
self._lock = Lock() # Thread lock for safe concurrent access
|
55 |
+
self._max_models = max_models # Max number of models to cache
|
56 |
+
# Whether to offload models to CPU instead of GPU
|
57 |
+
self._offload_to_cpu = offload_to_cpu
|
58 |
+
self._device = torch.device(env.DEVICE) # Device to load models onto
|
59 |
+
logger.info(f"Model manager initialized with device: {self._device}")
|
60 |
+
|
61 |
+
def _check_memory(self) -> bool:
|
62 |
+
"""
|
63 |
+
Check if current memory usage is below the threshold, focusing on GPU unless offloaded to CPU.
|
64 |
+
|
65 |
+
Returns:
|
66 |
+
bool: True if memory usage is safe, False if it exceeds the threshold
|
67 |
+
"""
|
68 |
+
if self._offload_to_cpu or not torch.cuda.is_available():
|
69 |
+
# Check CPU memory usage
|
70 |
+
mem = psutil.virtual_memory() # System memory stats
|
71 |
+
total_mem_used = mem.used / 1e9 # CPU memory used in GB
|
72 |
+
total_mem_available = mem.total / 1e9 # Total CPU memory in GB
|
73 |
+
else:
|
74 |
+
# Check GPU memory usage
|
75 |
+
total_mem_used = (
|
76 |
+
torch.cuda.memory_allocated() / 1e9
|
77 |
+
) # GPU memory used in GB
|
78 |
+
total_mem_available = (
|
79 |
+
torch.cuda.get_device_properties(0).total_memory / 1e9
|
80 |
+
) # Total GPU memory in GB
|
81 |
+
|
82 |
+
usage_ratio = total_mem_used / total_mem_available
|
83 |
+
logger.debug(
|
84 |
+
f"Memory usage on {self._device}: {usage_ratio:.2%} (threshold: {MEMORY_THRESHOLD})"
|
85 |
+
)
|
86 |
+
return usage_ratio < MEMORY_THRESHOLD
|
87 |
+
|
88 |
+
def _unload_lru_model(self):
|
89 |
+
"""Unload the least recently used model to free memory."""
|
90 |
+
with self._lock:
|
91 |
+
if self._models:
|
92 |
+
# Remove oldest entry
|
93 |
+
model_info, model_instance = self._models.popitem(last=False)
|
94 |
+
logger.info(
|
95 |
+
f"Unloading model {model_info} from {self._device} to free memory"
|
96 |
+
)
|
97 |
+
# Move model to CPU before deletion to ensure GPU memory is freed
|
98 |
+
if not self._offload_to_cpu and torch.cuda.is_available():
|
99 |
+
model_instance.model = model_instance.model.cpu()
|
100 |
+
del model_instance # Explicitly delete reference
|
101 |
+
logger.debug(f"Memory freed from {self._device}")
|
102 |
+
|
103 |
+
def get_model(self, model_info: ModelInfo) -> Model:
|
104 |
+
"""
|
105 |
+
Retrieve or load a model, managing memory constraints on the chosen device (GPU or CPU).
|
106 |
+
|
107 |
+
Args:
|
108 |
+
model_info (ModelInfo): Metadata for the model to load
|
109 |
+
|
110 |
+
Returns:
|
111 |
+
Model: The loaded model instance with config and preprocessor
|
112 |
+
|
113 |
+
Raises:
|
114 |
+
ValueError: If model_info is invalid
|
115 |
+
"""
|
116 |
+
assert isinstance(
|
117 |
+
model_info, ModelInfo
|
118 |
+
), f"invalid model_info type {type(model_info)}"
|
119 |
+
with self._lock:
|
120 |
+
# If model is already loaded, move it to the end (most recently used) and return it
|
121 |
+
if model_info in self._models:
|
122 |
+
self._models.move_to_end(model_info)
|
123 |
+
return self._models[model_info]
|
124 |
+
|
125 |
+
# Ensure memory is available by unloading models if necessary
|
126 |
+
while not self._check_memory() or len(self._models) >= self._max_models:
|
127 |
+
self._unload_lru_model()
|
128 |
+
|
129 |
+
if model_info.load_model is not None:
|
130 |
+
model = model_info.load_model(model_info, torch.device(env.DEVICE))
|
131 |
+
elif model_info.checkpoint_name is not None:
|
132 |
+
model = load_transformers_model(model_info, self._device)
|
133 |
+
elif model_info.repo_id is not None and model_info.file_name is not None:
|
134 |
+
model_file_path = get_cached_or_download_model_from_hf(
|
135 |
+
repo_id=model_info.repo_id, file_name=model_info.file_name
|
136 |
+
)
|
137 |
+
model = load_model_from_file(model_info, model_file_path, self._device)
|
138 |
+
else:
|
139 |
+
raise ValueError(
|
140 |
+
"Invalid model info: must provide checkpoint_name or repo_id/file_name"
|
141 |
+
)
|
142 |
+
|
143 |
+
# Cache the loaded model
|
144 |
+
self._models[model_info] = model
|
145 |
+
clear_cuda_cache()
|
146 |
+
logger.info(f"Loaded and cached model {model_info} on {self._device}")
|
147 |
+
return model
|
148 |
+
|
149 |
+
def unload_model(self, model_info: ModelInfo):
|
150 |
+
"""
|
151 |
+
Manually unload a specific model from memory.
|
152 |
+
|
153 |
+
Args:
|
154 |
+
model_info (ModelInfo): Metadata of the model to unload
|
155 |
+
"""
|
156 |
+
with self._lock:
|
157 |
+
if model_info in self._models:
|
158 |
+
model_instance = self._models[model_info]
|
159 |
+
# Move model to CPU before deletion if on GPU
|
160 |
+
if not self._offload_to_cpu and torch.cuda.is_available():
|
161 |
+
model_instance.model = model_instance.model.cpu()
|
162 |
+
del self._models[model_info]
|
163 |
+
logger.info(f"Manually unloaded model {model_info} from {self._device}")
|
164 |
+
|
165 |
+
|
166 |
+
def load_model_from_file(
|
167 |
+
model_info: ModelInfo, model_file_path: str, device: torch.device
|
168 |
+
) -> Model:
|
169 |
+
"""
|
170 |
+
Load a model from a file (e.g., custom weights from Hugging Face).
|
171 |
+
|
172 |
+
Args:
|
173 |
+
model_info (ModelInfo): Metadata for the model
|
174 |
+
model_file_path (str): Path to the model weights file
|
175 |
+
device (torch.device): Device to load the model onto (CPU or GPU)
|
176 |
+
|
177 |
+
Returns:
|
178 |
+
Model: Loaded model instance
|
179 |
+
"""
|
180 |
+
if model_info.repo_id == "suno/bark":
|
181 |
+
return load_bark_model(model_info, model_file_path, device)
|
182 |
+
if model_info.model_type == "custom_hubert_tokenizer":
|
183 |
+
return load_custom_hubert_tokenizer(model_info, model_file_path, device)
|
184 |
+
raise ValueError(f"Unknown how to load model {model_info}")
|
185 |
+
|
186 |
+
|
187 |
+
# temporarily disabled: the custom HuBERT tokenizer is not loaded for now
|
188 |
+
def load_custom_hubert_tokenizer(
|
189 |
+
model_info: ModelInfo, model_file_path: str, device: torch.device
|
190 |
+
) -> Model:
|
191 |
+
# Automatically uses the right layers
|
192 |
+
# tokenizer = HuBERTForBarkSemantic.load_from_checkpoint(
|
193 |
+
# model_file_path, torch.device(env.DEVICE)
|
194 |
+
# ).to(device)
|
195 |
+
|
196 |
+
# return Model(model=tokenizer)
|
197 |
+
return Model(model=None)
|
198 |
+
|
199 |
+
|
200 |
+
def load_transformers_model(model_info: ModelInfo, device: torch.device) -> Model:
|
201 |
+
"""
|
202 |
+
Load a model using Hugging Face's transformers library.
|
203 |
+
|
204 |
+
Args:
|
205 |
+
model_info (ModelInfo): Metadata for the model
|
206 |
+
device (torch.device): Device to load the model onto (CPU or GPU)
|
207 |
+
|
208 |
+
Returns:
|
209 |
+
Model: Loaded model instance
|
210 |
+
"""
|
211 |
+
if model_info.checkpoint_name == "facebook/encodec_24khz":
|
212 |
+
model = EncodecModel.encodec_model_24khz()
|
213 |
+
model.set_target_bandwidth(6.0)  # a bare encode() call would fail here; configure the bandwidth as in _load_encodec_model
|
214 |
+
model = model.to(device)
|
215 |
+
return Model(model)
|
216 |
+
raise NotImplementedError("Only Encodec 24k supported for now")
|
217 |
+
|
218 |
+
|
219 |
+
def load_bark_model(
|
220 |
+
model_info: ModelInfo, model_file_path: str, device: torch.device
|
221 |
+
) -> Model:
|
222 |
+
"""
|
223 |
+
Load a Bark model from a file.
|
224 |
+
|
225 |
+
Args:
|
226 |
+
model_info (ModelInfo): Metadata for the Bark model
|
227 |
+
model_file_path (str): Path to the model weights file
|
228 |
+
device (torch.device): Device to load the model onto (CPU or GPU)
|
229 |
+
|
230 |
+
Returns:
|
231 |
+
Model: Loaded Bark model instance with config and optional tokenizer
|
232 |
+
"""
|
233 |
+
# Load checkpoint directly to the specified device
|
234 |
+
# weights_only=False is acceptable only because the checkpoint comes from a trusted source
|
235 |
+
checkpoint = torch.load(model_file_path, map_location=device, weights_only=False)
|
236 |
+
ConfigClass, ModelClass = (
|
237 |
+
(GPTConfig, GPT)
|
238 |
+
if model_info.model_type in ["text", "coarse"]
|
239 |
+
else (FineGPTConfig, FineGPT)
|
240 |
+
)
|
241 |
+
|
242 |
+
model_args = preprocess_model_args(checkpoint["model_args"])
|
243 |
+
|
244 |
+
conf = ConfigClass(**model_args)
|
245 |
+
model = ModelClass(conf)
|
246 |
+
state_dict = _update_bark_state_dict(model, checkpoint["model"])
|
247 |
+
model.load_state_dict(state_dict, strict=False)
|
248 |
+
|
249 |
+
model = model.to(device) # Ensure model is on the correct device
|
250 |
+
model.eval()
|
251 |
+
logger.info(f"Loaded Bark model: {model_info} on {device}")
|
252 |
+
|
253 |
+
# Add tokenizer for text models (tokenizer stays on CPU as it doesn't require GPU)
|
254 |
+
preprocessor = (
|
255 |
+
BertTokenizer.from_pretrained("bert-base-multilingual-cased")
|
256 |
+
if model_info.model_type == "text"
|
257 |
+
else None
|
258 |
+
)
|
259 |
+
return Model(model, conf, preprocessor)
|
260 |
+
|
261 |
+
|
262 |
+
def preprocess_model_args(model_args: dict) -> dict:
|
263 |
+
if "input_vocab_size" not in model_args:
|
264 |
+
model_args["input_vocab_size"] = model_args["vocab_size"]
|
265 |
+
model_args["output_vocab_size"] = model_args["vocab_size"]
|
266 |
+
del model_args["vocab_size"]
|
267 |
+
return model_args
|
268 |
+
|
269 |
+
|
270 |
+
def _update_bark_state_dict(model: GPT, state_dict: Dict[str, Any]) -> Dict[str, Any]:
|
271 |
+
"""
|
272 |
+
Update the state dictionary by removing unwanted prefixes (specific to Bark models).
|
273 |
+
|
274 |
+
Args:
|
275 |
+
model (GPT): The model instance to align the state dict with
|
276 |
+
state_dict (Dict[str, Any]): The loaded state dictionary
|
277 |
+
|
278 |
+
Returns:
|
279 |
+
Dict[str, Any]: Updated state dictionary
|
280 |
+
"""
|
281 |
+
unwanted_prefix = "_orig_mod."
|
282 |
+
for key in list(state_dict.keys()):
|
283 |
+
if key.startswith(unwanted_prefix):
|
284 |
+
state_dict[key[len(unwanted_prefix) :]] = state_dict.pop(key)
|
285 |
+
return state_dict
|
286 |
+
|
287 |
+
|
288 |
+
# Instantiate the global model manager with default GPU priority
|
289 |
+
model_manager = ModelManager(offload_to_cpu=False if env.USE_GPU else True)
|
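A short usage sketch for the manager (illustrative only; it downloads the Bark text checkpoint on first use and returns the LRU-cached instance afterwards):

from core.memory.model_manager import model_manager
from core.memory.models import ModelEnum

bark_text = model_manager.get_model(ModelEnum.BARK_TEXT.value)
tokenizer, gpt = bark_text.preprocessor, bark_text.model  # BertTokenizer + GPT for the "text" stage

# Unload explicitly once the model is no longer needed.
model_manager.unload_model(ModelEnum.BARK_TEXT.value)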
core/memory/models.py
ADDED
@@ -0,0 +1,169 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import logging
|
4 |
+
from dataclasses import asdict
|
5 |
+
from typing_extensions import Optional, Callable
|
6 |
+
from dataclasses import dataclass
|
7 |
+
from enum import Enum
|
8 |
+
from transformers import BertTokenizer
|
9 |
+
from encodec import EncodecModel
|
10 |
+
|
11 |
+
import torch
|
12 |
+
from core.model.bark import GPT
|
13 |
+
from core.memory.common import env
|
14 |
+
from core.utils import download_file_from_hf
|
15 |
+
from core.model.hubert import HuBERTForBarkSemantic, HubertForBarkSemanticConfig
|
16 |
+
|
17 |
+
logging.basicConfig(
|
18 |
+
level=logging.INFO,
|
19 |
+
format="%(asctime)s - %(levelname)s - %(message)s",
|
20 |
+
handlers=[logging.StreamHandler(sys.stdout)],
|
21 |
+
)
|
22 |
+
logger = logging.getLogger(__name__)
|
23 |
+
|
24 |
+
# Memory usage threshold (as a fraction of total memory) that triggers unloading of models
|
25 |
+
# 90% of available memory; applies to GPU unless offloaded to CPU
|
26 |
+
MEMORY_THRESHOLD = 0.9
|
27 |
+
|
28 |
+
|
29 |
+
@dataclass(frozen=True)
|
30 |
+
class ModelInfo:
|
31 |
+
"""Data structure to hold metadata about a model."""
|
32 |
+
|
33 |
+
# Hugging Face repository ID (e.g., "suno/bark")
|
34 |
+
repo_id: Optional[str] = None
|
35 |
+
# Filename of the model weights (e.g., "text.pt")
|
36 |
+
file_name: Optional[str] = None
|
37 |
+
# Pretrained checkpoint name (e.g., "facebook/encodec_24khz")
|
38 |
+
checkpoint_name: Optional[str] = None
|
39 |
+
# Configuration class for the model
|
40 |
+
config_class: Optional[type] = None
|
41 |
+
# Model class to instantiate
|
42 |
+
model_class: Optional[type] = None
|
43 |
+
# Preprocessor class (e.g., tokenizer)
|
44 |
+
preprocessor_class: Optional[type] = None
|
45 |
+
# Type of model (e.g., "text", "coarse", "encodec")
|
46 |
+
model_type: Optional[str] = None
|
47 |
+
# define the function that load the model
|
48 |
+
load_model: Optional[Callable] = None
|
49 |
+
|
50 |
+
|
51 |
+
@dataclass
|
52 |
+
class Model:
|
53 |
+
"""Container for a loaded model, its configuration, and preprocessor."""
|
54 |
+
|
55 |
+
model: Callable # The PyTorch model instance
|
56 |
+
config: Optional[Callable] = None # Model configuration object
|
57 |
+
# Preprocessor (e.g., tokenizer for text models)
|
58 |
+
preprocessor: Optional[Callable] = None
|
59 |
+
|
60 |
+
|
61 |
+
def _load_encodec_model(model_info: ModelInfo, device: torch.device) -> Model:
|
62 |
+
model = EncodecModel.encodec_model_24khz()
|
63 |
+
model.set_target_bandwidth(6.0)
|
64 |
+
model.eval()
|
65 |
+
model.to(device)
|
66 |
+
return Model(model)
|
67 |
+
|
68 |
+
|
69 |
+
def _load_hubert_base_for_bark_semantic(
|
70 |
+
model_info: ModelInfo, device: torch.device
|
71 |
+
) -> "Model":
|
72 |
+
os.makedirs(env.CACHE_DIR, exist_ok=True)
|
73 |
+
local_file_path = os.path.join(env.CACHE_DIR, model_info.file_name)
|
74 |
+
if not os.path.isfile(local_file_path):
|
75 |
+
logger.info(
|
76 |
+
f"Downloading {model_info.file_name} model from {model_info.repo_id}"
|
77 |
+
)
|
78 |
+
download_file_from_hf(
|
79 |
+
model_info.repo_id, "model", model_info.file_name, env.CACHE_DIR
|
80 |
+
)
|
81 |
+
|
82 |
+
checkpoint = torch.load(local_file_path, map_location=device)
|
83 |
+
|
84 |
+
assert isinstance(
|
85 |
+
checkpoint, dict
|
86 |
+
), "expecting a dictionary, got {type(checkpoint)}"
|
87 |
+
|
88 |
+
state_dict = checkpoint.get("model_state_dict", None)
|
89 |
+
assert (
|
90 |
+
state_dict is not None
|
91 |
+
), f"model_state_dict not in checkpoint, {checkpoint.keys()}"
|
92 |
+
|
93 |
+
model_config = checkpoint.get("config", None)
|
94 |
+
assert model_config is not None, "not found model config in checkpoint"
|
95 |
+
|
96 |
+
config = HubertForBarkSemanticConfig(**model_config)
|
97 |
+
model = HuBERTForBarkSemantic(
|
98 |
+
config=config, load_hubert_pretrained_weights=False, device=device
|
99 |
+
)
|
100 |
+
model.load_state_dict(state_dict=state_dict, strict=True)
|
101 |
+
|
102 |
+
return Model(model=model, config=config, preprocessor=None)
|
103 |
+
|
104 |
+
|
105 |
+
# TODO: refactor this class, each ModelInfo should have its own _load_model function for consistency
|
106 |
+
# and avoid complicated if-else paths
|
107 |
+
class ModelEnum(Enum):
|
108 |
+
"""
|
109 |
+
Enumeration of supported models with their metadata.
|
110 |
+
Each entry maps to a ModelInfo object defining how to load the model.
|
111 |
+
"""
|
112 |
+
|
113 |
+
BARK_TEXT_SMALL = ModelInfo(
|
114 |
+
repo_id="suno/bark",
|
115 |
+
file_name="text.pt",
|
116 |
+
model_type="text",
|
117 |
+
model_class=GPT,
|
118 |
+
preprocessor_class=BertTokenizer,
|
119 |
+
)
|
120 |
+
BARK_COARSE_SMALL = ModelInfo(
|
121 |
+
repo_id="suno/bark", file_name="coarse.pt", model_type="coarse"
|
122 |
+
)
|
123 |
+
BARK_FINE_SMALL = ModelInfo(
|
124 |
+
repo_id="suno/bark", file_name="fine.pt", model_type="fine"
|
125 |
+
)
|
126 |
+
|
127 |
+
BARK_TEXT = ModelInfo(repo_id="suno/bark", file_name="text_2.pt", model_type="text")
|
128 |
+
BARK_COARSE = ModelInfo(
|
129 |
+
repo_id="suno/bark", file_name="coarse_2.pt", model_type="coarse"
|
130 |
+
)
|
131 |
+
BARK_FINE = ModelInfo(repo_id="suno/bark", file_name="fine_2.pt", model_type="fine")
|
132 |
+
|
133 |
+
CustomHuBERTTokenizer = ModelInfo(
|
134 |
+
repo_id="GitMylo/bark-voice-cloning",
|
135 |
+
file_name="quantifier_hubert_base_ls960_14.pth",
|
136 |
+
model_type="custom_hubert_tokenizer",
|
137 |
+
)
|
138 |
+
|
139 |
+
ENCODEC24k = ModelInfo(
|
140 |
+
checkpoint_name="facebook/encodec_24khz",
|
141 |
+
model_type="encodec",
|
142 |
+
load_model=_load_encodec_model,
|
143 |
+
)
|
144 |
+
|
145 |
+
HuBERTBaseForBarkSemantic = ModelInfo(
|
146 |
+
checkpoint_name="facebook/hubert-base-ls960",
|
147 |
+
repo_id="sleeper371/hubert-for-bark-semantic",
|
148 |
+
file_name="hubert_epoch_30_2025_04_06_03_23_eval_loss_0.5520355800787607_acc_0.8344086021505376.pt",
|
149 |
+
load_model=_load_hubert_base_for_bark_semantic,
|
150 |
+
)
|
151 |
+
|
152 |
+
@classmethod
|
153 |
+
def get_model_info(cls, model_name: str) -> ModelInfo:
|
154 |
+
"""
|
155 |
+
Retrieve ModelInfo for a given model name.
|
156 |
+
|
157 |
+
Args:
|
158 |
+
model_name (str): Name of the model (e.g., "BARK_TEXT_SMALL")
|
159 |
+
|
160 |
+
Returns:
|
161 |
+
ModelInfo: Metadata for the requested model
|
162 |
+
|
163 |
+
Raises:
|
164 |
+
ValueError: If the model name is not recognized
|
165 |
+
"""
|
166 |
+
try:
|
167 |
+
return cls[model_name].value
|
168 |
+
except KeyError:
|
169 |
+
raise ValueError(f"Unknown model name: {model_name}")
|
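A small sketch of the enum lookup (names as defined above; illustrative only):

from core.memory.models import ModelEnum

info = ModelEnum.get_model_info("BARK_COARSE")
assert info.repo_id == "suno/bark" and info.file_name == "coarse_2.pt"

# Unknown names raise ValueError rather than KeyError.
try:
    ModelEnum.get_model_info("NOT_A_MODEL")
except ValueError as err:
    print(err)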
core/model/__init__.py
ADDED
@@ -0,0 +1 @@
1 |
+
from core.model.bark import *
|
core/model/bark.py
ADDED
@@ -0,0 +1,425 @@
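Since forward() returns logits only for the last position plus an optional KV cache, generation is driven by an external loop. A minimal sketch of that loop with small, assumed hyperparameters and untrained weights (greedy decoding on random tokens, no Bark-specific pre/post-processing):

import torch
from core.model.bark import GPT, GPTConfig

config = GPTConfig(n_layer=2, n_head=2, n_embd=64, block_size=128)
model = GPT(config).eval()

idx = torch.randint(0, config.input_vocab_size, (1, 8))  # prompt tokens
past_kv = None
with torch.no_grad():
    for _ in range(16):
        # After the first step, feed only the newest token together with the cached keys/values.
        inp = idx if past_kv is None else idx[:, -1:]
        logits, past_kv = model(inp, past_kv=past_kv, use_cache=True)
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        idx = torch.cat([idx, next_token], dim=1)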
1 |
+
"""
|
2 |
+
codes adapted from https://github.com/suno-ai/bark
|
3 |
+
"""
|
4 |
+
|
5 |
+
import math
|
6 |
+
from dataclasses import dataclass
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
from torch.nn import functional as F
|
11 |
+
|
12 |
+
|
13 |
+
@dataclass
|
14 |
+
class GPTConfig:
|
15 |
+
block_size: int = 1024
|
16 |
+
input_vocab_size: int = 10_048
|
17 |
+
output_vocab_size: int = 10_048
|
18 |
+
n_layer: int = 12
|
19 |
+
n_head: int = 12
|
20 |
+
n_embd: int = 768
|
21 |
+
dropout: float = 0.0
|
22 |
+
bias: bool = (
|
23 |
+
True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
|
24 |
+
)
|
25 |
+
|
26 |
+
|
27 |
+
@dataclass
|
28 |
+
class FineGPTConfig(GPTConfig):
|
29 |
+
n_codes_total: int = 8
|
30 |
+
n_codes_given: int = 1
|
31 |
+
|
32 |
+
|
33 |
+
class LayerNorm(nn.Module):
|
34 |
+
"""LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""
|
35 |
+
|
36 |
+
def __init__(self, ndim: int, bias: bool) -> None:
|
37 |
+
super().__init__()
|
38 |
+
self.weight = nn.Parameter(torch.ones(ndim))
|
39 |
+
self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
|
40 |
+
|
41 |
+
def forward(self, input):
|
42 |
+
return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
|
43 |
+
|
44 |
+
|
45 |
+
class MLP(nn.Module):
|
46 |
+
|
47 |
+
def __init__(self, config: GPTConfig):
|
48 |
+
super().__init__()
|
49 |
+
self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
|
50 |
+
self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
|
51 |
+
self.dropout = nn.Dropout(config.dropout)
|
52 |
+
self.gelu = nn.GELU()
|
53 |
+
|
54 |
+
def forward(self, x) -> torch.Tensor:
|
55 |
+
x = self.c_fc(x)
|
56 |
+
x = self.gelu(x)
|
57 |
+
x = self.c_proj(x)
|
58 |
+
x = self.dropout(x)
|
59 |
+
return x
|
60 |
+
|
61 |
+
|
62 |
+
class CausalSelfAttention(nn.Module):
|
63 |
+
def __init__(self, config: GPTConfig) -> None:
|
64 |
+
super().__init__()
|
65 |
+
assert config.n_embd % config.n_head == 0
|
66 |
+
|
67 |
+
# key, query, value projections for all heads, but in a batch
|
68 |
+
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
|
69 |
+
# output projection
|
70 |
+
self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
|
71 |
+
# regularization
|
72 |
+
self.attn_dropout = nn.Dropout(config.dropout)
|
73 |
+
self.resid_dropout = nn.Dropout(config.dropout)
|
74 |
+
self.n_head = config.n_head
|
75 |
+
self.n_embd = config.n_embd
|
76 |
+
self.dropout = config.dropout
|
77 |
+
# flash attention makes the GPU go brrrrr, but it requires PyTorch >= 2.0
|
78 |
+
self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention")
|
79 |
+
if not self.flash:
|
80 |
+
# print("WARNING: using slow attention. Flash Attention atm needs PyTorch nightly and dropout=0.0")
|
81 |
+
# causal mask to ensure that attention is only applied to the left in the input sequence
|
82 |
+
self.register_buffer(
|
83 |
+
"bias",
|
84 |
+
torch.tril(torch.ones(config.block_size, config.block_size)).view(
|
85 |
+
1, 1, config.block_size, config.block_size
|
86 |
+
),
|
87 |
+
)
|
88 |
+
|
89 |
+
def forward(
|
90 |
+
self, x: torch.Tensor, past_kv: torch.Tensor = None, use_cache: bool = False
|
91 |
+
):
|
92 |
+
B, T, C = (
|
93 |
+
x.size()
|
94 |
+
) # batch size, sequence length, embedding dimensionality (n_embd)
|
95 |
+
|
96 |
+
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
|
97 |
+
q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
|
98 |
+
k = k.view(B, T, self.n_head, C // self.n_head).transpose(
|
99 |
+
1, 2
|
100 |
+
) # (B, nh, T, hs)
|
101 |
+
q = q.view(B, T, self.n_head, C // self.n_head).transpose(
|
102 |
+
1, 2
|
103 |
+
) # (B, nh, T, hs)
|
104 |
+
v = v.view(B, T, self.n_head, C // self.n_head).transpose(
|
105 |
+
1, 2
|
106 |
+
) # (B, nh, T, hs)
|
107 |
+
|
108 |
+
if past_kv is not None:
|
109 |
+
past_key = past_kv[0]
|
110 |
+
past_value = past_kv[1]
|
111 |
+
k = torch.cat((past_key, k), dim=-2)
|
112 |
+
v = torch.cat((past_value, v), dim=-2)
|
113 |
+
|
114 |
+
FULL_T = k.shape[-2]
|
115 |
+
|
116 |
+
if use_cache is True:
|
117 |
+
present = (k, v)
|
118 |
+
else:
|
119 |
+
present = None
|
120 |
+
|
121 |
+
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
|
122 |
+
if self.flash:
|
123 |
+
# efficient attention using Flash Attention CUDA kernels
|
124 |
+
if past_kv is not None:
|
125 |
+
# When `past_kv` is provided, we're doing incremental decoding and `q.shape[2] == 1`: q only contains
|
126 |
+
# the query for the last token. scaled_dot_product_attention interprets this as the first token in the
|
127 |
+
# sequence, so if is_causal=True it will mask out all attention from it. This is not what we want, so
|
128 |
+
# to work around this we set is_causal=False.
|
129 |
+
is_causal = False
|
130 |
+
else:
|
131 |
+
is_causal = True
|
132 |
+
|
133 |
+
y = torch.nn.functional.scaled_dot_product_attention(
|
134 |
+
q, k, v, dropout_p=self.dropout, is_causal=is_causal
|
135 |
+
)
|
136 |
+
else:
|
137 |
+
# manual implementation of attention
|
138 |
+
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
|
139 |
+
att = att.masked_fill(
|
140 |
+
self.bias[:, :, FULL_T - T : FULL_T, :FULL_T] == 0, float("-inf")
|
141 |
+
)
|
142 |
+
att = F.softmax(att, dim=-1)
|
143 |
+
att = self.attn_dropout(att)
|
144 |
+
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
|
145 |
+
y = (
|
146 |
+
y.transpose(1, 2).contiguous().view(B, T, C)
|
147 |
+
) # re-assemble all head outputs side by side
|
148 |
+
|
149 |
+
# output projection
|
150 |
+
y = self.resid_dropout(self.c_proj(y))
|
151 |
+
return (y, present)
|
152 |
+
|
153 |
+
|
154 |
+
class Block(nn.Module):
|
155 |
+
|
156 |
+
def __init__(self, config: GPTConfig, layer_idx: int) -> None:
|
157 |
+
super().__init__()
|
158 |
+
self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
|
159 |
+
self.attn = CausalSelfAttention(config)
|
160 |
+
self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
|
161 |
+
self.mlp = MLP(config)
|
162 |
+
self.layer_idx = layer_idx
|
163 |
+
|
164 |
+
def forward(
|
165 |
+
self, x: torch.Tensor, past_kv: torch.Tensor = None, use_cache: bool = False
|
166 |
+
):
|
167 |
+
attn_output, prev_kvs = self.attn(
|
168 |
+
self.ln_1(x), past_kv=past_kv, use_cache=use_cache
|
169 |
+
)
|
170 |
+
x = x + attn_output
|
171 |
+
x = x + self.mlp(self.ln_2(x))
|
172 |
+
return (x, prev_kvs)
|
173 |
+
|
174 |
+
|
175 |
+
class GPT(nn.Module):
|
176 |
+
def __init__(self, config: GPTConfig):
|
177 |
+
super().__init__()
|
178 |
+
assert config.input_vocab_size is not None
|
179 |
+
assert config.output_vocab_size is not None
|
180 |
+
assert config.block_size is not None
|
181 |
+
self.config = config
|
182 |
+
|
183 |
+
self.transformer = nn.ModuleDict(
|
184 |
+
dict(
|
185 |
+
wte=nn.Embedding(config.input_vocab_size, config.n_embd),
|
186 |
+
wpe=nn.Embedding(config.block_size, config.n_embd),
|
187 |
+
drop=nn.Dropout(config.dropout),
|
188 |
+
h=nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]),
|
189 |
+
ln_f=LayerNorm(config.n_embd, bias=config.bias),
|
190 |
+
)
|
191 |
+
)
|
192 |
+
self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
|
193 |
+
# Note: lm_head lacks bias, implying parameter sharing with wte for efficiency
|
194 |
+
|
195 |
+
def get_num_params(self, non_embedding: bool = True) -> int:
|
196 |
+
"""
|
197 |
+
Return the number of parameters in the model.
|
198 |
+
For non-embedding count (default), the position embeddings get subtracted.
|
199 |
+
The token embeddings would too, except due to the parameter sharing these
|
200 |
+
params are actually used as weights in the final layer, so we include them.
|
201 |
+
"""
|
202 |
+
n_params = sum(p.numel() for p in self.parameters())
|
203 |
+
if non_embedding:
|
204 |
+
n_params -= self.transformer.wte.weight.numel()
|
205 |
+
n_params -= self.transformer.wpe.weight.numel()
|
206 |
+
return n_params
|
207 |
+
|
208 |
+
def forward(
|
209 |
+
self,
|
210 |
+
idx: torch.Tensor,
|
211 |
+
merge_context: bool = False,
|
212 |
+
past_kv: torch.Tensor = None,
|
213 |
+
position_ids: torch.Tensor = None,
|
214 |
+
use_cache: bool = False,
|
215 |
+
):
|
216 |
+
device = idx.device
|
217 |
+
b, t = idx.size()
|
218 |
+
if past_kv is not None:
|
219 |
+
# When past_kv is provided, this is optimized for autoregressive generation
|
220 |
+
assert (
|
221 |
+
t == 1
|
222 |
+
), "should only pass in the last token of the sequence when using kv_cache"
|
223 |
+
# Shape: (b, 1, n_embd), single token case
|
224 |
+
tok_emb = self.transformer.wte(idx)
|
225 |
+
else:
|
226 |
+
if merge_context:
|
227 |
+
# Custom feature: assumes first 256 tokens are one context, next 256 another, rest is sequence
|
228 |
+
assert idx.shape[1] >= 256 + 256 + 1
|
229 |
+
t = idx.shape[1] - 256 # Adjusts t for merged context length
|
230 |
+
else:
|
231 |
+
assert (
|
232 |
+
t <= self.config.block_size
|
233 |
+
), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
|
234 |
+
|
235 |
+
if merge_context:
|
236 |
+
# Merges two contexts by adding their embeddings, not a standard GPT behavior
|
237 |
+
tok_emb = torch.cat(
|
238 |
+
[
|
239 |
+
self.transformer.wte(idx[:, :256])
|
240 |
+
+ self.transformer.wte(idx[:, 256 : 256 + 256]),
|
241 |
+
self.transformer.wte(idx[:, 256 + 256 :]),
|
242 |
+
],
|
243 |
+
dim=1,
|
244 |
+
)
|
245 |
+
else:
|
246 |
+
tok_emb = self.transformer.wte(idx)
|
247 |
+
|
248 |
+
if past_kv is None:
|
249 |
+
past_length = 0
|
250 |
+
# Empty cache for each layer
|
251 |
+
past_kv = tuple([None] * len(self.transformer.h))
|
252 |
+
else:
|
253 |
+
# Infers prior sequence length from cache
|
254 |
+
past_length = past_kv[0][0].size(-2)
|
255 |
+
|
256 |
+
if position_ids is None:
|
257 |
+
position_ids = torch.arange(
|
258 |
+
past_length, t + past_length, dtype=torch.long, device=device
|
259 |
+
)
|
260 |
+
position_ids = position_ids.unsqueeze(0)
|
261 |
+
assert position_ids.shape == (1, t)
|
262 |
+
|
263 |
+
pos_emb = self.transformer.wpe(position_ids)
|
264 |
+
|
265 |
+
x = self.transformer.drop(tok_emb + pos_emb)
|
266 |
+
|
267 |
+
# Prepares cache for key-value pairs if enabled
|
268 |
+
new_kv = () if use_cache else None
|
269 |
+
|
270 |
+
for i, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
|
271 |
+
x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache)
|
272 |
+
if use_cache:
|
273 |
+
new_kv = new_kv + (kv,) # Accumulates new key-value pairs for caching
|
274 |
+
|
275 |
+
x = self.transformer.ln_f(x)
|
276 |
+
|
277 |
+
# Optimization: only computes logits for the last token, efficient for generation
|
278 |
+
logits = self.lm_head(x[:, [-1], :]) # Preserves time dim with [-1]
|
279 |
+
|
280 |
+
return (
|
281 |
+
logits,
|
282 |
+
new_kv,
|
283 |
+
) # Returns tuple: logits for next token, cache if requested
|
284 |
+
|
285 |
+
|
286 |
+
class NonCausalSelfAttention(nn.Module):
|
287 |
+
def __init__(self, config):
|
288 |
+
super().__init__()
|
289 |
+
assert config.n_embd % config.n_head == 0
|
290 |
+
# key, query, value projections for all heads, but in a batch
|
291 |
+
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
|
292 |
+
# output projection
|
293 |
+
self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
|
294 |
+
# regularization
|
295 |
+
self.attn_dropout = nn.Dropout(config.dropout)
|
296 |
+
self.resid_dropout = nn.Dropout(config.dropout)
|
297 |
+
self.n_head = config.n_head
|
298 |
+
self.n_embd = config.n_embd
|
299 |
+
self.dropout = config.dropout
|
300 |
+
# flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
|
301 |
+
self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention")
|
302 |
+
|
303 |
+
def forward(self, x):
|
304 |
+
B, T, C = (
|
305 |
+
x.size()
|
306 |
+
) # batch size, sequence length, embedding dimensionality (n_embd)
|
307 |
+
|
308 |
+
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
|
309 |
+
q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
|
310 |
+
k = k.view(B, T, self.n_head, C // self.n_head).transpose(
|
311 |
+
1, 2
|
312 |
+
) # (B, nh, T, hs)
|
313 |
+
q = q.view(B, T, self.n_head, C // self.n_head).transpose(
|
314 |
+
1, 2
|
315 |
+
) # (B, nh, T, hs)
|
316 |
+
v = v.view(B, T, self.n_head, C // self.n_head).transpose(
|
317 |
+
1, 2
|
318 |
+
) # (B, nh, T, hs)
|
319 |
+
|
320 |
+
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
|
321 |
+
if self.flash:
|
322 |
+
# efficient attention using Flash Attention CUDA kernels
|
323 |
+
y = torch.nn.functional.scaled_dot_product_attention(
|
324 |
+
q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=False
|
325 |
+
)
|
326 |
+
else:
|
327 |
+
# manual implementation of attention
|
328 |
+
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
|
329 |
+
att = F.softmax(att, dim=-1)
|
330 |
+
att = self.attn_dropout(att)
|
331 |
+
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
|
332 |
+
y = (
|
333 |
+
y.transpose(1, 2).contiguous().view(B, T, C)
|
334 |
+
) # re-assemble all head outputs side by side
|
335 |
+
|
336 |
+
# output projection
|
337 |
+
y = self.resid_dropout(self.c_proj(y))
|
338 |
+
return y
|
339 |
+
|
340 |
+
|
341 |
+
class FineBlock(nn.Module):
|
342 |
+
def __init__(self, config):
|
343 |
+
super().__init__()
|
344 |
+
self.ln_1 = nn.LayerNorm(config.n_embd)
|
345 |
+
self.attn = NonCausalSelfAttention(config)
|
346 |
+
self.ln_2 = nn.LayerNorm(config.n_embd)
|
347 |
+
self.mlp = MLP(config)
|
348 |
+
|
349 |
+
def forward(self, x):
|
350 |
+
x = x + self.attn(self.ln_1(x))
|
351 |
+
x = x + self.mlp(self.ln_2(x))
|
352 |
+
return x
|
353 |
+
|
354 |
+
|
355 |
+
class FineGPT(GPT):
|
356 |
+
def __init__(self, config):
|
357 |
+
super().__init__(config)
|
358 |
+
del self.lm_head
|
359 |
+
self.config = config
|
360 |
+
self.n_codes_total = config.n_codes_total
|
361 |
+
self.transformer = nn.ModuleDict(
|
362 |
+
dict(
|
363 |
+
wtes=nn.ModuleList(
|
364 |
+
[
|
365 |
+
nn.Embedding(config.input_vocab_size, config.n_embd)
|
366 |
+
for _ in range(config.n_codes_total)
|
367 |
+
]
|
368 |
+
),
|
369 |
+
wpe=nn.Embedding(config.block_size, config.n_embd),
|
370 |
+
drop=nn.Dropout(config.dropout),
|
371 |
+
h=nn.ModuleList([FineBlock(config) for _ in range(config.n_layer)]),
|
372 |
+
ln_f=nn.LayerNorm(config.n_embd),
|
373 |
+
)
|
374 |
+
)
|
375 |
+
self.lm_heads = nn.ModuleList(
|
376 |
+
[
|
377 |
+
nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
|
378 |
+
for _ in range(config.n_codes_given, self.n_codes_total)
|
379 |
+
]
|
380 |
+
)
|
381 |
+
for i in range(self.n_codes_total - config.n_codes_given):
|
382 |
+
self.transformer.wtes[i + 1].weight = self.lm_heads[i].weight
|
383 |
+
|
384 |
+
def forward(self, pred_idx, idx):
|
385 |
+
device = idx.device
|
386 |
+
b, t, codes = idx.size()
|
387 |
+
assert (
|
388 |
+
t <= self.config.block_size
|
389 |
+
), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
|
390 |
+
assert pred_idx > 0, "cannot predict 0th codebook"
|
391 |
+
assert codes == self.n_codes_total, (b, t, codes)
|
392 |
+
pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(
|
393 |
+
0
|
394 |
+
) # shape (1, t)
|
395 |
+
|
396 |
+
# forward the GPT model itself
|
397 |
+
tok_embs = [
|
398 |
+
wte(idx[:, :, i]).unsqueeze(-1)
|
399 |
+
for i, wte in enumerate(self.transformer.wtes)
|
400 |
+
] # per-codebook token embeddings, each of shape (b, t, n_embd, 1)
|
401 |
+
tok_emb = torch.cat(tok_embs, dim=-1)
|
402 |
+
pos_emb = self.transformer.wpe(
|
403 |
+
pos
|
404 |
+
) # position embeddings of shape (1, t, n_embd)
|
405 |
+
x = tok_emb[:, :, :, : pred_idx + 1].sum(dim=-1)
|
406 |
+
x = self.transformer.drop(x + pos_emb)
|
407 |
+
for block in self.transformer.h:
|
408 |
+
x = block(x)
|
409 |
+
x = self.transformer.ln_f(x)
|
410 |
+
logits = self.lm_heads[pred_idx - self.config.n_codes_given](x)
|
411 |
+
return logits
|
412 |
+
|
413 |
+
def get_num_params(self, non_embedding=True):
|
414 |
+
"""
|
415 |
+
Return the number of parameters in the model.
|
416 |
+
For non-embedding count (default), the position embeddings get subtracted.
|
417 |
+
The token embeddings would too, except due to the parameter sharing these
|
418 |
+
params are actually used as weights in the final layer, so we include them.
|
419 |
+
"""
|
420 |
+
n_params = sum(p.numel() for p in self.parameters())
|
421 |
+
if non_embedding:
|
422 |
+
for wte in self.transformer.wtes:
|
423 |
+
n_params -= wte.weight.numel()
|
424 |
+
n_params -= self.transformer.wpe.weight.numel()
|
425 |
+
return n_params
|
core/model/hubert.py
ADDED
@@ -0,0 +1,237 @@
1 |
+
from dataclasses import dataclass
|
2 |
+
from pathlib import Path
|
3 |
+
from typing import Optional, Tuple, Union, Literal
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
from transformers.modeling_outputs import BaseModelOutput
|
8 |
+
from transformers import HubertModel, AutoConfig, AutoModel
|
9 |
+
|
10 |
+
|
11 |
+
@dataclass
|
12 |
+
class CustomHubertConfig:
|
13 |
+
"""Configuration class for CustomHubert model."""
|
14 |
+
|
15 |
+
# e.g., "facebook/hubert-base-ls960" or "facebook/hubert-large-ll60k"
|
16 |
+
checkpoint_name: str
|
17 |
+
# Layer to extract features from (0-indexed, e.g., 9 for 10th layer)
|
18 |
+
feature_layer: int = 11
|
19 |
+
# Target audio sample rate in Hz
|
20 |
+
target_sample_rate: int = 16000
|
21 |
+
# Optional length multiple for audio trimming
|
22 |
+
seq_len_multiple_of: Optional[int] = None
|
23 |
+
|
24 |
+
|
25 |
+
@dataclass
|
26 |
+
class HubertForBarkSemanticConfig:
|
27 |
+
"""Configuration for HuBERTForBarkSemantic."""
|
28 |
+
|
29 |
+
# HuBERT checkpoint used for the feature extractor
|
30 |
+
checkpoint_name: Literal["facebook/hubert-base-ls960", "hubert-large-ls960-ft"]
|
31 |
+
vocab_size: int
|
32 |
+
# Layer to extract features from
|
33 |
+
feature_layer: int = 11
|
34 |
+
# the last three vocabulary ids are reserved for the SOS, EOS, and PAD tokens
|
35 |
+
# maximum target sequence length
|
36 |
+
max_target_length: int = 2000
|
37 |
+
num_decoder_layer: int = 12
|
38 |
+
sos_token_id: int = 10000
|
39 |
+
eos_token_id: int = 10001
|
40 |
+
|
41 |
+
|
42 |
+
class HubertFeatureExtractor(nn.Module):
|
43 |
+
"""
|
44 |
+
A custom HuBERT model that loads a pretrained model from transformers and extracts
|
45 |
+
features from a specified layer. Processes raw audio waveforms and returns hidden states.
|
46 |
+
|
47 |
+
Args:
|
48 |
+
config (CustomHubertConfig): Configuration specifying checkpoint, layer, and audio settings.
|
49 |
+
device (torch.device, optional): Device to run the model on (e.g., "cuda" or "cpu").
|
50 |
+
"""
|
51 |
+
|
52 |
+
def __init__(
|
53 |
+
self,
|
54 |
+
config: CustomHubertConfig,
|
55 |
+
load_pretrained_weights: bool,
|
56 |
+
device: Optional[torch.device] = None,
|
57 |
+
):
|
58 |
+
super().__init__()
|
59 |
+
self.config = config
|
60 |
+
self.target_sample_rate = config.target_sample_rate
|
61 |
+
|
62 |
+
# Load pretrained HuBERT model from transformers
|
63 |
+
self.hubert_config = AutoConfig.from_pretrained(config.checkpoint_name)
|
64 |
+
if load_pretrained_weights:
|
65 |
+
self.model = HubertModel.from_pretrained(config.checkpoint_name)
|
66 |
+
else:
|
67 |
+
# don't download the pretrained weights, init the model from the config
|
68 |
+
self.model = AutoModel.from_config(self.hubert_config)
|
69 |
+
|
70 |
+
# Validate feature_layer
|
71 |
+
# e.g., 12 for BASE, 24 for LARGE
|
72 |
+
num_layers = self.model.config.num_hidden_layers
|
73 |
+
if not (0 <= config.feature_layer < num_layers):
|
74 |
+
raise ValueError(
|
75 |
+
f"feature_layer must be between 0 and {num_layers - 1}, got {config.feature_layer}"
|
76 |
+
)
|
77 |
+
self.feature_layer = config.feature_layer
|
78 |
+
|
79 |
+
# Move to device if specified
|
80 |
+
if device is not None:
|
81 |
+
self.to(device)
|
82 |
+
|
83 |
+
@property
|
84 |
+
def hidden_size(self) -> int:
|
85 |
+
"""Returns the hidden size of the HuBERT model (e.g., 768 for BASE, 1024 for LARGE)."""
|
86 |
+
return self.model.config.hidden_size
|
87 |
+
|
88 |
+
def forward(
|
89 |
+
self,
|
90 |
+
wav_input: torch.Tensor,
|
91 |
+
) -> torch.Tensor:
|
92 |
+
"""
|
93 |
+
Processes raw audio waveforms through HuBERT and extracts features from the specified layer.
|
94 |
+
The input audio is expected to be sampled at 16 kHz.
|
95 |
+
|
96 |
+
Args:
|
97 |
+
wav_input (torch.Tensor): Raw audio waveforms, shape [batch_size, audio_length].
|
98 |
+
|
99 |
+
|
100 |
+
Returns:
|
101 |
+
torch.Tensor: Hidden states from the specified layer,
|
102 |
+
with shape [batch_size, seq_length, hidden_size];
|
103 |
+
the batch and time dimensions are preserved (no flattening is applied).
|
104 |
+
"""
|
105 |
+
|
106 |
+
# Forward pass through HuBERT
|
107 |
+
# output_hidden_states=True returns all layer outputs
|
108 |
+
outputs: BaseModelOutput = self.model(
|
109 |
+
input_values=wav_input, output_hidden_states=True, return_dict=True
|
110 |
+
)
|
111 |
+
|
112 |
+
# Extract features from the specified layer (0-indexed)
|
113 |
+
# hidden_states is a tuple of [batch_size, seq_length, hidden_size] for each layer
|
114 |
+
features = outputs.hidden_states[self.feature_layer] # e.g., [2, 500, 768]
|
115 |
+
features = features.contiguous()
|
116 |
+
return features
|
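As a shape sanity check: HuBERT BASE downsamples 16 kHz audio by a factor of 320 (roughly 50 frames per second), so one second of audio yields about 49 frames of 768-dimensional features. A sketch with randomly initialized weights (illustrative only):

import torch
from core.model.hubert import HubertFeatureExtractor, CustomHubertConfig

config = CustomHubertConfig(checkpoint_name="facebook/hubert-base-ls960", feature_layer=11)
extractor = HubertFeatureExtractor(config, load_pretrained_weights=False)

wav = torch.randn(2, 16_000)   # one second of 16 kHz audio per batch item
features = extractor(wav)
print(features.shape)          # approximately [2, 49, 768]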
117 |
+
|
118 |
+
|
119 |
+
class HuBERTForBarkSemantic(nn.Module):
|
120 |
+
def __init__(
|
121 |
+
self,
|
122 |
+
config: HubertForBarkSemanticConfig,
|
123 |
+
load_hubert_pretrained_weights: bool = True,
|
124 |
+
device: Optional[torch.device] = None,
|
125 |
+
):
|
126 |
+
super().__init__()
|
127 |
+
self.config = config
|
128 |
+
|
129 |
+
# HuBERT feature extractor
|
130 |
+
hubert_config = CustomHubertConfig(
|
131 |
+
checkpoint_name=config.checkpoint_name,
|
132 |
+
feature_layer=config.feature_layer,
|
133 |
+
)
|
134 |
+
self.hubert = HubertFeatureExtractor(
|
135 |
+
config=hubert_config,
|
136 |
+
load_pretrained_weights=load_hubert_pretrained_weights,
|
137 |
+
device=device,
|
138 |
+
)
|
139 |
+
|
140 |
+
# e.g., 768 for BASE
|
141 |
+
input_size = self.hubert.model.config.hidden_size
|
142 |
+
|
143 |
+
# Transformer Decoder
|
144 |
+
self.decoder_embedding = nn.Embedding(config.vocab_size, input_size)
|
145 |
+
self.pos_embedding = nn.Parameter(
|
146 |
+
torch.zeros(1, config.max_target_length, input_size)
|
147 |
+
)
|
148 |
+
self.decoder = nn.TransformerDecoder(
|
149 |
+
nn.TransformerDecoderLayer(
|
150 |
+
d_model=input_size,
|
151 |
+
nhead=8,
|
152 |
+
dim_feedforward=2048,
|
153 |
+
dropout=0.1,
|
154 |
+
batch_first=True,
|
155 |
+
),
|
156 |
+
num_layers=config.num_decoder_layer, # Adjust as needed
|
157 |
+
)
|
158 |
+
self.fc = nn.Linear(input_size, config.vocab_size)
|
159 |
+
|
160 |
+
if device is not None:
|
161 |
+
self.to(device)
|
162 |
+
|
163 |
+
def save_state_dict(self, save_path: str):
|
164 |
+
torch.save(self.state_dict(), save_path)
|
165 |
+
|
166 |
+
def forward(self, wav_input: torch.Tensor, tgt: torch.Tensor) -> torch.Tensor:
|
167 |
+
"""
|
168 |
+
Forward pass: Extracts HuBERT features and predicts semantic token probabilities.
|
169 |
+
|
170 |
+
Args:
|
171 |
+
wav_input: [batch_size, audio_length] (e.g., [2, 160000])
|
172 |
+
tgt: the target sequence
|
173 |
+
|
174 |
+
Returns:
|
175 |
+
[batch_size, target_length, vocab_size] logits over the semantic vocabulary
|
176 |
+
"""
|
177 |
+
memory: torch.Tensor = self.hubert(wav_input) # [B, T, 768]
|
178 |
+
B, T_tgt = tgt.shape
|
179 |
+
tgt_emb = self.decoder_embedding(tgt) + self.pos_embedding[:, :T_tgt, :]
|
180 |
+
tgt_mask = nn.Transformer.generate_square_subsequent_mask(T_tgt).to(tgt.device)
|
181 |
+
|
182 |
+
output: torch.Tensor = self.decoder(tgt_emb, memory, tgt_mask=tgt_mask)
|
183 |
+
logits = self.fc(output)
|
184 |
+
return logits
|
185 |
+
|
186 |
+
@torch.no_grad
|
187 |
+
def generate(
|
188 |
+
self,
|
189 |
+
wav_input: torch.Tensor,
|
190 |
+
temperature: Optional[float] = 0.8,
|
191 |
+
eos_p: Optional[float] = 0.5,
|
192 |
+
max_length: int = 600,
|
193 |
+
) -> torch.Tensor:
|
194 |
+
"""
|
195 |
+
Inference: autoregressive generation.
|
196 |
+
Assumes wav_input is sampled at 16 kHz."""
|
197 |
+
self.eval()
|
198 |
+
memory = self.hubert(wav_input)
|
199 |
+
B = wav_input.shape[0]
|
200 |
+
tgt = torch.full(
|
201 |
+
size=(B, 1), fill_value=self.config.sos_token_id, device=wav_input.device
|
202 |
+
)
|
203 |
+
|
204 |
+
for _ in range(max_length):
|
205 |
+
tgt_emb = (
|
206 |
+
self.decoder_embedding(tgt) + self.pos_embedding[:, : tgt.shape[1], :]
|
207 |
+
)
|
208 |
+
tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.shape[1]).to(
|
209 |
+
tgt.device
|
210 |
+
)
|
211 |
+
|
212 |
+
output = self.decoder(tgt_emb, memory, tgt_mask=tgt_mask)
|
213 |
+
# logits shape (B, T', vocab_size)
|
214 |
+
logits: torch.Tensor = self.fc(output[:, -1, :])
|
215 |
+
|
216 |
+
if temperature is not None and temperature > 0:
|
217 |
+
probs = torch.softmax(input=logits / temperature, dim=-1)
|
218 |
+
next_token = torch.multinomial(input=probs, num_samples=1)
|
219 |
+
else:
|
220 |
+
probs = torch.softmax(input=logits, dim=-1)
|
221 |
+
next_token = logits.argmax(dim=-1, keepdim=True)
|
222 |
+
|
223 |
+
# stop if the EOS token probabilities are higher than the provided eos_p
|
224 |
+
if eos_p is not None and eos_p > 0:
|
225 |
+
if torch.all(probs[:, self.config.eos_token_id] > eos_p):
|
226 |
+
break
|
227 |
+
|
228 |
+
# early stopping
|
229 |
+
if torch.all(next_token == self.config.eos_token_id):
|
230 |
+
break
|
231 |
+
|
232 |
+
tgt = torch.cat([tgt, next_token], dim=1)
|
233 |
+
if (next_token == self.config.eos_token_id).all():
|
234 |
+
break
|
235 |
+
|
236 |
+
# remove the [SOS] token from the generated semantic sequences
|
237 |
+
return tgt[:, 1:]
|
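A rough inference sketch for this model on a 16 kHz waveform (the vocab_size split below is an assumption consistent with the SOS/EOS/PAD ids used in the trainer; checkpoint loading from core/memory/models.py is omitted, so the weights here are untrained):

import torch
from core.model.hubert import HuBERTForBarkSemantic, HubertForBarkSemanticConfig

config = HubertForBarkSemanticConfig(
    checkpoint_name="facebook/hubert-base-ls960",
    vocab_size=10_003,  # assumed: 10_000 semantic tokens + SOS/EOS/PAD
)
model = HuBERTForBarkSemantic(config, load_hubert_pretrained_weights=False)

wav = torch.randn(1, 2 * 16_000)  # two seconds of 16 kHz audio
semantic_tokens = model.generate(wav, temperature=0.8, eos_p=0.5, max_length=50)
print(semantic_tokens.shape)      # [1, <= 50]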
core/trainer/__init__.py
ADDED
@@ -0,0 +1 @@
1 |
+
from core.trainer.custom_hubert_trainer import *
|
core/trainer/custom_hubert_trainer.py
ADDED
@@ -0,0 +1,555 @@
1 |
+
import os
|
2 |
+
from pathlib import Path
|
3 |
+
from datetime import datetime
|
4 |
+
import logging
|
5 |
+
import sys
|
6 |
+
import numpy as np
|
7 |
+
import torch
|
8 |
+
import torch.nn as nn
|
9 |
+
from torch.optim import Adam
|
10 |
+
from torch.optim.lr_scheduler import LRScheduler, LinearLR
|
11 |
+
from torch.utils.data import Dataset, DataLoader, random_split
|
12 |
+
import torchaudio
|
13 |
+
from tqdm import tqdm
|
14 |
+
|
15 |
+
from typing import Literal, List, Optional, Tuple, Dict, Callable, Union, Any
|
16 |
+
from core.data_model import WavSemantic, WavSemanticDataset
|
17 |
+
from core.utils import read_audio_file, upload_file_to_hf
|
18 |
+
|
19 |
+
# cuDNN raised an error about non-contiguous input at the LSTM layer; disabling it fixes the issue
|
20 |
+
torch.backends.cudnn.enabled = False
|
21 |
+
|
22 |
+
# Set up logging
|
23 |
+
logging.basicConfig(
|
24 |
+
level=logging.INFO,
|
25 |
+
format="%(asctime)s - %(levelname)s - %(message)s",
|
26 |
+
handlers=[logging.StreamHandler(sys.stdout)],
|
27 |
+
)
|
28 |
+
logger = logging.getLogger(__name__)
|
29 |
+
|
30 |
+
|
31 |
+
HUBERT_SAMPLE_RATE = 16000
|
32 |
+
# 10_000 and 10_001 are for SOS and EOS tokens
|
33 |
+
SEMANTIC_PADDING_TOKEN = 10002
|
34 |
+
SOS_TOKEN = 10_000
|
35 |
+
EOS_TOKEN = 10_001
|
36 |
+
|
37 |
+
|
38 |
+
class WavSemanticTorchDataset(Dataset):
|
39 |
+
"""PyTorch Dataset for WavSemantic data with resampling and noise augmentation.
|
40 |
+
Padding is carried out in a collator function.
|
41 |
+
|
42 |
+
Args:
|
43 |
+
samples: List of WavSemantic objects (speech data).
|
44 |
+
orig_sample_rate: Original sample rate of the audio.
|
45 |
+
target_sample_rate: Desired sample rate (default: 16000 Hz).
|
46 |
+
device: Device to move tensors to (optional).
|
47 |
+
noises: List of noise waveforms as NumPy arrays (optional, for augmentation).
|
48 |
+
Noise audio must already be at target_sample_rate; this class does not resample it.
|
49 |
+
augment_prob: Probability of applying noise augmentation (default: 0.5).
|
50 |
+
"""
|
51 |
+
|
52 |
+
def __init__(
|
53 |
+
self,
|
54 |
+
samples: List["WavSemantic"],
|
55 |
+
orig_sample_rate: int,
|
56 |
+
target_sample_rate: Optional[int] = 16000,
|
57 |
+
device: Optional[torch.device] = None,
|
58 |
+
noises: Optional[List[np.ndarray]] = None,
|
59 |
+
augment_prob: float = 0.5,
|
60 |
+
):
|
61 |
+
self.samples = samples
|
62 |
+
self.orig_sample_rate = orig_sample_rate
|
63 |
+
self.target_sample_rate = target_sample_rate
|
64 |
+
self.device = device
|
65 |
+
self.noises = noises
|
66 |
+
self.augment_prob = augment_prob
|
67 |
+
self.resampler = torchaudio.transforms.Resample(
|
68 |
+
orig_freq=orig_sample_rate, new_freq=target_sample_rate
|
69 |
+
)
|
70 |
+
|
71 |
+
def __len__(self) -> int:
|
72 |
+
return len(self.samples)
|
73 |
+
|
74 |
+
def _normalize_waveform(self, wav: torch.Tensor) -> torch.Tensor:
|
75 |
+
"""Normalize waveform to [-1, 1]."""
|
76 |
+
max_val = wav.abs().max()
|
77 |
+
if max_val > 0:
|
78 |
+
wav = wav / max_val
|
79 |
+
return wav
|
80 |
+
|
81 |
+
def _add_time_varying_noise(
|
82 |
+
self, speech: torch.Tensor, noise: torch.Tensor, snr_db: float
|
83 |
+
) -> torch.Tensor:
|
84 |
+
"""Add noise to a random segment of the speech with fade-in/fade-out."""
|
85 |
+
speech_len = speech.size(0)
|
86 |
+
noise_len = noise.size(0)
|
87 |
+
|
88 |
+
# Match noise length (loop or trim)
|
89 |
+
if noise_len < speech_len:
|
90 |
+
repeats = int(np.ceil(speech_len / noise_len))
|
91 |
+
noise = noise.repeat(repeats)[:speech_len]
|
92 |
+
else:
|
93 |
+
noise = noise[:speech_len]
|
94 |
+
|
95 |
+
# Random segment (50%-100% of speech length)
|
96 |
+
seg_len = int(speech_len * np.random.uniform(0.5, 1.0))
|
97 |
+
start = np.random.randint(0, speech_len - seg_len + 1)
|
98 |
+
end = start + seg_len
|
99 |
+
|
100 |
+
# Compute noise scaling based on SNR
|
101 |
+
speech_energy = torch.mean(speech[start:end] ** 2)
|
102 |
+
noise_energy = torch.mean(noise[start:end] ** 2)
|
103 |
+
snr_linear = 10 ** (snr_db / 10.0)
|
104 |
+
noise_scale = torch.sqrt(speech_energy / (noise_energy * snr_linear + 1e-10))
|
105 |
+
|
106 |
+
# Apply noise to segment with fade-in/fade-out
|
107 |
+
fade_len = min(1000, seg_len // 4) # Fade over 1000 samples or 1/4 segment
|
108 |
+
fade_in = torch.linspace(0, 1, fade_len)
|
109 |
+
fade_out = torch.linspace(1, 0, fade_len)
|
110 |
+
mask = torch.ones(seg_len)
|
111 |
+
if fade_len > 0:
|
112 |
+
mask[:fade_len] = fade_in
|
113 |
+
mask[-fade_len:] = fade_out
|
114 |
+
|
115 |
+
noisy_segment = speech[start:end] + (noise_scale * noise[start:end] * mask)
|
116 |
+
noisy_speech = speech.clone()
|
117 |
+
noisy_speech[start:end] = noisy_segment
|
118 |
+
|
119 |
+
return torch.clamp(noisy_speech, -1, 1)
|
120 |
+
|
121 |
+
def _augment_with_noise(self, wav: torch.Tensor) -> torch.Tensor:
|
122 |
+
"""Augment waveform with random noise mixture."""
|
123 |
+
if not self.noises or len(self.noises) == 0:
|
124 |
+
return wav
|
125 |
+
|
126 |
+
# Decide how many noises to mix (1 or 2)
|
127 |
+
num_noises = np.random.randint(1, 3) # 1 or 2 noises
|
128 |
+
random_indices = np.random.randint(0, len(self.noises), size=num_noises)
|
129 |
+
selected_noises = [self.noises[i] for i in random_indices]
|
130 |
+
noisy_wav = wav.clone()
|
131 |
+
for noise_np in selected_noises:
|
132 |
+
noise = torch.from_numpy(noise_np).float()
|
133 |
+
noise = self._normalize_waveform(noise) # Normalize noise
|
134 |
+
snr_db = np.random.uniform(0, 20) # Random SNR between 0-20 dB
|
135 |
+
noisy_wav = self._add_time_varying_noise(noisy_wav, noise, snr_db)
|
136 |
+
|
137 |
+
# Volume normalization: re-normalize after mixing
|
138 |
+
noisy_wav = self._normalize_waveform(noisy_wav)
|
139 |
+
return noisy_wav
|
140 |
+
|
141 |
+
def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
|
142 |
+
sample = self.samples[idx]
|
143 |
+
|
144 |
+
# Convert NumPy wav to torch tensor and resample
|
145 |
+
wav_tensor = torch.from_numpy(sample.wav).float()
|
146 |
+
if self.orig_sample_rate != self.target_sample_rate:
|
147 |
+
wav_tensor = self.resampler(wav_tensor)
|
148 |
+
|
149 |
+
# Normalize to [-1, 1]
|
150 |
+
wav_tensor = self._normalize_waveform(wav_tensor)
|
151 |
+
|
152 |
+
# Apply noise augmentation with probability
|
153 |
+
if self.noises and np.random.rand() < self.augment_prob:
|
154 |
+
wav_tensor = self._augment_with_noise(wav_tensor)
|
155 |
+
|
156 |
+
# Convert semantic to torch tensor (assuming integer tokens for CTC)
|
157 |
+
semantic_tensor = torch.from_numpy(sample.semantic).long()
|
158 |
+
|
159 |
+
# Move to device if specified
|
160 |
+
if self.device is not None:
|
161 |
+
wav_tensor = wav_tensor.to(self.device)
|
162 |
+
semantic_tensor = semantic_tensor.to(self.device)
|
163 |
+
|
164 |
+
return wav_tensor, semantic_tensor
|
165 |
+
|
166 |
+
|
167 |
+
def wav_semantic_collate_fn(
|
168 |
+
batch: List[Tuple[torch.Tensor, torch.Tensor]],
|
169 |
+
sos_token: int = SOS_TOKEN, # Adjust based on your vocab
|
170 |
+
eos_token: int = EOS_TOKEN, # Adjust based on your vocab
|
171 |
+
padding_token: int = SEMANTIC_PADDING_TOKEN, # Adjust based on your vocab
|
172 |
+
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
173 |
+
"""
|
174 |
+
Collate function for wav and semantic token pairs, adding <SOS> and <EOS> to targets.
|
175 |
+
|
176 |
+
Args:
|
177 |
+
batch: List of (wav_tensor, semantic_tensor) tuples.
|
178 |
+
sos_token: Index of the <SOS> token.
|
179 |
+
eos_token: Index of the <EOS> token.
|
180 |
+
padding_token: Index of the padding token.
|
181 |
+
|
182 |
+
Returns:
|
183 |
+
Tuple of (padded_wavs, padded_targets, wav_lengths, target_lengths).
|
184 |
+
- padded_wavs: [B, max_wav_len]
|
185 |
+
- padded_targets: [B, max_target_len] with <SOS> and <EOS>
|
186 |
+
- wav_lengths: [B] (original wav lengths)
|
187 |
+
- target_lengths: [B] (original semantic lengths + 2 for <SOS> and <EOS>)
|
188 |
+
"""
|
189 |
+
waves, semantics = zip(*batch)
|
190 |
+
# Add <SOS> and <EOS> to each semantic sequence
|
191 |
+
semantics_with_tokens = [
|
192 |
+
torch.cat(
|
193 |
+
[
|
194 |
+
torch.tensor([sos_token], dtype=torch.long, device=semantic.device),
|
195 |
+
semantic,
|
196 |
+
torch.tensor([eos_token], dtype=torch.long, device=semantic.device),
|
197 |
+
]
|
198 |
+
)
|
199 |
+
for semantic in semantics
|
200 |
+
]
|
201 |
+
|
202 |
+
# Compute lengths *after* adding <SOS> and <EOS>
|
203 |
+
wav_lengths = torch.tensor([wav.size(0) for wav in waves], dtype=torch.long)
|
204 |
+
target_lengths = torch.tensor(
|
205 |
+
[semantic.size(0) for semantic in semantics_with_tokens], dtype=torch.long
|
206 |
+
)
|
207 |
+
|
208 |
+
# Pad waves and targets to max length in batch
|
209 |
+
max_wav_len = max(wav_lengths).item()
|
210 |
+
max_target_len = max(target_lengths).item()
|
211 |
+
|
212 |
+
padded_wavs = torch.zeros(size=(len(waves), max_wav_len), device=waves[0].device)
|
213 |
+
padded_targets = torch.full(
|
214 |
+
size=(len(semantics), max_target_len),
|
215 |
+
fill_value=padding_token,
|
216 |
+
dtype=torch.long,
|
217 |
+
device=semantics[0].device,
|
218 |
+
)
|
219 |
+
|
220 |
+
for i, (wav, semantic) in enumerate(zip(waves, semantics_with_tokens)):
|
221 |
+
padded_wavs[i, : wav.size(0)] = wav
|
222 |
+
padded_targets[i, : semantic.size(0)] = semantic
|
223 |
+
|
224 |
+
return padded_wavs, padded_targets, wav_lengths, target_lengths
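For reference, a quick sanity check of the collate function on a hypothetical two-sample batch (only the shapes matter here; the SOS/EOS/padding indices come from the constants above):

import torch

fake_batch = [
    (torch.randn(16000), torch.randint(0, 100, (10,))),
    (torch.randn(24000), torch.randint(0, 100, (7,))),
]
wavs, targets, wav_lens, target_lens = wav_semantic_collate_fn(fake_batch)
print(wavs.shape)            # torch.Size([2, 24000])  -> padded to the longest wav
print(targets.shape)         # torch.Size([2, 12])     -> 10 tokens + <SOS> + <EOS>
print(wav_lens.tolist())     # [16000, 24000]
print(target_lens.tolist())  # [12, 9]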
|
225 |
+
|
226 |
+
|
227 |
+
def load_train_val_dataloaders(
|
228 |
+
dataset: WavSemanticDataset,
|
229 |
+
train_ratio: float,
|
230 |
+
batch_size: int,
|
231 |
+
target_sample_rate: int = 16000,
|
232 |
+
noises: List[np.ndarray] = None,
|
233 |
+
augment_prob: float = 0.5,
|
234 |
+
device: Optional[torch.device] = None,
|
235 |
+
) -> Tuple[DataLoader, DataLoader]:
|
236 |
+
"""
|
237 |
+
Load train and validation DataLoaders from a WavSemanticDataset with dynamic batch padding.
|
238 |
+
|
239 |
+
Args:
|
240 |
+
dataset: The WavSemanticDataset instance to split and load.
|
241 |
+
train_ratio: Fraction of data to use for training (0 to 1).
|
242 |
+
batch_size: Number of samples per batch.
|
243 |
+
target_sample_rate: Target sample rate for resampling (default: 16000 Hz).
|
244 |
+
device: Optional device to move tensors to (default: None, stays on CPU).
|
245 |
+
|
246 |
+
Returns:
|
247 |
+
Tuple of (train_dataloader, val_dataloader).
|
248 |
+
"""
|
249 |
+
# Split dataset into train and val
|
250 |
+
total_samples = len(dataset.data)
|
251 |
+
train_size = int(train_ratio * total_samples)
|
252 |
+
val_size = total_samples - train_size
|
253 |
+
train_data, val_data = random_split(dataset.data, [train_size, val_size])
|
254 |
+
|
255 |
+
# Create datasets without fixed max_sequence_length
|
256 |
+
train_dataset = WavSemanticTorchDataset(
|
257 |
+
samples=train_data,
|
258 |
+
orig_sample_rate=dataset.sample_rate,
|
259 |
+
target_sample_rate=target_sample_rate,
|
260 |
+
device=device,
|
261 |
+
noises=noises,
|
262 |
+
augment_prob=augment_prob,
|
263 |
+
)
|
264 |
+
val_dataset = WavSemanticTorchDataset(
|
265 |
+
samples=val_data,
|
266 |
+
orig_sample_rate=dataset.sample_rate,
|
267 |
+
target_sample_rate=target_sample_rate,
|
268 |
+
device=device,
|
269 |
+
noises=noises,
|
270 |
+
augment_prob=augment_prob,
|
271 |
+
)
|
272 |
+
|
273 |
+
# Create dataloaders with custom collate function
|
274 |
+
train_dataloader = DataLoader(
|
275 |
+
train_dataset,
|
276 |
+
batch_size=batch_size,
|
277 |
+
shuffle=True,
|
278 |
+
num_workers=0, # Increase if you have multiple cores
|
279 |
+
collate_fn=wav_semantic_collate_fn,
|
280 |
+
)
|
281 |
+
val_dataloader = DataLoader(
|
282 |
+
val_dataset,
|
283 |
+
batch_size=batch_size,
|
284 |
+
shuffle=False,
|
285 |
+
num_workers=0,
|
286 |
+
collate_fn=wav_semantic_collate_fn,
|
287 |
+
)
|
288 |
+
|
289 |
+
return train_dataloader, val_dataloader
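A hedged sketch of wiring the loaders up, assuming a dataset previously saved under ./wav_semantic_dataset (the path, sample count, and batch size are illustrative):

import torch

dataset = WavSemanticDataset.load("./wav_semantic_dataset", num_samples=1000)
train_loader, val_loader = load_train_val_dataloaders(
    dataset,
    train_ratio=0.8,
    batch_size=16,
    target_sample_rate=16000,
    noises=None,          # pass a list of np.ndarray noise clips to enable augmentation
    augment_prob=0.5,
    device=torch.device("cpu"),
)
for wavs, targets, wav_lens, target_lens in train_loader:
    break  # each batch is already padded by wav_semantic_collate_fn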
|
290 |
+
|
291 |
+
|
292 |
+
def train_hubert_one_epoch(
|
293 |
+
model: nn.Module,
|
294 |
+
optimizer: torch.optim.Optimizer,
|
295 |
+
criterion: nn.CrossEntropyLoss,
|
296 |
+
train_dataloader: DataLoader,
|
297 |
+
grad_scaler: torch.amp.GradScaler,
|
298 |
+
device: torch.device,
|
299 |
+
progress_bar: Optional[tqdm] = None,
|
300 |
+
enable_autocast: bool = False,
|
301 |
+
) -> Dict[str, float]:
|
302 |
+
"""
|
303 |
+
Train the HuBERT model for one epoch using mixed-precision training with CrossEntropyLoss.
|
304 |
+
|
305 |
+
Args:
|
306 |
+
model: The HuBERT model with Transformer decoder.
|
307 |
+
optimizer: Optimizer for updating model parameters.
|
308 |
+
criterion: CrossEntropyLoss function.
|
309 |
+
train_dataloader: DataLoader for training data.
|
310 |
+
grad_scaler: Gradient scaler for mixed-precision training.
|
311 |
+
device: Device to train on (e.g., 'cuda', 'mps', 'cpu').
|
312 |
+
progress_bar: Optional tqdm progress bar.
|
313 |
+
|
314 |
+
Returns:
|
315 |
+
Dict with 'loss' metric.
|
316 |
+
"""
|
317 |
+
model.train()
|
318 |
+
total_loss = 0.0
|
319 |
+
for batch in train_dataloader:
|
320 |
+
# tensors were already moved to the target device in the Dataset's __getitem__
|
321 |
+
waves, targets = batch[0], batch[1]
|
322 |
+
optimizer.zero_grad()
|
323 |
+
with torch.autocast(
|
324 |
+
device_type=device.type, dtype=torch.bfloat16, enabled=enable_autocast
|
325 |
+
):
|
326 |
+
|
327 |
+
logits: torch.Tensor = model(waves, targets)
|
328 |
+
|
329 |
+
loss = criterion(logits[:, :-1, :].transpose(1, 2), targets[:, 1:])
|
330 |
+
|
331 |
+
total_loss += loss.detach().item()
|
332 |
+
|
333 |
+
# Mixed precision with scaler (remove scaler if autocast is disabled)
|
334 |
+
grad_scaler.scale(loss).backward()
|
335 |
+
grad_scaler.step(optimizer)
|
336 |
+
grad_scaler.update()
|
337 |
+
|
338 |
+
if progress_bar is not None:
|
339 |
+
progress_bar.update(1)
|
340 |
+
|
341 |
+
avg_loss = total_loss / len(train_dataloader)
|
342 |
+
return {"loss": avg_loss}
|
343 |
+
|
344 |
+
|
345 |
+
def eval_hubert(
|
346 |
+
model: nn.Module,
|
347 |
+
criterion: nn.CrossEntropyLoss,
|
348 |
+
val_dataloader: DataLoader,
|
349 |
+
device: torch.device,
|
350 |
+
sos_token: int = SOS_TOKEN,
|
351 |
+
eos_token: int = EOS_TOKEN,
|
352 |
+
padding_token: int = SEMANTIC_PADDING_TOKEN,
|
353 |
+
) -> Dict[str, float]:
|
354 |
+
"""
|
355 |
+
Evaluate the updated HuBERT model with Transformer decoder on the validation set.
|
356 |
+
|
357 |
+
Args:
|
358 |
+
model: The HuBERT model with Transformer decoder.
|
359 |
+
criterion: CrossEntropyLoss function.
|
360 |
+
val_dataloader: DataLoader for validation data (waves, targets).
|
361 |
+
device: Device to evaluate on.
|
362 |
+
sos_token: Index of the <SOS> token.
|
363 |
+
eos_token: Index of the <EOS> token.
|
364 |
+
padding_token: Index of the padding token.
|
365 |
+
|
366 |
+
Returns:
|
367 |
+
Dict with 'loss', 'accuracy', and 'num_tokens' metrics.
|
368 |
+
"""
|
369 |
+
model.eval()
|
370 |
+
total_loss = 0.0
|
371 |
+
total_correct = 0
|
372 |
+
total_tokens = 0
|
373 |
+
num_batches = 0
|
374 |
+
|
375 |
+
for batch in val_dataloader:
|
376 |
+
# targets: [B, T'] with <SOS> and <EOS>
|
377 |
+
waves, targets = batch[0].to(device), batch[1].to(device)
|
378 |
+
|
379 |
+
with torch.no_grad(), torch.autocast(
|
380 |
+
device_type=device.type, dtype=torch.bfloat16
|
381 |
+
):
|
382 |
+
# [B, T', semantic_vocab_size]
|
383 |
+
# transformers use batch_first=True
|
384 |
+
# targets is a tensor of [B, T'], all including [SOS] and [EOS] tokens
|
385 |
+
logits: torch.Tensor = model(waves, targets)
|
386 |
+
|
387 |
+
# remove the last token predictions from the logits
|
388 |
+
# remove the first token, which is SOS token from the targets
|
389 |
+
# transpose the logits tensor from (B, T, C) to (B, C, T)
|
390 |
+
loss = criterion(logits[:, :-1, :].transpose(1, 2), targets[:, 1:])
|
391 |
+
|
392 |
+
# Calculate accuracy (ignoring padding tokens)
|
393 |
+
preds = logits.argmax(dim=-1)[:, :-1]
|
394 |
+
target_shifted = targets[:, 1:]
|
395 |
+
mask = target_shifted != padding_token
|
396 |
+
total_correct += (preds[mask] == target_shifted[mask]).sum().item()
|
397 |
+
total_tokens += mask.sum().item()
|
398 |
+
|
399 |
+
total_loss += loss.item()
|
400 |
+
num_batches += 1
|
401 |
+
|
402 |
+
avg_loss = total_loss / num_batches
|
403 |
+
accuracy = total_correct / total_tokens if total_tokens > 0 else 0.0
|
404 |
+
|
405 |
+
return {"loss": avg_loss, "accuracy": accuracy, "num_tokens": total_tokens}
|
406 |
+
|
407 |
+
|
408 |
+
def _load_noise_dataset(data_path: str, target_sample_rate: int) -> List[np.ndarray]:
|
409 |
+
data = []
|
410 |
+
# Add more extensions as needed ".flac", ".ogg", ".aiff"
|
411 |
+
audio_extensions = (".wav", ".mp3")
|
412 |
+
|
413 |
+
# Walk through all directories and subdirectories
|
414 |
+
for root, dirs, files in os.walk(data_path):
|
415 |
+
for filename in files:
|
416 |
+
# Check if the file has an audio extension
|
417 |
+
if filename.lower().endswith(audio_extensions):
|
418 |
+
filepath = os.path.join(root, filename)
|
419 |
+
try:
|
420 |
+
audio = read_audio_file(
|
421 |
+
filepath,
|
422 |
+
target_sample_rate=target_sample_rate,
|
423 |
+
channels=1,
|
424 |
+
normalize=False,
|
425 |
+
)
|
426 |
+
data.append(audio)
|
427 |
+
except Exception as e:
|
428 |
+
print(f"Warning: Could not load {filepath}: {str(e)}")
|
429 |
+
continue
|
430 |
+
|
431 |
+
if len(data) == 0:
|
432 |
+
raise RuntimeError(f"No audio files found in {data_path} or its subdirectories")
|
433 |
+
|
434 |
+
return data
|
435 |
+
|
436 |
+
|
437 |
+
def train_hubert_quantizer(
|
438 |
+
model: nn.Module,
|
439 |
+
model_config: Dict[str, Any],
|
440 |
+
lr: float,
|
441 |
+
num_epoch: int,
|
442 |
+
train_ratio: float = 0.8,
|
443 |
+
batch_size: int = 64,
|
444 |
+
data_path: str = "./wav_semantic_dataset",
|
445 |
+
checkpoint_path: str = "./checkpoints",
|
446 |
+
save_checkpoint_every: int = 2,
|
447 |
+
enable_grad_scaler: bool = False,
|
448 |
+
augment_data_with_noise: bool = False,
|
449 |
+
augment_prob: float = 0.5,
|
450 |
+
noise_data_path: str = "./noise_dataset",
|
451 |
+
publish_hf: bool = False,
|
452 |
+
publish_to_repo: str = "",
|
453 |
+
num_samples: int = 5000,
|
454 |
+
device: torch.device = torch.device("cuda"),
|
455 |
+
) -> nn.Module:
|
456 |
+
"""
|
457 |
+
Train a HuBERT model with mixed-precision training and save checkpoints.
|
458 |
+
|
459 |
+
Args:
|
460 |
+
model: The HuBERT model to train.
|
461 |
+
lr: Learning rate for the optimizer.
|
462 |
+
num_epoch: Number of epochs to train.
|
463 |
+
train_ratio: Fraction of data for training.
|
464 |
+
batch_size: Batch size for DataLoaders.
|
465 |
+
data_path: Path to the saved dataset.
|
466 |
+
checkpoint_path: Directory to save checkpoints.
|
467 |
+
save_checkpoint_every: Save checkpoint every N epochs.
|
468 |
+
augment_data_with_noise: whether to add random noise to training audio
|
469 |
+
augment_prob: probability that a sample will be augmented with noise
|
470 |
+
num_samples: maximum number of samples to load from the dataset
|
471 |
+
Returns:
|
472 |
+
The trained model.
|
473 |
+
"""
|
474 |
+
|
475 |
+
# else "mps" if torch.backends.mps.is_available()
|
476 |
+
# mixed-precision training doesn't work on the mps device at the grad_scaler.step(optimizer) step
|
477 |
+
# for testing just run on cpu
|
478 |
+
model.to(device)
|
479 |
+
|
480 |
+
# Load dataset and create dataloaders
|
481 |
+
dataset = WavSemanticDataset.load(data_path, num_samples=num_samples)
|
482 |
+
noises = None
|
483 |
+
if augment_data_with_noise:
|
484 |
+
logger.info(f"reading noise data from {noise_data_path}")
|
485 |
+
noises = _load_noise_dataset(noise_data_path, target_sample_rate=16000)
|
486 |
+
|
487 |
+
train_dataloader, val_dataloader = load_train_val_dataloaders(
|
488 |
+
dataset,
|
489 |
+
train_ratio=train_ratio,
|
490 |
+
batch_size=batch_size,
|
491 |
+
target_sample_rate=HUBERT_SAMPLE_RATE,
|
492 |
+
noises=noises,
|
493 |
+
augment_prob=augment_prob,
|
494 |
+
device=device,
|
495 |
+
)
|
496 |
+
|
497 |
+
optimizer = Adam(model.parameters(), lr=lr)
|
498 |
+
criterion = nn.CrossEntropyLoss(ignore_index=SEMANTIC_PADDING_TOKEN)
|
499 |
+
grad_scaler = torch.amp.GradScaler(device.type, enabled=enable_grad_scaler)
|
500 |
+
progress_bar = tqdm(total=num_epoch * len(train_dataloader), desc="Training HuBERT")
|
501 |
+
# scheduler = LinearLR(
|
502 |
+
# optimizer, start_factor=1, end_factor=0.5, total_iters=(num_epoch / 2)
|
503 |
+
# )
|
504 |
+
scheduler = None
|
505 |
+
Path(checkpoint_path).mkdir(parents=True, exist_ok=True)
|
506 |
+
|
507 |
+
for epoch in range(num_epoch):
|
508 |
+
train_result = train_hubert_one_epoch(
|
509 |
+
model=model,
|
510 |
+
optimizer=optimizer,
|
511 |
+
criterion=criterion,
|
512 |
+
train_dataloader=train_dataloader,
|
513 |
+
grad_scaler=grad_scaler,
|
514 |
+
device=device,
|
515 |
+
progress_bar=progress_bar,
|
516 |
+
enable_autocast=enable_grad_scaler,
|
517 |
+
)
|
518 |
+
with torch.no_grad():
|
519 |
+
eval_result = eval_hubert(
|
520 |
+
model=model,
|
521 |
+
criterion=criterion,
|
522 |
+
val_dataloader=val_dataloader,
|
523 |
+
device=device,
|
524 |
+
)
|
525 |
+
|
526 |
+
if scheduler is not None:
|
527 |
+
scheduler.step()
|
528 |
+
|
529 |
+
logger.info(
|
530 |
+
f"Epoch {epoch + 1}/{num_epoch}, Train: {train_result}, Eval: {eval_result}"
|
531 |
+
)
|
532 |
+
|
533 |
+
if (epoch + 1) % save_checkpoint_every == 0:
|
534 |
+
checkpoint_file = os.path.join(
|
535 |
+
checkpoint_path,
|
536 |
+
f"hubert_epoch_{epoch + 1}_{datetime.now().strftime('%Y_%m_%d_%H_%M')}_eval_loss_{eval_result.get('loss', 0)}_acc_{eval_result.get('accuracy', 0)}.pt",
|
537 |
+
)
|
538 |
+
torch.save(
|
539 |
+
{ # save the model configuration alongside the weights for later loading
|
540 |
+
"epoch": epoch + 1,
|
541 |
+
"model_state_dict": model.state_dict(),
|
542 |
+
# "optimizer_state_dict": optimizer.state_dict(),
|
543 |
+
"train_result": train_result,
|
544 |
+
"eval_result": eval_result,
|
545 |
+
"config": model_config,
|
546 |
+
},
|
547 |
+
checkpoint_file,
|
548 |
+
)
|
549 |
+
logger.info(f"Saved checkpoint to {checkpoint_file}")
|
550 |
+
|
551 |
+
if publish_hf:
|
552 |
+
upload_file_to_hf(checkpoint_file, publish_to_repo, "model")
|
553 |
+
|
554 |
+
progress_bar.close()
|
555 |
+
return model
|
core/utils/__init__.py
ADDED
@@ -0,0 +1,7 @@
1 |
+
from core.utils.audio import *
|
2 |
+
|
3 |
+
from core.utils.text import *
|
4 |
+
|
5 |
+
from core.utils.read_write_files import *
|
6 |
+
|
7 |
+
from core.utils.huggingface import *
|
core/utils/audio.py
ADDED
@@ -0,0 +1,104 @@
1 |
+
"""
|
2 |
+
Helpful functions to process audio
|
3 |
+
"""
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import soundfile as sf
|
7 |
+
|
8 |
+
from typing_extensions import Annotated, Literal, Optional
|
9 |
+
import torchaudio
|
10 |
+
import torch
|
11 |
+
|
12 |
+
AudioChannel = Literal[1, 2]
|
13 |
+
|
14 |
+
|
15 |
+
def read_audio_file(
|
16 |
+
path: str,
|
17 |
+
target_sample_rate: int = 16000,
|
18 |
+
channels: int = 1,
|
19 |
+
normalize: bool = True,
|
20 |
+
max_duration: Optional[float] = None,
|
21 |
+
) -> np.ndarray:
|
22 |
+
"""Read and resample audio file
|
23 |
+
If target_sample_rate differs from the audio's native sample rate, the audio is resampled.
|
24 |
+
If GPU is available, the resampling will be on GPU.
|
25 |
+
|
26 |
+
Args:
|
27 |
+
path: Path to the audio file (supports WAV, FLAC, OGG)
|
28 |
+
target_sample_rate: Target sample rate in Hz (default: 16000)
|
29 |
+
channels: Number of output channels (1 for mono, 2 for stereo)
|
30 |
+
normalize: Whether to normalize audio to [-1, 1]
|
31 |
+
max_duration: Maximum duration in seconds (truncates longer files)
|
32 |
+
Note: resampling runs on CUDA automatically when available; there is no device argument.
|
33 |
+
|
34 |
+
Returns:
|
35 |
+
np.ndarray: Processed audio samples as a numpy array
|
36 |
+
|
37 |
+
Raises:
|
38 |
+
RuntimeError: If the file cannot be read or processing fails
|
39 |
+
"""
|
40 |
+
try:
|
41 |
+
# Load audio file with torchaudio
|
42 |
+
waveform, original_sample_rate = torchaudio.load(path) # [channels, samples]
|
43 |
+
|
44 |
+
# Truncate to max_duration before resampling
|
45 |
+
if max_duration is not None:
|
46 |
+
max_samples = int(max_duration * original_sample_rate)
|
47 |
+
if waveform.size(1) > max_samples:
|
48 |
+
waveform = waveform[:, :max_samples]
|
49 |
+
|
50 |
+
# Downmix to desired channels
|
51 |
+
if waveform.size(0) > channels:
|
52 |
+
if channels == 1:
|
53 |
+
waveform = waveform.mean(dim=0, keepdim=True) # Mono: average channels
|
54 |
+
elif channels == 2:
|
55 |
+
waveform = waveform[:2, :] # Stereo: take first 2 channels
|
56 |
+
|
57 |
+
# Resample if needed
|
58 |
+
if original_sample_rate != target_sample_rate:
|
59 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
60 |
+
waveform = waveform.to(device)
|
61 |
+
resampler = torchaudio.transforms.Resample(
|
62 |
+
orig_freq=original_sample_rate,
|
63 |
+
new_freq=target_sample_rate,
|
64 |
+
resampling_method="sinc_interp_kaiser", # Fast and high-quality
|
65 |
+
).to(device)
|
66 |
+
waveform = resampler(waveform)
|
67 |
+
|
68 |
+
# Normalize to [-1, 1] if requested
|
69 |
+
if normalize:
|
70 |
+
max_val = waveform.abs().max()
|
71 |
+
if max_val > 0:
|
72 |
+
waveform = waveform / max_val
|
73 |
+
|
74 |
+
# Move back to CPU and convert to numpy
|
75 |
+
data = waveform.cpu().numpy()
|
76 |
+
|
77 |
+
# Ensure correct shape (remove extra dim if mono)
|
78 |
+
if channels == 1 and data.shape[0] == 1:
|
79 |
+
data = data[0, :]
|
80 |
+
|
81 |
+
return data
|
82 |
+
|
83 |
+
except Exception as e:
|
84 |
+
raise RuntimeError(f"Failed to read audio file {path}: {str(e)}")
|
85 |
+
|
86 |
+
|
87 |
+
def save_audio_file(
|
88 |
+
audio_array: np.ndarray, sample_rate: int, file_path: str, format="WAV"
|
89 |
+
):
|
90 |
+
"""
|
91 |
+
Save an audio array to a file.
|
92 |
+
|
93 |
+
Parameters:
|
94 |
+
- audio_array: numpy array or list containing the audio samples
|
95 |
+
- sample_rate: int, the sample rate of the audio (e.g., 44100 Hz)
|
96 |
+
- file_path: str, path where the file will be saved (e.g., 'output.wav')
|
97 |
+
- format: str, audio file format (e.g., 'WAV', 'FLAC', 'OGG'), default is 'WAV'
|
98 |
+
"""
|
99 |
+
try:
|
100 |
+
if not file_path.endswith(".wav"):
|
101 |
+
file_path += ".wav"
|
102 |
+
sf.write(file_path, audio_array, sample_rate, format=format)
|
103 |
+
except Exception as e:
|
104 |
+
print(f"Error saving audio file at {file_path}: {e}")
|
core/utils/huggingface.py
ADDED
@@ -0,0 +1,169 @@
1 |
+
import logging
|
2 |
+
import sys
|
3 |
+
from typing import Optional, Literal
|
4 |
+
import os
|
5 |
+
import shutil
|
6 |
+
from zipfile import ZipFile
|
7 |
+
from pathlib import Path
|
8 |
+
from huggingface_hub import hf_hub_download, upload_file
|
9 |
+
|
10 |
+
# Set up logging
|
11 |
+
logging.basicConfig(
|
12 |
+
level=logging.INFO,
|
13 |
+
format="%(asctime)s - %(levelname)s - %(message)s",
|
14 |
+
handlers=[logging.StreamHandler(sys.stdout)],
|
15 |
+
)
|
16 |
+
logger = logging.getLogger(__name__)
|
17 |
+
|
18 |
+
__all__ = ["download_dataset_from_hf", "upload_file_to_hf", "download_file_from_hf"]
|
19 |
+
|
20 |
+
|
21 |
+
def download_dataset_from_hf(
|
22 |
+
repo_id: str,
|
23 |
+
filename: str,
|
24 |
+
dest_path: str,
|
25 |
+
token: str = None,
|
26 |
+
local_dir: str = "./downloads",
|
27 |
+
remove_downloaded_file: bool = True,
|
28 |
+
) -> None:
|
29 |
+
"""
|
30 |
+
Download a file from Hugging Face repository and unzip it to destination path
|
31 |
+
|
32 |
+
Args:
|
33 |
+
repo_id (str): Hugging Face repository ID (username/repo_name)
|
34 |
+
filename (str): Name of the file to download from the repository
|
35 |
+
dest_path (str): Destination path where contents will be unzipped
|
36 |
+
token (str, optional): Hugging Face token; if None, read from the HF_TOKEN environment variable
|
37 |
+
"""
|
38 |
+
# Ensure destination directory exists
|
39 |
+
os.makedirs(dest_path, exist_ok=True)
|
40 |
+
if token is None:
|
41 |
+
logger.info("reading HF_TOKEN variable from environment")
|
42 |
+
token = os.getenv("HF_TOKEN")
|
43 |
+
|
44 |
+
# Download the file
|
45 |
+
downloaded_file = hf_hub_download(
|
46 |
+
repo_id=repo_id,
|
47 |
+
filename=filename,
|
48 |
+
repo_type="dataset", # Specify dataset repository
|
49 |
+
local_dir=local_dir, # Temporary download location
|
50 |
+
token=token,
|
51 |
+
)
|
52 |
+
logger.info(f"Downloaded {filename} to {downloaded_file}")
|
53 |
+
|
54 |
+
# Check if it's a zip file
|
55 |
+
if filename.endswith(".zip"):
|
56 |
+
# Extract the zip file
|
57 |
+
with ZipFile(downloaded_file, "r") as zip_ref:
|
58 |
+
zip_ref.extractall(dest_path)
|
59 |
+
logger.info(f"Unzipped contents to {dest_path}")
|
60 |
+
|
61 |
+
# Clean up the downloaded zip file
|
62 |
+
if remove_downloaded_file:
|
63 |
+
os.remove(downloaded_file)
|
64 |
+
logger.info(f"Cleaned up temporary file: {downloaded_file}")
|
65 |
+
else:
|
66 |
+
# If not a zip, just move the file
|
67 |
+
final_path = os.path.join(dest_path, filename)
|
68 |
+
shutil.move(downloaded_file, final_path)
|
69 |
+
logger.info(f"Moved {filename} to {final_path}")
|
70 |
+
|
71 |
+
|
72 |
+
def download_file_from_hf(
|
73 |
+
repo_id: str,
|
74 |
+
repo_type: Literal["model", "dataset"],
|
75 |
+
filename: str,
|
76 |
+
dest_path: str,
|
77 |
+
token: str = None,
|
78 |
+
) -> None:
|
79 |
+
"""
|
80 |
+
Download a file from Hugging Face repository and unzip it to destination path
|
81 |
+
|
82 |
+
Args:
|
83 |
+
repo_id (str): Hugging Face repository ID (username/repo_name)
|
84 |
+
repo_type: model for model repo, dataset for dataset repo
|
85 |
+
filename (str): Name of the file to download from the repository
|
86 |
+
dest_path (str): Destination path where contents will be unzipped
|
87 |
+
token (str, optional): Hugging Face token; if None, read from the HF_TOKEN environment variable
|
88 |
+
|
89 |
+
"""
|
90 |
+
# Ensure destination directory exists
|
91 |
+
os.makedirs(dest_path, exist_ok=True)
|
92 |
+
if token is None:
|
93 |
+
logger.info("reading HF_TOKEN variable from environment")
|
94 |
+
token = os.getenv("HF_TOKEN")
|
95 |
+
|
96 |
+
# Download the file
|
97 |
+
downloaded_file = hf_hub_download(
|
98 |
+
repo_id=repo_id,
|
99 |
+
filename=filename,
|
100 |
+
repo_type=repo_type,
|
101 |
+
local_dir="./downloads", # Temporary download location
|
102 |
+
token=token,
|
103 |
+
)
|
104 |
+
logger.info(f"Downloaded {filename} to {downloaded_file}")
|
105 |
+
|
106 |
+
# Check if it's a zip file
|
107 |
+
if filename.endswith(".zip"):
|
108 |
+
# Extract the zip file
|
109 |
+
with ZipFile(downloaded_file, "r") as zip_ref:
|
110 |
+
zip_ref.extractall(dest_path)
|
111 |
+
logger.info(f"Unzipped contents to {dest_path}")
|
112 |
+
|
113 |
+
# Clean up the downloaded zip file
|
114 |
+
os.remove(downloaded_file)
|
115 |
+
logger.info(f"Cleaned up temporary file: {downloaded_file}")
|
116 |
+
else:
|
117 |
+
# If not a zip, just move the file
|
118 |
+
final_path = os.path.join(dest_path, filename)
|
119 |
+
shutil.move(downloaded_file, final_path)
|
120 |
+
logger.info(f"Moved {filename} to {final_path}")
|
121 |
+
|
122 |
+
|
123 |
+
def upload_file_to_hf(
|
124 |
+
local_file_path: str,
|
125 |
+
repo_id: str,
|
126 |
+
repo_type: Literal["model", "dataset"],
|
127 |
+
token: Optional[str] = None,
|
128 |
+
path_in_repo: Optional[str] = None,
|
129 |
+
commit_message: str = "Upload file",
|
130 |
+
) -> None:
|
131 |
+
"""
|
132 |
+
Upload a file to Hugging Face hub.
|
133 |
+
|
134 |
+
Args:
|
135 |
+
local_file_path (str): Path to the local .pt checkpoint file
|
136 |
+
repo_id (str): Repository ID in format "username/repo_name"
|
137 |
+
repo_type (str, optional): Type of repository, either "model" or "dataset"
|
138 |
+
token (str): Hugging Face authentication token. Read from the HF_TOKEN environment variable if not provided
|
139 |
+
path_in_repo (str, optional): Destination path in the repository.
|
140 |
+
Defaults to the filename from local_checkpoint_path
|
141 |
+
commit_message (str, optional): Commit message for the upload
|
142 |
+
|
143 |
+
Raises:
|
144 |
+
FileNotFoundError: If the checkpoint file doesn't exist
|
145 |
+
ValueError: If the repository ID is invalid
|
146 |
+
"""
|
147 |
+
# Validate file exists
|
148 |
+
if not os.path.isfile(local_file_path):
|
149 |
+
raise FileNotFoundError(f"File not found: {local_file_path}")
|
150 |
+
|
151 |
+
# Use filename as default path_in_repo if not specified
|
152 |
+
if path_in_repo is None:
|
153 |
+
path_in_repo = Path(local_file_path).name
|
154 |
+
|
155 |
+
if token is None:
|
156 |
+
logger.info("reading HF_TOKEN variable from environment")
|
157 |
+
token = os.getenv("HF_TOKEN")
|
158 |
+
if token is None:
|
159 |
+
raise RuntimeError("not found HF_TOKEN variable from environment")
|
160 |
+
|
161 |
+
upload_file(
|
162 |
+
path_or_fileobj=local_file_path,
|
163 |
+
path_in_repo=path_in_repo,
|
164 |
+
repo_id=repo_id,
|
165 |
+
repo_type=repo_type,
|
166 |
+
token=token,
|
167 |
+
commit_message=commit_message,
|
168 |
+
)
|
169 |
+
logger.info(f"Successfully uploaded {local_file_path} to {repo_id}/{path_in_repo}")
|
core/utils/read_write_files.py
ADDED
@@ -0,0 +1,46 @@
1 |
+
import os
|
2 |
+
import zipfile
|
3 |
+
|
4 |
+
|
5 |
+
def zip_folder(folder_path: str, output_path: str) -> bool:
|
6 |
+
"""
|
7 |
+
Zip a folder and its contents to a zip file.
|
8 |
+
|
9 |
+
Args:
|
10 |
+
folder_path (str): Path to the folder to be zipped
|
11 |
+
output_path (str): Path where the zip file will be created
|
12 |
+
|
13 |
+
Returns:
|
14 |
+
bool: True if successful, False otherwise
|
15 |
+
"""
|
16 |
+
try:
|
17 |
+
# Ensure the folder exists
|
18 |
+
if not os.path.isdir(folder_path):
|
19 |
+
print(f"Error: {folder_path} is not a valid directory")
|
20 |
+
return False
|
21 |
+
|
22 |
+
# Get the absolute path of the folder
|
23 |
+
abs_folder_path = os.path.abspath(folder_path)
|
24 |
+
|
25 |
+
# Create a ZipFile object in write mode
|
26 |
+
with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zipf:
|
27 |
+
# Walk through the folder
|
28 |
+
for root, dirs, files in os.walk(abs_folder_path):
|
29 |
+
for file in files:
|
30 |
+
# Get the absolute path of the file
|
31 |
+
abs_file_path = os.path.join(root, file)
|
32 |
+
|
33 |
+
# Calculate relative path for the file inside the zip
|
34 |
+
rel_path = os.path.relpath(
|
35 |
+
abs_file_path, os.path.dirname(abs_folder_path)
|
36 |
+
)
|
37 |
+
|
38 |
+
# Add file to zip
|
39 |
+
zipf.write(abs_file_path, rel_path)
|
40 |
+
|
41 |
+
print(f"Successfully created zip file at {output_path}")
|
42 |
+
return True
|
43 |
+
|
44 |
+
except Exception as e:
|
45 |
+
print(f"Error creating zip file: {e}")
|
46 |
+
return False
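Usage is a single call; the archive keeps the folder itself as the top-level entry (paths are illustrative):

if zip_folder("./dataset", "./dataset.zip"):
    print("archive ready for upload")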
|
core/utils/text.py
ADDED
@@ -0,0 +1,13 @@
1 |
+
def normalize_whitespace(text: str) -> str:
|
2 |
+
"""
|
3 |
+
Normalize whitespace in text by:
|
4 |
+
1. Removing leading and trailing whitespace
|
5 |
+
2. Replacing any sequence of whitespace characters with a single space
|
6 |
+
|
7 |
+
Args:
|
8 |
+
text: Input string to normalize
|
9 |
+
|
10 |
+
Returns:
|
11 |
+
String with normalized whitespace
|
12 |
+
"""
|
13 |
+
return ' '.join(text.split())
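For example:

assert normalize_whitespace("  hello \t world\n") == "hello world"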
|
event_handlers.py
ADDED
@@ -0,0 +1,436 @@
1 |
+
from typing import List, Tuple, Optional, Dict, Any
|
2 |
+
import traceback
|
3 |
+
import torch
|
4 |
+
import gradio as gr
|
5 |
+
import numpy as np
|
6 |
+
import time
|
7 |
+
import os
|
8 |
+
import re
|
9 |
+
import wave
|
10 |
+
import contextlib
|
11 |
+
import logging
|
12 |
+
import pandas as pd
|
13 |
+
import gc
|
14 |
+
|
15 |
+
import nltk
|
16 |
+
|
17 |
+
nltk.download("punkt")
|
18 |
+
from nltk.tokenize import sent_tokenize
|
19 |
+
|
20 |
+
from core.data_model import AudioFile
|
21 |
+
from core.bark.voice_clone import create_bark_prompt
|
22 |
+
from core.bark.generate_audio import generate_audio
|
23 |
+
from core.data_model import BarkPrompt, BarkGenerationConfig
|
24 |
+
from core.utils.audio import save_audio_file
|
25 |
+
from config import *
|
26 |
+
|
27 |
+
# Set up logging
|
28 |
+
logging.basicConfig(
|
29 |
+
level=logging.INFO,
|
30 |
+
format="%(asctime)s - %(levelname)s - %(message)s",
|
31 |
+
)
|
32 |
+
logger = logging.getLogger(__name__)
|
33 |
+
|
34 |
+
|
35 |
+
# return list of available devices and the best device to be used as default for all inference
|
36 |
+
def get_available_torch_devices() -> Tuple[List[str], str]:
|
37 |
+
devices = ["cpu"]
|
38 |
+
best_device = "cpu"
|
39 |
+
if torch.backends.mps.is_available():
|
40 |
+
devices.append("mps")
|
41 |
+
best_device = "mps"
|
42 |
+
if torch.cuda.is_available():
|
43 |
+
devices.append("cuda")
|
44 |
+
best_device = "cuda"
|
45 |
+
|
46 |
+
return devices, best_device
|
47 |
+
|
48 |
+
|
49 |
+
# --- Helper Functions ---
|
50 |
+
# (Keep get_wav_duration, load_existing_audio, get_safe_filename,
|
51 |
+
# generate_sine_wave, save_audio, parse_text_prompts, get_available_prompts,
|
52 |
+
# create_audio_prompt as they are, they are mostly backend logic)
|
53 |
+
def get_wav_duration(filepath):
|
54 |
+
"""Gets the duration of a WAV file in seconds."""
|
55 |
+
try:
|
56 |
+
with contextlib.closing(wave.open(filepath, "r")) as f:
|
57 |
+
frames = f.getnframes()
|
58 |
+
rate = f.getframerate()
|
59 |
+
if rate > 0:
|
60 |
+
duration = frames / float(rate)
|
61 |
+
return duration
|
62 |
+
else:
|
63 |
+
logger.info(f"Warning: Framerate is 0 for {filepath}")
|
64 |
+
return 0
|
65 |
+
except wave.Error as e:
|
66 |
+
logger.info(f"Warning: Could not read wave file header for {filepath}: {e}")
|
67 |
+
return 0
|
68 |
+
except Exception as e:
|
69 |
+
logger.info(f"Warning: Could not get duration for {filepath}: {e}")
|
70 |
+
return 0
|
71 |
+
|
72 |
+
|
73 |
+
def load_existing_audio() -> List[Dict[str, Any]]:
|
74 |
+
"""Scans the audio directory and loads metadata for existing WAV files."""
|
75 |
+
logger.info("\n--- Loading Existing Audio Files ---")
|
76 |
+
existing_files_metadata = []
|
77 |
+
if not os.path.isdir(GENERATED_AUDIO_DIR):
|
78 |
+
logger.info(f"Directory not found: {GENERATED_AUDIO_DIR}")
|
79 |
+
return []
|
80 |
+
|
81 |
+
try:
|
82 |
+
for filename in os.listdir(GENERATED_AUDIO_DIR):
|
83 |
+
if filename.lower().endswith(".wav"):
|
84 |
+
filepath = os.path.join(GENERATED_AUDIO_DIR, filename)
|
85 |
+
if not os.path.isfile(filepath):
|
86 |
+
continue
|
87 |
+
|
88 |
+
match = re.match(r"^(.*)_(\d{13})\.wav$", filename)
|
89 |
+
text_guess = "Unknown (from filename)"
|
90 |
+
timestamp_ms = 0
|
91 |
+
if match:
|
92 |
+
text_guess = match.group(1).replace("_", " ")
|
93 |
+
try:
|
94 |
+
timestamp_ms = int(match.group(2))
|
95 |
+
except ValueError:
|
96 |
+
timestamp_ms = 0
|
97 |
+
else:
|
98 |
+
text_guess = os.path.splitext(filename)[0].replace("_", " ")
|
99 |
+
|
100 |
+
timestamp_sec = (
|
101 |
+
timestamp_ms / 1000.0
|
102 |
+
if timestamp_ms > 0
|
103 |
+
else os.path.getmtime(filepath)
|
104 |
+
)
|
105 |
+
duration = get_wav_duration(filepath)
|
106 |
+
|
107 |
+
metadata = {
|
108 |
+
"text": text_guess,
|
109 |
+
"path": filepath,
|
110 |
+
"duration": duration,
|
111 |
+
"timestamp": timestamp_sec,
|
112 |
+
}
|
113 |
+
existing_files_metadata.append(metadata)
|
114 |
+
|
115 |
+
except Exception as e:
|
116 |
+
logger.error(f"Error loading existing audio files: {e}")
|
117 |
+
|
118 |
+
existing_files_metadata.sort(key=lambda x: x.get("timestamp", 0))
|
119 |
+
logger.info(
|
120 |
+
f"--- Finished Loading {len(existing_files_metadata)} Existing Files ---"
|
121 |
+
)
|
122 |
+
return existing_files_metadata
|
123 |
+
|
124 |
+
|
125 |
+
def get_safe_filename(base_name: str, extension: str, directory: str) -> str:
|
126 |
+
"""Creates a safe and unique filename in the target directory."""
|
127 |
+
safe_base = "".join(
|
128 |
+
c if c.isalnum() or c in ["_", "-"] else "_" for c in base_name[:50]
|
129 |
+
)
|
130 |
+
timestamp = int(time.time() * 1000)
|
131 |
+
filename = f"{safe_base}_{timestamp}.{extension}"
|
132 |
+
filepath = os.path.join(directory, filename)
|
133 |
+
counter = 1
|
134 |
+
while os.path.exists(filepath):
|
135 |
+
filename = f"{safe_base}_{timestamp}_{counter}.{extension}"
|
136 |
+
filepath = os.path.join(directory, filename)
|
137 |
+
counter += 1
|
138 |
+
return filepath
|
139 |
+
|
140 |
+
|
141 |
+
def update_audio_list(
|
142 |
+
newly_generated_metadata: List[Dict[str, Any]],
|
143 |
+
current_audio_list: List[Dict[str, Any]],
|
144 |
+
) -> List[Dict[str, Any]]:
|
145 |
+
"""Appends new metadata to the list and sorts it by timestamp."""
|
146 |
+
logger.info(f"\n--- Updating Audio List State ---")
|
147 |
+
if not isinstance(current_audio_list, list):
|
148 |
+
logger.info("Current audio list was not a list, initializing.")
|
149 |
+
current_audio_list = []
|
150 |
+
if not isinstance(newly_generated_metadata, list):
|
151 |
+
logger.info("Newly generated metadata is not a list, skipping update.")
|
152 |
+
return current_audio_list
|
153 |
+
|
154 |
+
logger.info(f"Current list size: {len(current_audio_list)}")
|
155 |
+
logger.info(f"Adding {len(newly_generated_metadata)} new items.")
|
156 |
+
updated_list = current_audio_list + newly_generated_metadata
|
157 |
+
updated_list.sort(key=lambda x: x.get("timestamp", 0))
|
158 |
+
logger.info(f"Updated list state size: {len(updated_list)}")
|
159 |
+
logger.info("--- Finished Updating Audio List State ---")
|
160 |
+
return updated_list
|
161 |
+
|
162 |
+
|
163 |
+
def format_audio_list_for_dataframe(audio_list: List[Dict[str, Any]]) -> pd.DataFrame:
|
164 |
+
"""Converts the list of audio metadata dicts into a pandas DataFrame for display."""
|
165 |
+
logger.info("\n--- Formatting List for DataFrame ---")
|
166 |
+
if not audio_list:
|
167 |
+
logger.info("Audio list is empty, returning empty DataFrame.")
|
168 |
+
# Return empty DataFrame with correct columns
|
169 |
+
return pd.DataFrame(columns=["File", "Prompt", "Duration (s)"])
|
170 |
+
|
171 |
+
display_data = []
|
172 |
+
for item in audio_list:
|
173 |
+
filepath = item.get("path", "N/A")
|
174 |
+
filename = os.path.basename(filepath) if filepath != "N/A" else "N/A"
|
175 |
+
# Truncate long text prompts for display in the table
|
176 |
+
text_prompt = item.get("text", "N/A")
|
177 |
+
display_text = (
|
178 |
+
(text_prompt[:75] + "...") if len(text_prompt) > 75 else text_prompt
|
179 |
+
)
|
180 |
+
duration = item.get("duration", 0)
|
181 |
+
display_data.append(
|
182 |
+
{
|
183 |
+
"File": filename,
|
184 |
+
"Prompt": display_text,
|
185 |
+
"Duration (s)": f"{duration:.2f}" if duration else "N/A",
|
186 |
+
# Store the full path implicitly by list order, not shown in df
|
187 |
+
}
|
188 |
+
)
|
189 |
+
|
190 |
+
df = pd.DataFrame(display_data)
|
191 |
+
logger.info(f"Created DataFrame with {len(df)} rows.")
|
192 |
+
logger.info("--- Finished Formatting List for DataFrame ---")
|
193 |
+
return df
|
194 |
+
|
195 |
+
|
196 |
+
def handle_row_selection(
|
197 |
+
audio_list: List[Dict[str, Any]], evt: gr.SelectData
|
198 |
+
) -> Tuple[Optional[str], int]:
|
199 |
+
"""
|
200 |
+
Handles the selection event from the DataFrame.
|
201 |
+
Updates the audio player with the selected file's path.
|
202 |
+
Returns the filepath and the selected index.
|
203 |
+
"""
|
204 |
+
logger.info("\n--- Handling Row Selection ---")
|
205 |
+
selected_index = evt.index[0] if evt.index else None # Get row index
|
206 |
+
logger.info(f"DataFrame row selected. Event data: {evt}")
|
207 |
+
|
208 |
+
if selected_index is not None and 0 <= selected_index < len(audio_list):
|
209 |
+
selected_item = audio_list[selected_index]
|
210 |
+
filepath = selected_item.get("path")
|
211 |
+
logger.info(f"Selected item at index {selected_index}: {selected_item}")
|
212 |
+
if filepath and os.path.exists(filepath):
|
213 |
+
logger.info(f"Updating audio player with: {filepath}")
|
214 |
+
logger.info("--- Finished Handling Row Selection (Success) ---")
|
215 |
+
return filepath, selected_index
|
216 |
+
else:
|
217 |
+
logger.info(f"File not found for selected item: {filepath}")
|
218 |
+
gr.Warning(
|
219 |
+
f"File not found for selected row: {os.path.basename(filepath or 'N/A')}"
|
220 |
+
)
|
221 |
+
logger.info("--- Finished Handling Row Selection (File Not Found) ---")
|
222 |
+
return None, selected_index # Keep index, but clear player
|
223 |
+
else:
|
224 |
+
logger.info("Invalid selection index or empty list.")
|
225 |
+
logger.info("--- Finished Handling Row Selection (Invalid Index) ---")
|
226 |
+
return None, -1 # Clear player and indicate no valid selection
|
227 |
+
|
228 |
+
|
229 |
+
def handle_delete_selected(
|
230 |
+
selected_index: int, current_audio_list: List[Dict[str, Any]]
|
231 |
+
) -> Tuple[List[Dict[str, Any]], int, Optional[str]]:
|
232 |
+
"""
|
233 |
+
Deletes the audio file corresponding to the selected index.
|
234 |
+
Updates the main audio list state.
|
235 |
+
Clears the selection index and audio player.
|
236 |
+
"""
|
237 |
+
logger.info("\n--- Handling Delete Selected ---")
|
238 |
+
logger.info(f"Attempting deletion for selected index: {selected_index}")
|
239 |
+
|
240 |
+
if (
|
241 |
+
selected_index is None
|
242 |
+
or selected_index < 0
|
243 |
+
or selected_index >= len(current_audio_list)
|
244 |
+
):
|
245 |
+
gr.Warning("No valid audio selected for deletion.")
|
246 |
+
logger.info("No valid index provided.")
|
247 |
+
# Return current list, clear index, clear player
|
248 |
+
return current_audio_list, -1, None
|
249 |
+
|
250 |
+
item_to_delete = current_audio_list[selected_index]
|
251 |
+
filepath_to_delete = item_to_delete.get("path")
|
252 |
+
logger.info(f"Item to delete: {item_to_delete}")
|
253 |
+
|
254 |
+
# Create the new list excluding the item
|
255 |
+
# Corrected slicing logic: include elements before and after the index
|
256 |
+
new_audio_list = (
|
257 |
+
current_audio_list[:selected_index] + current_audio_list[selected_index + 1 :]
|
258 |
+
)
|
259 |
+
logger.info(f"New list size after filtering: {len(new_audio_list)}")
|
260 |
+
|
261 |
+
# Try to delete the file from disk
|
262 |
+
deletion_successful_on_disk = False
|
263 |
+
try:
|
264 |
+
if filepath_to_delete and os.path.exists(filepath_to_delete):
|
265 |
+
os.remove(filepath_to_delete)
|
266 |
+
logger.info(f"Successfully deleted file: {filepath_to_delete}")
|
267 |
+
gr.Info(f"Deleted {os.path.basename(filepath_to_delete)}")
|
268 |
+
deletion_successful_on_disk = True
|
269 |
+
elif filepath_to_delete:
|
270 |
+
logger.info(f"File not found for deletion: {filepath_to_delete}")
|
271 |
+
gr.Warning("Audio entry removed from list, but file was not found on disk.")
|
272 |
+
deletion_successful_on_disk = True # Consider list update successful
|
273 |
+
else:
|
274 |
+
logger.info("Invalid filepath in selected item.")
|
275 |
+
gr.Warning("Could not delete: Invalid file path associated with selection.")
|
276 |
+
# Revert list change if filepath was invalid from the start? Or keep it removed?
|
277 |
+
# Let's keep it removed from the list for consistency.
|
278 |
+
deletion_successful_on_disk = True # Treat as success for list update
|
279 |
+
|
280 |
+
except OSError as e:
|
281 |
+
logger.info(f"Error deleting file {filepath_to_delete}: {e}")
|
282 |
+
logger.info(traceback.format_exc())
|
283 |
+
gr.Error(f"Error deleting file: {e}")
|
284 |
+
# If file deletion fails, we still return the updated list (item removed).
|
285 |
+
# If you want to revert the list change on OS error, return `current_audio_list` here.
|
286 |
+
|
287 |
+
logger.info("--- Finished Deleting Selected Item ---")
|
288 |
+
# Return the updated list, clear the selected index, clear the audio player
|
289 |
+
return new_audio_list, -1, None
|
290 |
+
|
291 |
+
|
292 |
+
def get_available_prompts() -> List[str]:
|
293 |
+
"""Loads available prompt file names."""
|
294 |
+
try:
|
295 |
+
prompts = [
|
296 |
+
f
|
297 |
+
for f in os.listdir(PROMPT_DIR)
|
298 |
+
if os.path.isfile(os.path.join(PROMPT_DIR, f))
|
299 |
+
and f.lower().endswith((".npz", ".npy", ".json"))
|
300 |
+
]
|
301 |
+
|
302 |
+
if len(prompts) == 0:
|
303 |
+
gr.Info("No prompts found.", duration=3)
|
304 |
+
|
305 |
+
return ["None"] + prompts
|
306 |
+
except Exception as e:
|
307 |
+
logger.info(f"Error loading prompts: {e}")
|
308 |
+
gr.Info(f"Error loading prompts {e}", duration=3, title="Error")
|
309 |
+
return ["None"]
|
310 |
+
|
311 |
+
|
312 |
+
def update_available_prompts() -> gr.update:
|
313 |
+
try:
|
314 |
+
prompts = [
|
315 |
+
f
|
316 |
+
for f in os.listdir(PROMPT_DIR)
|
317 |
+
if os.path.isfile(os.path.join(PROMPT_DIR, f))
|
318 |
+
and f.lower().endswith((".npz", ".npy", ".json"))
|
319 |
+
]
|
320 |
+
|
321 |
+
if len(prompts) == 0:
|
322 |
+
gr.Info("No prompts found.", duration=3)
|
323 |
+
|
324 |
+
return gr.update(choices=["None"] + prompts)
|
325 |
+
except Exception as e:
|
326 |
+
logger.info(f"Error loading prompts: {e}")
|
327 |
+
gr.Info(f"Error loading prompts {e}", duration=3, title="Error")
|
328 |
+
return gr.update()
|
329 |
+
|
330 |
+
|
331 |
+
def generate_batch_audio(
|
332 |
+
text: str,
|
333 |
+
semantic_temp: float,
|
334 |
+
coarse_temp: float,
|
335 |
+
fine_temp: float,
|
336 |
+
manual_seed: int,
|
337 |
+
model_type: str,
|
338 |
+
inference_device: str,
|
339 |
+
selected_prompt_name: Optional[str],
|
340 |
+
) -> List[Dict[str, Any]]:
|
341 |
+
"""
|
342 |
+
Generates audio (sine wave) for each line of text input.
|
343 |
+
Returns metadata for generated files.
|
344 |
+
"""
|
345 |
+
gc.collect()
|
346 |
+
|
347 |
+
torch.manual_seed(manual_seed)
|
348 |
+
if not text:
|
349 |
+
gr.Warning("No valid text prompts provided.")
|
350 |
+
return []
|
351 |
+
|
352 |
+
generated_metadata = []
|
353 |
+
|
354 |
+
bark_prompt = None
|
355 |
+
if selected_prompt_name != "None":
|
356 |
+
gr.Info("Loading audio prompt...")
|
357 |
+
prompt_path = os.path.join(PROMPT_DIR, selected_prompt_name)
|
358 |
+
bark_prompt = BarkPrompt.load_prompt(
|
359 |
+
prompt_path, torch.device(inference_device)
|
360 |
+
)
|
361 |
+
|
362 |
+
generation_config = BarkGenerationConfig(
|
363 |
+
temperature=semantic_temp,
|
364 |
+
generate_coarse_temperature=coarse_temp,
|
365 |
+
generate_fine_temperature=fine_temp,
|
366 |
+
use_small_model=True if model_type == "small" else False,
|
367 |
+
)
|
368 |
+
|
369 |
+
# split the text into sentences
|
370 |
+
sentences = sent_tokenize(text)
|
371 |
+
|
372 |
+
gr.Info("Generating Audio....", duration=120)
|
373 |
+
waves = generate_audio(
|
374 |
+
texts=sentences,
|
375 |
+
prompt=bark_prompt,
|
376 |
+
generation_config=generation_config,
|
377 |
+
silent=True,
|
378 |
+
)
|
379 |
+
audio = np.concatenate(waves, axis=-1)
|
380 |
+
|
381 |
+
output_filepath = get_safe_filename(text, "wav", GENERATED_AUDIO_DIR)
|
382 |
+
save_audio_file(audio, DEFAULT_AUDIO_SAMPLE_RATE, output_filepath)
|
383 |
+
duration_sec = audio.shape[0] / DEFAULT_AUDIO_SAMPLE_RATE
|
384 |
+
metadata = {
|
385 |
+
"text": text,
|
386 |
+
"path": output_filepath,
|
387 |
+
"duration": duration_sec,
|
388 |
+
"timestamp": time.time(),
|
389 |
+
}
|
390 |
+
generated_metadata.append(metadata)
|
391 |
+
gr.Info("Done!", duration=5)
|
392 |
+
return generated_metadata
|
393 |
+
|
394 |
+
|
395 |
+
def create_audio_prompt(
|
396 |
+
uploaded_audio_file: Optional[str],
|
397 |
+
device: str,
|
398 |
+
progress: gr.Progress = gr.Progress(),
|
399 |
+
) -> gr.update:
|
400 |
+
"""Processes an uploaded audio file to create a voice prompt file (stub)."""
|
401 |
+
logger.info("\n--- Starting Prompt Creation ---")
|
402 |
+
if uploaded_audio_file is None or len(uploaded_audio_file) == 0:
|
403 |
+
gr.Warning("No audio file uploaded!")
|
404 |
+
return gr.update()
|
405 |
+
|
406 |
+
logger.info(f"Processing uploaded file: {uploaded_audio_file}")
|
407 |
+
|
408 |
+
try:
|
409 |
+
progress(0, desc="Starting prompt creation...")
|
410 |
+
new_prompt_filename = None
|
411 |
+
progress(0.2, desc="Extracting prompt features...")
|
412 |
+
audio_file = AudioFile(audio_file_path=uploaded_audio_file, max_duration=10)
|
413 |
+
prompt = create_bark_prompt(
|
414 |
+
audio_file=audio_file, temperature=1, eos_p=0.2, device=torch.device(device)
|
415 |
+
)
|
416 |
+
|
417 |
+
progress(0.8, desc="Saving prompt file...")
|
418 |
+
original_basename = os.path.splitext(os.path.basename(uploaded_audio_file))[0]
|
419 |
+
prompt_filepath = get_safe_filename(original_basename, "json", PROMPT_DIR)
|
420 |
+
new_prompt_filename = os.path.basename(prompt_filepath)
|
421 |
+
|
422 |
+
ok = prompt.save_prompt(prompt_filepath)
|
423 |
+
if ok:
|
424 |
+
progress(1.0, desc="Prompt creation complete.")
|
425 |
+
|
426 |
+
else:
|
427 |
+
progress(1.0, desc="Error when saving prompt")
|
428 |
+
|
429 |
+
new_choices = get_available_prompts()
|
430 |
+
|
431 |
+
return gr.update(choices=new_choices, value=new_prompt_filename)
|
432 |
+
|
433 |
+
except Exception as e:
|
434 |
+
logger.info(f"Error creating prompt: {e}")
|
435 |
+
gr.Error(f"Prompt creation failed: {e}")
|
436 |
+
return f"Error creating prompt: {e}", gr.update()
|
generate_audio_semantic_dataset.py
ADDED
@@ -0,0 +1,155 @@
1 |
+
import argparse
|
2 |
+
import logging
|
3 |
+
import os
|
4 |
+
from typing import Optional
|
5 |
+
|
6 |
+
from core.bark.generate_audio_semantic_dataset import (
|
7 |
+
generate_wav_semantic_dataset,
|
8 |
+
BarkGenerationConfig,
|
9 |
+
)
|
10 |
+
from core.utils import upload_file_to_hf, zip_folder
|
11 |
+
|
12 |
+
|
13 |
+
logging.basicConfig(
|
14 |
+
level=logging.INFO,
|
15 |
+
format="%(asctime)s - %(levelname)s - %(message)s",
|
16 |
+
)
|
17 |
+
logger = logging.getLogger(__name__)
|
18 |
+
|
19 |
+
|
20 |
+
def parse_dataset_args(args_list=None):
|
21 |
+
"""Parse arguments specific to dataset creation."""
|
22 |
+
parser = argparse.ArgumentParser(description="Audio Semantic Dataset Creation")
|
23 |
+
|
24 |
+
parser.add_argument(
|
25 |
+
"--text-file",
|
26 |
+
type=str,
|
27 |
+
default="data/test_data.txt",
|
28 |
+
help="Path to text file for dataset generation",
|
29 |
+
)
|
30 |
+
parser.add_argument(
|
31 |
+
"--batch-size",
|
32 |
+
type=int,
|
33 |
+
default=2,
|
34 |
+
help="Batch size for processing (default: 1)",
|
35 |
+
)
|
36 |
+
|
37 |
+
parser.add_argument(
|
38 |
+
"--output-dir",
|
39 |
+
type=str,
|
40 |
+
default="./dataset",
|
41 |
+
help="Output directory for generated files (default: ./dataset)",
|
42 |
+
)
|
43 |
+
parser.add_argument(
|
44 |
+
"--max-tokens",
|
45 |
+
type=int,
|
46 |
+
default=256,
|
47 |
+
help="Maximum tokens per example (default: 256)",
|
48 |
+
)
|
49 |
+
parser.add_argument(
|
50 |
+
"--use-small-model",
|
51 |
+
action="store_true",
|
52 |
+
help="Use small model for generation",
|
53 |
+
)
|
54 |
+
parser.add_argument(
|
55 |
+
"--save-raw-audio",
|
56 |
+
action="store_true",
|
57 |
+
help="Store generated audio as .wav instead of .npz",
|
58 |
+
)
|
59 |
+
parser.add_argument(
|
60 |
+
"--publish-hf",
|
61 |
+
action="store_true",
|
62 |
+
help="Publish dataset to HuggingFace Hub",
|
63 |
+
)
|
64 |
+
parser.add_argument(
|
65 |
+
"--repo-id",
|
66 |
+
type=str,
|
67 |
+
help="HuggingFace repo ID to publish to",
|
68 |
+
)
|
69 |
+
parser.add_argument(
|
70 |
+
"--path-in-repo",
|
71 |
+
type=str,
|
72 |
+
help="Path in HF repo",
|
73 |
+
default=None,
|
74 |
+
)
|
75 |
+
parser.add_argument(
|
76 |
+
"--silent", action="store_true", help="Suppress progress output"
|
77 |
+
)
|
78 |
+
|
79 |
+
return parser.parse_args(args_list)
|
80 |
+
|
81 |
+
|
82 |
+
def create_audio_semantic_dataset(
|
83 |
+
text_file: str,
|
84 |
+
output_dir: str = "./dataset",
|
85 |
+
batch_size: int = 1,
|
86 |
+
max_tokens: int = 256,
|
87 |
+
use_small_model: bool = False,
|
88 |
+
save_raw_audio: bool = False,
|
89 |
+
publish_hf: bool = False,
|
90 |
+
repo_id: Optional[str] = None,
|
91 |
+
path_in_repo: Optional[str] = None,
|
92 |
+
silent: bool = False,
|
93 |
+
) -> None:
|
94 |
+
"""Create audio semantic dataset from text file.
|
95 |
+
|
96 |
+
Can be called directly with parameters or via command line using parse_dataset_args().
|
97 |
+
|
98 |
+
Args:
|
99 |
+
text_file: Path to input text file
|
100 |
+
output_dir: Directory to save generated dataset
|
101 |
+
batch_size: Batch size for processing
|
102 |
+
max_tokens: Maximum tokens per example
|
103 |
+
use_small_model: Whether to use small model
|
104 |
+
save_raw_audio: Save as raw audio (.wav) instead of .npz
|
105 |
+
publish_hf: Whether to publish to HuggingFace Hub
|
106 |
+
repo_id: HF repo ID to publish to
|
107 |
+
path_in_repo: Path in HF repo
|
108 |
+
silent: Suppress progress output
|
109 |
+
"""
|
110 |
+
os.makedirs(output_dir, exist_ok=True)
|
111 |
+
|
112 |
+
if not os.path.isfile(text_file):
|
113 |
+
raise FileNotFoundError(f"Text file not found: {text_file}")
|
114 |
+
|
115 |
+
logger.info(f"Starting dataset generation from {text_file}")
|
116 |
+
generation_config = BarkGenerationConfig(
|
117 |
+
temperature=None,
|
118 |
+
generate_coarse_temperature=None,
|
119 |
+
generate_fine_temperature=None,
|
120 |
+
use_small_model=use_small_model,
|
121 |
+
)
|
122 |
+
|
123 |
+
generate_wav_semantic_dataset(
|
124 |
+
text_file_path=text_file,
|
125 |
+
generation_config=generation_config,
|
126 |
+
batch_size=batch_size,
|
127 |
+
save_path=output_dir,
|
128 |
+
save_data_as_raw_audio=save_raw_audio,
|
129 |
+
silent=silent,
|
130 |
+
)
|
131 |
+
logger.info("Dataset generation completed")
|
132 |
+
|
133 |
+
if publish_hf and repo_id:
|
134 |
+
logger.info("Publishing dataset to huggingface hub")
|
135 |
+
zip_path = "./dataset.zip"
|
136 |
+
success = zip_folder(output_dir, zip_path)
|
137 |
+
if not success:
|
138 |
+
raise RuntimeError(f"Unable to zip folder {output_dir}")
|
139 |
+
upload_file_to_hf(zip_path, repo_id, "dataset", path_in_repo=path_in_repo)
|
140 |
+
|
141 |
+
|
142 |
+
if __name__ == "__main__":
|
143 |
+
args = parse_dataset_args()
|
144 |
+
create_audio_semantic_dataset(
|
145 |
+
text_file=args.text_file,
|
146 |
+
output_dir=args.output_dir,
|
147 |
+
batch_size=args.batch_size,
|
148 |
+
max_tokens=args.max_tokens,
|
149 |
+
use_small_model=args.use_small_model,
|
150 |
+
save_raw_audio=args.save_raw_audio,
|
151 |
+
publish_hf=args.publish_hf,
|
152 |
+
repo_id=args.repo_id,
|
153 |
+
path_in_repo=args.path_in_repo,
|
154 |
+
silent=args.silent,
|
155 |
+
)
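A typical programmatic invocation of the function above (the same options map one-to-one onto the CLI flags; paths and the repo ID are placeholders):

create_audio_semantic_dataset(
    text_file="data/test_data.txt",
    output_dir="./dataset",
    batch_size=2,
    use_small_model=True,
    save_raw_audio=False,
    publish_hf=False,        # set True and pass repo_id="username/my-dataset" to upload
)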
|
prompts/de_speaker_0.npz
ADDED
Binary file (39.6 kB). View file
|
|
prompts/de_speaker_1.npz
ADDED
Binary file (27.5 kB). View file
|
|
prompts/de_speaker_2.npz
ADDED
Binary file (24.7 kB). View file
|
|
prompts/de_speaker_3.npz
ADDED
Binary file (31.3 kB). View file
|
|
prompts/de_speaker_4.npz
ADDED
Binary file (30.7 kB). View file
|
|
prompts/de_speaker_5.npz
ADDED
Binary file (31.3 kB). View file
|
|
prompts/de_speaker_6.npz
ADDED
Binary file (23.2 kB). View file
|
|
prompts/de_speaker_7.npz
ADDED
Binary file (40.1 kB). View file
|
|
prompts/de_speaker_8.npz
ADDED
Binary file (28.5 kB). View file
|
|
prompts/de_speaker_9.npz
ADDED
Binary file (51.1 kB). View file
|
|
prompts/en_speaker_0.npz
ADDED
Binary file (28.1 kB). View file
|
|
prompts/en_speaker_1.npz
ADDED
Binary file (25.2 kB). View file
|
|
prompts/en_speaker_2.npz
ADDED
Binary file (26.2 kB). View file
|
|
prompts/en_speaker_3.npz
ADDED
Binary file (35 kB). View file
|
|
prompts/en_speaker_4.npz
ADDED
Binary file (23.8 kB). View file
|
|
prompts/en_speaker_5.npz
ADDED
Binary file (24.7 kB). View file
|
|