---
# Audio codec configuration: sample rates, feature extraction, encoder /
# adapter / quantizer / decoder hyper-parameters, and the vocoder settings.
#
# NOTE(review): this file was recovered from text that had lost all line
# breaks and indentation. The nesting below (every *_kwargs mapping placed
# under generator_params) is reconstructed from the key names and should be
# confirmed against the code that loads this file.

# Sample rates in Hz, anchored so the sections below can alias them.
input_sample_rate: &input_sample_rate 16000
output_sample_rate: &output_sample_rate 24000

generator_params:
  input_sample_rate: *input_sample_rate
  output_sample_rate: *output_sample_rate

  # Feature extraction settings.
  # NOTE(review): the key names resemble the arguments of a Whisper-style
  # feature extractor; verify against the consumer.
  feature_extractor_kwargs:
    chunk_length: 30
    feature_size: 80
    hop_length: 160
    n_fft: 400
    n_samples: 480000
    nb_max_frames: 3000
    padding_side: right
    padding_value: 0.0
    return_attention_mask: false
    sampling_rate: *input_sample_rate

  ## Codec Args

  ## semantic channel
  semantic_encoder_kwargs:  # 100hz -> 50hz
    num_mel_bins: 80
    sampling_rate: *input_sample_rate
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    encoder_layers: 12
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072
    activation_function: "gelu"

  semantic_encoder_adapter_kwargs:  # 50hz
    input_dim: 768
    output_dim: 768
    d_model: 768
    max_source_positions: 1500
    encoder_layers: 4
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072

  ## acoustic channel
  acoustic_encoder_kwargs:  # 100hz -> 50hz
    num_mel_bins: 80
    sampling_rate: *input_sample_rate
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    encoder_layers: 12
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072
    activation_function: "gelu"

  ## semantic & acoustic shared parameters
  # NOTE(review): input_dim 1536 equals 2 x 768, presumably the semantic and
  # acoustic features joined together; confirm against the model code.
  pre_rvq_adapter_kwargs:  # 50hz
    input_dim: 1536
    output_dim: 768
    d_model: 768
    max_source_positions: 1500
    encoder_layers: 4
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072

  downsample_kwargs:  # 50hz -> 12.5hz
    d_model: 768
    avg_pooler: 4

  # NOTE(review): input_dim / output_dim 3072 equals 4 x 768, which matches
  # avg_pooler: 4 above; confirm how the downsampler produces this width.
  quantizer_kwargs:  # 12.5hz
    input_dim: 3072
    rvq_dim: 512
    output_dim: 3072
    num_quantizers: 8
    codebook_size: 1024
    codebook_dim: 512
    quantizer_dropout: 0.0
    commitment: 1

  post_rvq_adapter_kwargs:  # 12.5hz
    input_dim: 3072
    output_dim: 3072
    d_model: 768
    max_source_positions: 375
    encoder_layers: 4
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072

  upsample_kwargs:  # 12.5hz -> 50hz
    d_model: 768
    stride: 4

  ## acoustic channel
  acoustic_decoder_kwargs:  # 50hz -> 100hz
    num_mel_bins: 80
    sampling_rate: *input_sample_rate
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    decoder_layers: 12
    decoder_attention_heads: 12
    decoder_ffn_dim: 3072
    activation_function: "gelu"

  vocos_kwargs:  # 100hz -> 24khz
    input_channels: 80
    dim: 512
    intermediate_dim: 4096
    num_layers: 30
    n_fft: 960
    hop_size: 240
    padding: "same"