# InteractiveOmni-4B / configuration_flow.py
# tongww's picture
# upload initial model
# 3b4af99 verified
# --------------------------------------------------------
# SenseTime
# Copyright (c) 2025 SenseTime
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import copy
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class FlowConfig(PretrainedConfig):
    """Configuration container for the flow component.

    The constructor stores every argument verbatim as an instance attribute;
    this class performs no validation and attaches no meaning to the values.
    Their interpretation is left to the model code that consumes the config
    (not visible in this file).

    Args:
        input_size: Stored as ``self.input_size``. Defaults to ``512``.
        output_size: Stored as ``self.output_size``. Defaults to ``80``.
        spk_embed_dim: Stored as ``self.spk_embed_dim``. Defaults to ``192``.
        output_type: Stored as ``self.output_type``. Defaults to ``'mel'``.
        vocab_size: Stored as ``self.vocab_size``. Defaults to ``6561``.
        input_frame_rate: Stored as ``self.input_frame_rate``. Defaults to
            ``25``.
        only_mask_loss: Stored as ``self.only_mask_loss``. Defaults to
            ``True``.
        token_mel_ratio: Stored as ``self.token_mel_ratio``. Defaults to
            ``2``.
        pre_lookahead_len: Stored as ``self.pre_lookahead_len``. Defaults to
            ``3``.
        encoder_config: Dictionary of encoder settings. When ``None`` (the
            default) a fresh copy of the built-in encoder defaults is used.
            A caller-supplied dictionary is stored by reference.
        decoder_config: Dictionary of decoder settings, including the nested
            ``cfm_params`` and ``estimator_config`` dictionaries. When
            ``None`` (the default) a fresh copy of the built-in decoder
            defaults is used. A caller-supplied dictionary is stored by
            reference.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    def __init__(
        self,
        input_size=512,
        output_size=80,
        spk_embed_dim=192,
        output_type='mel',
        vocab_size=6561,
        input_frame_rate=25,
        only_mask_loss=True,
        token_mel_ratio=2,
        pre_lookahead_len=3,
        encoder_config=None,
        decoder_config=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # The defaults are rebuilt on every call instead of living in the
        # signature: a dict literal used as a default argument is created
        # once and would be shared (and mutated) across all instances.
        if encoder_config is None:
            encoder_config = self._default_encoder_config()
        if decoder_config is None:
            decoder_config = self._default_decoder_config()

        self.encoder_config = encoder_config
        self.decoder_config = decoder_config
        self.input_size = input_size
        self.output_size = output_size
        self.spk_embed_dim = spk_embed_dim
        self.output_type = output_type
        self.vocab_size = vocab_size
        self.input_frame_rate = input_frame_rate
        self.only_mask_loss = only_mask_loss
        self.token_mel_ratio = token_mel_ratio
        self.pre_lookahead_len = pre_lookahead_len

    @staticmethod
    def _default_encoder_config():
        """Return a newly built dictionary holding the default encoder settings."""
        return {
            'output_size': 512,
            'attention_heads': 8,
            'linear_units': 2048,
            'num_blocks': 6,
            'dropout_rate': 0.1,
            'positional_dropout_rate': 0.1,
            'attention_dropout_rate': 0.1,
            'normalize_before': True,
            'input_layer': 'linear',
            'pos_enc_layer_type': 'rel_pos_espnet',
            'selfattention_layer_type': 'rel_selfattn',
            'input_size': 512,
            'use_cnn_module': False,
            'macaron_style': False,
        }

    @staticmethod
    def _default_decoder_config():
        """Return a newly built dictionary holding the default decoder settings."""
        return {
            'in_channels': 240,
            'n_spks': 1,
            'spk_emb_dim': 80,
            'cfm_params': {
                'sigma_min': 1e-06,
                'solver': 'euler',
                't_scheduler': 'cosine',
                'training_cfg_rate': 0.2,
                'inference_cfg_rate': 0.7,
                'reg_loss_type': 'l1',
            },
            'estimator_config': {
                'in_channels': 320,
                'out_channels': 80,
                'causal': True,
                'channels': [256],
                'dropout': 0.0,
                'attention_head_dim': 64,
                'n_blocks': 4,
                'num_mid_blocks': 12,
                'num_heads': 8,
                'act_fn': 'gelu',
            },
        }

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
            The result is a deep copy: modifying it (including the nested `encoder_config` /
            `decoder_config` dictionaries) does not affect this configuration.
        """
        output = copy.deepcopy(self.__dict__)

        # Make sure every constructor field is present, and copy each value
        # so the nested dictionaries are never handed out by reference.
        field_names = (
            'encoder_config',
            'decoder_config',
            'input_size',
            'output_size',
            'spk_embed_dim',
            'output_type',
            'vocab_size',
            'input_frame_rate',
            'only_mask_loss',
            'token_mel_ratio',
            'pre_lookahead_len',
        )
        for field_name in field_names:
            output[field_name] = copy.deepcopy(getattr(self, field_name))
        return output