yongjielv committed on
Commit a1075ac · verified · 1 Parent(s): b32eaba

Upload folder using huggingface_hub

Files changed (2)
  1. config.json +148 -0
  2. model.safetensors +3 -0
config.json ADDED
@@ -0,0 +1,148 @@
+ {
+   "architectures": [
+     "AudioVAE"
+   ],
+   "dec_kwargs": {
+     "backbone": {
+       "_attn_implementation": "flash_attention_2",
+       "attention_dropout": 0.0,
+       "attn_implementation": null,
+       "bos_token_id": 151643,
+       "eos_token_id": 151645,
+       "hidden_act": "silu",
+       "hidden_size": 896,
+       "initializer_range": 0.02,
+       "intermediate_size": 4864,
+       "is_causal": true,
+       "max_position_embeddings": 32768,
+       "max_window_layers": 0,
+       "model_type": "qwen2",
+       "num_attention_heads": 14,
+       "num_hidden_layers": 24,
+       "num_key_value_heads": 2,
+       "rms_norm_eps": 1e-06,
+       "rope_theta": 1000000.0,
+       "sliding_window": 32,
+       "tie_word_embeddings": true,
+       "torch_dtype": "bfloat16",
+       "transformers_version": "4.43.1",
+       "use_cache": false,
+       "use_sliding_window": true,
+       "vocab_size": 1
+     },
+     "latent_dim": 64,
+     "output_dim": 320
+   },
+   "enc_kwargs": {
+     "backbone": {
+       "_attn_implementation": "flash_attention_2",
+       "attention_dropout": 0.0,
+       "attn_implementation": null,
+       "bos_token_id": 151643,
+       "eos_token_id": 151645,
+       "hidden_act": "silu",
+       "hidden_size": 896,
+       "initializer_range": 0.02,
+       "intermediate_size": 4864,
+       "is_causal": true,
+       "max_position_embeddings": 32768,
+       "max_window_layers": 0,
+       "model_type": "qwen2",
+       "num_attention_heads": 14,
+       "num_hidden_layers": 24,
+       "num_key_value_heads": 2,
+       "rms_norm_eps": 1e-06,
+       "rope_theta": 1000000.0,
+       "sliding_window": 32,
+       "tie_word_embeddings": true,
+       "torch_dtype": "bfloat16",
+       "transformers_version": "4.43.1",
+       "use_cache": false,
+       "use_sliding_window": true,
+       "vocab_size": 1
+     },
+     "hop_size": 320,
+     "input_dim": 320,
+     "latent_dim": 64
+   },
+   "hifi_gan_disc_kwargs": {
+     "channel_increasing_factor": 4,
+     "channels": 16,
+     "max_downsample_channels": 512,
+     "periods": [
+       2,
+       3,
+       5,
+       7,
+       11
+     ]
+   },
+   "init_method": "kaiming",
+   "lambda_adv": 1.0,
+   "lambda_disc": 1.0,
+   "lambda_feat_match_loss": 1.0,
+   "lambda_mel_loss": 1.0,
+   "lambda_semantic": 2.0,
+   "patch_size": -1,
+   "semantic_module_kwargs": {
+     "causal": true,
+     "whisper_encoder": {
+       "n_ctx": 1500,
+       "n_head": 20,
+       "n_layer": 32,
+       "n_mels": 128,
+       "n_state": 1280
+     }
+   },
+   "spec_disc_kwargs": {
+     "channels": 32,
+     "downsample_scales": [
+       2,
+       2,
+       2
+     ],
+     "in_channels": 1,
+     "kernel_sizes": [
+       5,
+       3
+     ],
+     "max_downsample_channels": 512,
+     "out_channels": 1,
+     "stft_params": {
+       "fft_sizes": [
+         78,
+         126,
+         206,
+         334,
+         542,
+         876,
+         1418,
+         2296
+       ],
+       "hop_sizes": [
+         39,
+         63,
+         103,
+         167,
+         271,
+         438,
+         709,
+         1148
+       ],
+       "win_lengths": [
+         78,
+         126,
+         206,
+         334,
+         542,
+         876,
+         1418,
+         2296
+       ],
+       "window": "hann_window"
+     },
+     "use_weight_norm": true
+   },
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.52.4"
+ }
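Since `AudioVAE` is a custom architecture rather than a class registered in `transformers`, a minimal sketch of fetching and inspecting this config reads it as plain JSON via `huggingface_hub` instead of `transformers.AutoConfig`. The repo id below is a placeholder, not taken from this commit:

```python
import json

from huggingface_hub import hf_hub_download

# Placeholder repo id -- substitute the actual repository path on the Hub.
REPO_ID = "yongjielv/<repo-name>"

# Download config.json from the Hub (cached locally after the first call).
config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")

with open(config_path) as f:
    config = json.load(f)

# Inspect the raw dict directly; values below come from the diff above.
print(config["architectures"])             # ['AudioVAE']
print(config["enc_kwargs"]["hop_size"])    # 320 samples per latent frame
print(config["dec_kwargs"]["latent_dim"])  # 64-dimensional latent space
```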
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c36d876de086d13eb1cdcfb9d08e22c3d806cd7893d64fdaf7ea6d30b7d521cd
+ size 2700431196
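Note that the diff above adds a Git LFS pointer, not the ~2.7 GB weights themselves. As a sketch, assuming the real `model.safetensors` has been fetched to the working directory, the download can be verified against the pointer's sha256 and size:

```python
import hashlib
import os

# Expected values copied from the LFS pointer above.
EXPECTED_SHA256 = "c36d876de086d13eb1cdcfb9d08e22c3d806cd7893d64fdaf7ea6d30b7d521cd"
EXPECTED_SIZE = 2700431196  # bytes

path = "model.safetensors"  # assumes the resolved file, not the pointer

# Check the cheap property first, then hash the file in 1 MiB chunks.
assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert h.hexdigest() == EXPECTED_SHA256, "sha256 mismatch"
print("model.safetensors matches the LFS pointer")
```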