ESPnet
multilingual
audio
universa
ftshijt commited on
Commit
394e160
·
1 Parent(s): 8de9087

Update model

Browse files
README.md ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - universa
6
+ language: multilingual
7
+ datasets:
8
+ - universa_unite
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 universa model
13
+
14
+ ### `espnet/arecho_scale_v0`
15
+
16
+ This model was trained by ftshijt using the universa_unite recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 69996dc206e556ec48db77b6cc385ff1d32895b3
26
+ pip install -e .
27
+ cd egs2/universa_unite/uni_versa1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/arecho_scale_v0
29
+ ```
30
+
31
+
32
+
33
+ ## universa config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/train_aruniversa_wavlm.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: sequence
44
+ valid_iterator_type: null
45
+ output_dir: exp/universa_universa_ar_overall_scale_token_wavlm
46
+ ngpu: 1
47
+ seed: 777
48
+ num_workers: 1
49
+ num_att_plot: 0
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: null
53
+ dist_rank: null
54
+ local_rank: 0
55
+ dist_master_addr: null
56
+ dist_master_port: null
57
+ dist_launcher: null
58
+ multiprocessing_distributed: false
59
+ unused_parameters: false
60
+ sharded_ddp: false
61
+ use_deepspeed: false
62
+ deepspeed_config: null
63
+ gradient_as_bucket_view: true
64
+ ddp_comm_hook: null
65
+ cudnn_enabled: true
66
+ cudnn_benchmark: false
67
+ cudnn_deterministic: false
68
+ use_tf32: false
69
+ collect_stats: false
70
+ write_collected_feats: false
71
+ max_epoch: 100
72
+ patience: null
73
+ val_scheduler_criterion:
74
+ - valid
75
+ - loss
76
+ early_stopping_criterion:
77
+ - valid
78
+ - loss
79
+ - min
80
+ best_model_criterion:
81
+ - - train
82
+ - loss
83
+ - min
84
+ - - valid
85
+ - loss
86
+ - min
87
+ - - train
88
+ - acc
89
+ - max
90
+ - - valid
91
+ - acc
92
+ - max
93
+ keep_nbest_models: 1
94
+ nbest_averaging_interval: 0
95
+ grad_clip: -1
96
+ grad_clip_type: 2.0
97
+ grad_noise: false
98
+ accum_grad: 2
99
+ no_forward_run: false
100
+ resume: true
101
+ train_dtype: float32
102
+ use_amp: false
103
+ log_interval: 50
104
+ use_matplotlib: true
105
+ use_tensorboard: true
106
+ create_graph_in_tensorboard: false
107
+ use_wandb: false
108
+ wandb_project: null
109
+ wandb_id: null
110
+ wandb_entity: null
111
+ wandb_name: null
112
+ wandb_model_log_interval: -1
113
+ detect_anomaly: false
114
+ use_adapter: false
115
+ adapter: lora
116
+ save_strategy: all
117
+ adapter_conf: {}
118
+ pretrain_path: null
119
+ init_param: []
120
+ ignore_init_mismatch: false
121
+ freeze_param:
122
+ - frontend.upstream
123
+ num_iters_per_epoch: null
124
+ batch_size: 16
125
+ valid_batch_size: null
126
+ batch_bins: 1000000
127
+ valid_batch_bins: null
128
+ category_sample_size: 10
129
+ train_shape_file:
130
+ - exp/universa_stats_overall_scale/train/audio_shape
131
+ - exp/universa_stats_overall_scale/train/ref_audio_shape
132
+ valid_shape_file:
133
+ - exp/universa_stats_overall_scale/valid/audio_shape
134
+ - exp/universa_stats_overall_scale/valid/ref_audio_shape
135
+ batch_type: sorted
136
+ valid_batch_type: null
137
+ fold_length:
138
+ - 256000
139
+ sort_in_batch: descending
140
+ shuffle_within_batch: false
141
+ sort_batch: descending
142
+ multiple_iterator: false
143
+ chunk_length: 500
144
+ chunk_shift_ratio: 0.5
145
+ num_cache_chunks: 1024
146
+ chunk_excluded_key_prefixes: []
147
+ chunk_default_fs: null
148
+ chunk_max_abs_length: null
149
+ chunk_discard_short_samples: true
150
+ train_data_path_and_name_and_type:
151
+ - - dump/raw/overall_scale/wav.scp
152
+ - audio
153
+ - kaldi_ark
154
+ - - dump/raw/overall_scale/metric.scp
155
+ - metrics
156
+ - metric
157
+ - - dump/raw/overall_scale/ref_wav.scp
158
+ - ref_audio
159
+ - kaldi_ark
160
+ valid_data_path_and_name_and_type:
161
+ - - dump/raw/overall_dev/wav.scp
162
+ - audio
163
+ - kaldi_ark
164
+ - - dump/raw/overall_dev/metric.scp
165
+ - metrics
166
+ - metric
167
+ - - dump/raw/overall_dev/ref_wav.scp
168
+ - ref_audio
169
+ - kaldi_ark
170
+ multi_task_dataset: false
171
+ allow_variable_data_keys: false
172
+ max_cache_size: 0.0
173
+ max_cache_fd: 32
174
+ allow_multi_rates: false
175
+ valid_max_cache_size: null
176
+ exclude_weight_decay: false
177
+ exclude_weight_decay_conf: {}
178
+ optim: adamw
179
+ optim_conf:
180
+ lr: 0.001
181
+ scheduler: warmuplr
182
+ scheduler_conf:
183
+ warmup_steps: 25000
184
+ metric2id: dump/raw/overall_scale/metric2id
185
+ metric2type: dump/raw/overall_scale/metric2type
186
+ metric_pad_value: -100
187
+ token_list: null
188
+ metric_token_info: data/token_list/metric_500_percentile_overall_scale_w-numerical/tokens.json
189
+ metric_token_pad_value: 0
190
+ tokenize_numerical_metric: true
191
+ init: null
192
+ model_conf: {}
193
+ use_ref_audio: true
194
+ use_ref_text: false
195
+ use_preprocessor: true
196
+ token_type: bpe
197
+ bpemodel: null
198
+ non_linguistic_symbols: null
199
+ cleaner: null
200
+ g2p: null
201
+ sequential_metric: true
202
+ randomize_sequential_metric: true
203
+ frontend: s3prl
204
+ frontend_conf:
205
+ frontend_conf:
206
+ upstream: wavlm_large
207
+ download_dir: ./hub
208
+ multilayer_feature: true
209
+ universa: ar_universa
210
+ universa_conf:
211
+ embedding_dim: 256
212
+ audio_encoder_type: transformer
213
+ audio_encoder_params:
214
+ num_blocks: 4
215
+ attention_heads: 4
216
+ linear_units: 1024
217
+ dropout_rate: 0.1
218
+ positional_dropout_rate: 0.1
219
+ attention_dropout_rate: 0.1
220
+ input_layer: conv2d
221
+ normalize_before: true
222
+ concat_after: false
223
+ positionwise_layer_type: linear
224
+ positionwise_conv_kernel_size: 1
225
+ layer_drop_rate: 0.1
226
+ qk_norm: false
227
+ use_flash_attn: false
228
+ cross_attention_type: multihead
229
+ cross_attention_params:
230
+ n_head: 2
231
+ dropout_rate: 0.1
232
+ metric_decoder_params:
233
+ num_blocks: 4
234
+ attention_heads: 4
235
+ linear_units: 1024
236
+ dropout_rate: 0.1
237
+ positional_dropout_rate: 0.1
238
+ src_attention_dropout_rate: 0.1
239
+ self_attention_dropout_rate: 0.1
240
+ input_layer: embed
241
+ normalize_before: true
242
+ concat_after: false
243
+ layer_drop_rate: 0.1
244
+ qk_norm: false
245
+ use_flash_attn: false
246
+ use_rope: true
247
+ lsm_weight: 0.1
248
+ sym_sos: <sos>
249
+ sym_eos: <eos>
250
+ required:
251
+ - output_dir
252
+ - metric2id
253
+ version: '202503'
254
+ distributed: false
255
+ ```
256
+
257
+ </details>
258
+
259
+
260
+
261
+ ### Citing ESPnet
262
+
263
+ ```bibtex
264
+ @inproceedings{watanabe2018espnet,
265
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
266
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
267
+ year={2018},
268
+ booktitle={Proceedings of Interspeech},
269
+ pages={2207--2211},
270
+ doi={10.21437/Interspeech.2018-1456},
271
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
272
+ }
273
+
274
+
275
+
276
+
277
+
278
+
279
+ ```
280
+
281
+ or arXiv:
282
+
283
+ ```bibtex
284
+ @misc{watanabe2018espnet,
285
+ title={ESPnet: End-to-End Speech Processing Toolkit},
286
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
287
+ year={2018},
288
+ eprint={1804.00015},
289
+ archivePrefix={arXiv},
290
+ primaryClass={cs.CL}
291
+ }
292
+ ```
dump/raw/overall_scale/metric2id ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ asr_match_error_rate
2
+ asvspoof_score
3
+ audiobox_aesthetics_CE
4
+ audiobox_aesthetics_CU
5
+ audiobox_aesthetics_PC
6
+ audiobox_aesthetics_PQ
7
+ cer
8
+ ci_sdr
9
+ dns_overall
10
+ dns_p808
11
+ emotion_similarity
12
+ f0corr
13
+ f0rmse
14
+ language
15
+ mcd
16
+ nisqa_col_pred
17
+ nisqa_dis_pred
18
+ nisqa_loud_pred
19
+ nisqa_mos_pred
20
+ nisqa_noi_pred
21
+ nisqa_real_mos
22
+ nomad
23
+ noresqa_score
24
+ pam_score
25
+ pesq
26
+ plcmos
27
+ pred_text_length
28
+ pysepm_c_bak
29
+ pysepm_cd
30
+ pysepm_c_ovl
31
+ pysepm_c_sig
32
+ pysepm_csii_high
33
+ pysepm_csii_low
34
+ pysepm_csii_mid
35
+ pysepm_fwsegsnr
36
+ pysepm_llr
37
+ pysepm_ncm
38
+ pysepm_wss
39
+ qwen_channel_type
40
+ qwen_language
41
+ qwen_laughter_crying
42
+ qwen_pitch_range
43
+ qwen_recording_quality
44
+ qwen_speaker_age
45
+ qwen_speaker_count
46
+ qwen_speaker_gender
47
+ qwen_speaking_style
48
+ qwen_speech_background_environment
49
+ qwen_speech_clarity
50
+ qwen_speech_emotion
51
+ qwen_speech_impairment
52
+ qwen_speech_purpose
53
+ qwen_speech_rate
54
+ qwen_speech_register
55
+ qwen_speech_volume_level
56
+ qwen_vocabulary_complexity
57
+ qwen_voice_pitch
58
+ qwen_voice_type
59
+ real_language
60
+ ref_text_length
61
+ rir_room_size
62
+ rt60
63
+ sar
64
+ scoreq_nr
65
+ scoreq_ref
66
+ sdr
67
+ se_ci_sdr
68
+ se_sar
69
+ se_sdr
70
+ se_si_snr
71
+ sheet_ssqa
72
+ singmos
73
+ si_snr
74
+ snr_simulation
75
+ speaking_rate
76
+ speech_bert
77
+ speech_bleu
78
+ speech_token_distance
79
+ spk_similarity
80
+ srmr
81
+ stoi
82
+ urgent_mos
83
+ utmos
84
+ utmosv2
85
+ visqol
86
+ voicemos_real_mos
87
+ wer
dump/raw/overall_scale/metric2type ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ asr_match_error_rate numerical
2
+ asvspoof_score numerical
3
+ audiobox_aesthetics_CE numerical
4
+ audiobox_aesthetics_CU numerical
5
+ audiobox_aesthetics_PC numerical
6
+ audiobox_aesthetics_PQ numerical
7
+ cer numerical
8
+ ci_sdr numerical
9
+ dns_overall numerical
10
+ dns_p808 numerical
11
+ emotion_similarity numerical
12
+ f0corr numerical
13
+ f0rmse numerical
14
+ language categorical
15
+ mcd numerical
16
+ nisqa_col_pred numerical
17
+ nisqa_dis_pred numerical
18
+ nisqa_loud_pred numerical
19
+ nisqa_mos_pred numerical
20
+ nisqa_noi_pred numerical
21
+ nisqa_real_mos numerical
22
+ nomad numerical
23
+ noresqa_score numerical
24
+ pam_score numerical
25
+ pesq numerical
26
+ plcmos numerical
27
+ pred_text_length numerical
28
+ pysepm_c_bak numerical
29
+ pysepm_cd numerical
30
+ pysepm_c_ovl numerical
31
+ pysepm_c_sig numerical
32
+ pysepm_csii_high numerical
33
+ pysepm_csii_low numerical
34
+ pysepm_csii_mid numerical
35
+ pysepm_fwsegsnr numerical
36
+ pysepm_llr numerical
37
+ pysepm_ncm numerical
38
+ pysepm_wss numerical
39
+ qwen_channel_type categorical
40
+ qwen_language categorical
41
+ qwen_laughter_crying categorical
42
+ qwen_pitch_range categorical
43
+ qwen_recording_quality categorical
44
+ qwen_speaker_age categorical
45
+ qwen_speaker_count numerical
46
+ qwen_speaker_gender categorical
47
+ qwen_speaking_style categorical
48
+ qwen_speech_background_environment categorical
49
+ qwen_speech_clarity categorical
50
+ qwen_speech_emotion categorical
51
+ qwen_speech_impairment categorical
52
+ qwen_speech_purpose categorical
53
+ qwen_speech_rate categorical
54
+ qwen_speech_register categorical
55
+ qwen_speech_volume_level categorical
56
+ qwen_vocabulary_complexity categorical
57
+ qwen_voice_pitch categorical
58
+ qwen_voice_type categorical
59
+ real_language categorical
60
+ ref_text_length numerical
61
+ rir_room_size categorical
62
+ rt60 numerical
63
+ sar numerical
64
+ scoreq_nr numerical
65
+ scoreq_ref numerical
66
+ sdr numerical
67
+ se_ci_sdr numerical
68
+ se_sar numerical
69
+ se_sdr numerical
70
+ se_si_snr numerical
71
+ sheet_ssqa numerical
72
+ singmos numerical
73
+ si_snr numerical
74
+ snr_simulation numerical
75
+ speaking_rate numerical
76
+ speech_bert numerical
77
+ speech_bleu numerical
78
+ speech_token_distance numerical
79
+ spk_similarity numerical
80
+ srmr numerical
81
+ stoi numerical
82
+ urgent_mos numerical
83
+ utmos numerical
84
+ utmosv2 numerical
85
+ visqol numerical
86
+ voicemos_real_mos numerical
87
+ wer numerical
exp/universa_universa_ar_overall_scale_token_wavlm/6epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f44093ca75bac69aa1fe7eea7c50c664ffb6e3d31d157c2b4b552d7e681acb57
3
+ size 2325327716
exp/universa_universa_ar_overall_scale_token_wavlm/config.yaml ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_aruniversa_wavlm.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/universa_universa_ar_overall_scale_token_wavlm
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 1
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ gradient_as_bucket_view: true
27
+ ddp_comm_hook: null
28
+ cudnn_enabled: true
29
+ cudnn_benchmark: false
30
+ cudnn_deterministic: false
31
+ use_tf32: false
32
+ collect_stats: false
33
+ write_collected_feats: false
34
+ max_epoch: 100
35
+ patience: null
36
+ val_scheduler_criterion:
37
+ - valid
38
+ - loss
39
+ early_stopping_criterion:
40
+ - valid
41
+ - loss
42
+ - min
43
+ best_model_criterion:
44
+ - - train
45
+ - loss
46
+ - min
47
+ - - valid
48
+ - loss
49
+ - min
50
+ - - train
51
+ - acc
52
+ - max
53
+ - - valid
54
+ - acc
55
+ - max
56
+ keep_nbest_models: 1
57
+ nbest_averaging_interval: 0
58
+ grad_clip: -1
59
+ grad_clip_type: 2.0
60
+ grad_noise: false
61
+ accum_grad: 2
62
+ no_forward_run: false
63
+ resume: true
64
+ train_dtype: float32
65
+ use_amp: false
66
+ log_interval: 50
67
+ use_matplotlib: true
68
+ use_tensorboard: true
69
+ create_graph_in_tensorboard: false
70
+ use_wandb: false
71
+ wandb_project: null
72
+ wandb_id: null
73
+ wandb_entity: null
74
+ wandb_name: null
75
+ wandb_model_log_interval: -1
76
+ detect_anomaly: false
77
+ use_adapter: false
78
+ adapter: lora
79
+ save_strategy: all
80
+ adapter_conf: {}
81
+ pretrain_path: null
82
+ init_param: []
83
+ ignore_init_mismatch: false
84
+ freeze_param:
85
+ - frontend.upstream
86
+ num_iters_per_epoch: null
87
+ batch_size: 16
88
+ valid_batch_size: null
89
+ batch_bins: 1000000
90
+ valid_batch_bins: null
91
+ category_sample_size: 10
92
+ train_shape_file:
93
+ - exp/universa_stats_overall_scale/train/audio_shape
94
+ - exp/universa_stats_overall_scale/train/ref_audio_shape
95
+ valid_shape_file:
96
+ - exp/universa_stats_overall_scale/valid/audio_shape
97
+ - exp/universa_stats_overall_scale/valid/ref_audio_shape
98
+ batch_type: sorted
99
+ valid_batch_type: null
100
+ fold_length:
101
+ - 256000
102
+ sort_in_batch: descending
103
+ shuffle_within_batch: false
104
+ sort_batch: descending
105
+ multiple_iterator: false
106
+ chunk_length: 500
107
+ chunk_shift_ratio: 0.5
108
+ num_cache_chunks: 1024
109
+ chunk_excluded_key_prefixes: []
110
+ chunk_default_fs: null
111
+ chunk_max_abs_length: null
112
+ chunk_discard_short_samples: true
113
+ train_data_path_and_name_and_type:
114
+ - - dump/raw/overall_scale/wav.scp
115
+ - audio
116
+ - kaldi_ark
117
+ - - dump/raw/overall_scale/metric.scp
118
+ - metrics
119
+ - metric
120
+ - - dump/raw/overall_scale/ref_wav.scp
121
+ - ref_audio
122
+ - kaldi_ark
123
+ valid_data_path_and_name_and_type:
124
+ - - dump/raw/overall_dev/wav.scp
125
+ - audio
126
+ - kaldi_ark
127
+ - - dump/raw/overall_dev/metric.scp
128
+ - metrics
129
+ - metric
130
+ - - dump/raw/overall_dev/ref_wav.scp
131
+ - ref_audio
132
+ - kaldi_ark
133
+ multi_task_dataset: false
134
+ allow_variable_data_keys: false
135
+ max_cache_size: 0.0
136
+ max_cache_fd: 32
137
+ allow_multi_rates: false
138
+ valid_max_cache_size: null
139
+ exclude_weight_decay: false
140
+ exclude_weight_decay_conf: {}
141
+ optim: adamw
142
+ optim_conf:
143
+ lr: 0.001
144
+ scheduler: warmuplr
145
+ scheduler_conf:
146
+ warmup_steps: 25000
147
+ metric2id: dump/raw/overall_scale/metric2id
148
+ metric2type: dump/raw/overall_scale/metric2type
149
+ metric_pad_value: -100
150
+ token_list: null
151
+ metric_token_info: data/token_list/metric_500_percentile_overall_scale_w-numerical/tokens.json
152
+ metric_token_pad_value: 0
153
+ tokenize_numerical_metric: true
154
+ init: null
155
+ model_conf: {}
156
+ use_ref_audio: true
157
+ use_ref_text: false
158
+ use_preprocessor: true
159
+ token_type: bpe
160
+ bpemodel: null
161
+ non_linguistic_symbols: null
162
+ cleaner: null
163
+ g2p: null
164
+ sequential_metric: true
165
+ randomize_sequential_metric: true
166
+ frontend: s3prl
167
+ frontend_conf:
168
+ frontend_conf:
169
+ upstream: wavlm_large
170
+ download_dir: ./hub
171
+ multilayer_feature: true
172
+ universa: ar_universa
173
+ universa_conf:
174
+ embedding_dim: 256
175
+ audio_encoder_type: transformer
176
+ audio_encoder_params:
177
+ num_blocks: 4
178
+ attention_heads: 4
179
+ linear_units: 1024
180
+ dropout_rate: 0.1
181
+ positional_dropout_rate: 0.1
182
+ attention_dropout_rate: 0.1
183
+ input_layer: conv2d
184
+ normalize_before: true
185
+ concat_after: false
186
+ positionwise_layer_type: linear
187
+ positionwise_conv_kernel_size: 1
188
+ layer_drop_rate: 0.1
189
+ qk_norm: false
190
+ use_flash_attn: false
191
+ cross_attention_type: multihead
192
+ cross_attention_params:
193
+ n_head: 2
194
+ dropout_rate: 0.1
195
+ metric_decoder_params:
196
+ num_blocks: 4
197
+ attention_heads: 4
198
+ linear_units: 1024
199
+ dropout_rate: 0.1
200
+ positional_dropout_rate: 0.1
201
+ src_attention_dropout_rate: 0.1
202
+ self_attention_dropout_rate: 0.1
203
+ input_layer: embed
204
+ normalize_before: true
205
+ concat_after: false
206
+ layer_drop_rate: 0.1
207
+ qk_norm: false
208
+ use_flash_attn: false
209
+ use_rope: true
210
+ lsm_weight: 0.1
211
+ sym_sos: <sos>
212
+ sym_eos: <eos>
213
+ required:
214
+ - output_dir
215
+ - metric2id
216
+ version: '202503'
217
+ distributed: false
exp/universa_universa_ar_overall_scale_token_wavlm/images/acc_ar_decoder.png ADDED
exp/universa_universa_ar_overall_scale_token_wavlm/images/backward_time.png ADDED
exp/universa_universa_ar_overall_scale_token_wavlm/images/clip.png ADDED
exp/universa_universa_ar_overall_scale_token_wavlm/images/forward_time.png ADDED
exp/universa_universa_ar_overall_scale_token_wavlm/images/gpu_max_cached_mem_GB.png ADDED
exp/universa_universa_ar_overall_scale_token_wavlm/images/grad_norm.png ADDED
exp/universa_universa_ar_overall_scale_token_wavlm/images/iter_time.png ADDED
exp/universa_universa_ar_overall_scale_token_wavlm/images/loss.png ADDED
exp/universa_universa_ar_overall_scale_token_wavlm/images/loss_ar_decoder.png ADDED
exp/universa_universa_ar_overall_scale_token_wavlm/images/loss_scale.png ADDED
exp/universa_universa_ar_overall_scale_token_wavlm/images/optim0_lr0.png ADDED
exp/universa_universa_ar_overall_scale_token_wavlm/images/optim_step_time.png ADDED
exp/universa_universa_ar_overall_scale_token_wavlm/images/train_time.png ADDED
exp/universa_universa_ar_overall_scale_token_wavlm/images/value_ar_decoder.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202503'
2
+ files:
3
+ model_file: exp/universa_universa_ar_overall_scale_token_wavlm/6epoch.pth
4
+ python: "3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 17:20:25) \n[GCC 12.3.0]"
5
+ timestamp: 1749800845.586997
6
+ torch: 2.6.0.dev20241210+cu124
7
+ yaml_files:
8
+ train_config: exp/universa_universa_ar_overall_scale_token_wavlm/config.yaml