Update model
Browse files- README.md +292 -0
- dump/raw/overall_scale/metric2id +87 -0
- dump/raw/overall_scale/metric2type +87 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/6epoch.pth +3 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/config.yaml +217 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/acc_ar_decoder.png +0 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/backward_time.png +0 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/clip.png +0 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/forward_time.png +0 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/gpu_max_cached_mem_GB.png +0 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/grad_norm.png +0 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/iter_time.png +0 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/loss.png +0 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/loss_ar_decoder.png +0 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/loss_scale.png +0 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/optim0_lr0.png +0 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/optim_step_time.png +0 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/train_time.png +0 -0
- exp/universa_universa_ar_overall_scale_token_wavlm/images/value_ar_decoder.png +0 -0
- meta.yaml +8 -0
README.md
ADDED
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
tags:
|
3 |
+
- espnet
|
4 |
+
- audio
|
5 |
+
- universa
|
6 |
+
language: multilingual
|
7 |
+
datasets:
|
8 |
+
- universa_unite
|
9 |
+
license: cc-by-4.0
|
10 |
+
---
|
11 |
+
|
12 |
+
## ESPnet2 universa model
|
13 |
+
|
14 |
+
### `espnet/arecho_scale_v0`
|
15 |
+
|
16 |
+
This model was trained by ftshijt using the universa_unite recipe in [espnet](https://github.com/espnet/espnet/).
|
17 |
+
|
18 |
+
### Demo: How to use in ESPnet2
|
19 |
+
|
20 |
+
Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
|
21 |
+
if you haven't done that already.
|
22 |
+
|
23 |
+
```bash
|
24 |
+
cd espnet
|
25 |
+
git checkout 69996dc206e556ec48db77b6cc385ff1d32895b3
|
26 |
+
pip install -e .
|
27 |
+
cd egs2/universa_unite/uni_versa1
|
28 |
+
./run.sh --skip_data_prep false --skip_train true --download_model espnet/arecho_scale_v0
|
29 |
+
```
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
## universa config
|
34 |
+
|
35 |
+
<details><summary>expand</summary>
|
36 |
+
|
37 |
+
```
|
38 |
+
config: conf/train_aruniversa_wavlm.yaml
|
39 |
+
print_config: false
|
40 |
+
log_level: INFO
|
41 |
+
drop_last_iter: false
|
42 |
+
dry_run: false
|
43 |
+
iterator_type: sequence
|
44 |
+
valid_iterator_type: null
|
45 |
+
output_dir: exp/universa_universa_ar_overall_scale_token_wavlm
|
46 |
+
ngpu: 1
|
47 |
+
seed: 777
|
48 |
+
num_workers: 1
|
49 |
+
num_att_plot: 0
|
50 |
+
dist_backend: nccl
|
51 |
+
dist_init_method: env://
|
52 |
+
dist_world_size: null
|
53 |
+
dist_rank: null
|
54 |
+
local_rank: 0
|
55 |
+
dist_master_addr: null
|
56 |
+
dist_master_port: null
|
57 |
+
dist_launcher: null
|
58 |
+
multiprocessing_distributed: false
|
59 |
+
unused_parameters: false
|
60 |
+
sharded_ddp: false
|
61 |
+
use_deepspeed: false
|
62 |
+
deepspeed_config: null
|
63 |
+
gradient_as_bucket_view: true
|
64 |
+
ddp_comm_hook: null
|
65 |
+
cudnn_enabled: true
|
66 |
+
cudnn_benchmark: false
|
67 |
+
cudnn_deterministic: false
|
68 |
+
use_tf32: false
|
69 |
+
collect_stats: false
|
70 |
+
write_collected_feats: false
|
71 |
+
max_epoch: 100
|
72 |
+
patience: null
|
73 |
+
val_scheduler_criterion:
|
74 |
+
- valid
|
75 |
+
- loss
|
76 |
+
early_stopping_criterion:
|
77 |
+
- valid
|
78 |
+
- loss
|
79 |
+
- min
|
80 |
+
best_model_criterion:
|
81 |
+
- - train
|
82 |
+
- loss
|
83 |
+
- min
|
84 |
+
- - valid
|
85 |
+
- loss
|
86 |
+
- min
|
87 |
+
- - train
|
88 |
+
- acc
|
89 |
+
- max
|
90 |
+
- - valid
|
91 |
+
- acc
|
92 |
+
- max
|
93 |
+
keep_nbest_models: 1
|
94 |
+
nbest_averaging_interval: 0
|
95 |
+
grad_clip: -1
|
96 |
+
grad_clip_type: 2.0
|
97 |
+
grad_noise: false
|
98 |
+
accum_grad: 2
|
99 |
+
no_forward_run: false
|
100 |
+
resume: true
|
101 |
+
train_dtype: float32
|
102 |
+
use_amp: false
|
103 |
+
log_interval: 50
|
104 |
+
use_matplotlib: true
|
105 |
+
use_tensorboard: true
|
106 |
+
create_graph_in_tensorboard: false
|
107 |
+
use_wandb: false
|
108 |
+
wandb_project: null
|
109 |
+
wandb_id: null
|
110 |
+
wandb_entity: null
|
111 |
+
wandb_name: null
|
112 |
+
wandb_model_log_interval: -1
|
113 |
+
detect_anomaly: false
|
114 |
+
use_adapter: false
|
115 |
+
adapter: lora
|
116 |
+
save_strategy: all
|
117 |
+
adapter_conf: {}
|
118 |
+
pretrain_path: null
|
119 |
+
init_param: []
|
120 |
+
ignore_init_mismatch: false
|
121 |
+
freeze_param:
|
122 |
+
- frontend.upstream
|
123 |
+
num_iters_per_epoch: null
|
124 |
+
batch_size: 16
|
125 |
+
valid_batch_size: null
|
126 |
+
batch_bins: 1000000
|
127 |
+
valid_batch_bins: null
|
128 |
+
category_sample_size: 10
|
129 |
+
train_shape_file:
|
130 |
+
- exp/universa_stats_overall_scale/train/audio_shape
|
131 |
+
- exp/universa_stats_overall_scale/train/ref_audio_shape
|
132 |
+
valid_shape_file:
|
133 |
+
- exp/universa_stats_overall_scale/valid/audio_shape
|
134 |
+
- exp/universa_stats_overall_scale/valid/ref_audio_shape
|
135 |
+
batch_type: sorted
|
136 |
+
valid_batch_type: null
|
137 |
+
fold_length:
|
138 |
+
- 256000
|
139 |
+
sort_in_batch: descending
|
140 |
+
shuffle_within_batch: false
|
141 |
+
sort_batch: descending
|
142 |
+
multiple_iterator: false
|
143 |
+
chunk_length: 500
|
144 |
+
chunk_shift_ratio: 0.5
|
145 |
+
num_cache_chunks: 1024
|
146 |
+
chunk_excluded_key_prefixes: []
|
147 |
+
chunk_default_fs: null
|
148 |
+
chunk_max_abs_length: null
|
149 |
+
chunk_discard_short_samples: true
|
150 |
+
train_data_path_and_name_and_type:
|
151 |
+
- - dump/raw/overall_scale/wav.scp
|
152 |
+
- audio
|
153 |
+
- kaldi_ark
|
154 |
+
- - dump/raw/overall_scale/metric.scp
|
155 |
+
- metrics
|
156 |
+
- metric
|
157 |
+
- - dump/raw/overall_scale/ref_wav.scp
|
158 |
+
- ref_audio
|
159 |
+
- kaldi_ark
|
160 |
+
valid_data_path_and_name_and_type:
|
161 |
+
- - dump/raw/overall_dev/wav.scp
|
162 |
+
- audio
|
163 |
+
- kaldi_ark
|
164 |
+
- - dump/raw/overall_dev/metric.scp
|
165 |
+
- metrics
|
166 |
+
- metric
|
167 |
+
- - dump/raw/overall_dev/ref_wav.scp
|
168 |
+
- ref_audio
|
169 |
+
- kaldi_ark
|
170 |
+
multi_task_dataset: false
|
171 |
+
allow_variable_data_keys: false
|
172 |
+
max_cache_size: 0.0
|
173 |
+
max_cache_fd: 32
|
174 |
+
allow_multi_rates: false
|
175 |
+
valid_max_cache_size: null
|
176 |
+
exclude_weight_decay: false
|
177 |
+
exclude_weight_decay_conf: {}
|
178 |
+
optim: adamw
|
179 |
+
optim_conf:
|
180 |
+
lr: 0.001
|
181 |
+
scheduler: warmuplr
|
182 |
+
scheduler_conf:
|
183 |
+
warmup_steps: 25000
|
184 |
+
metric2id: dump/raw/overall_scale/metric2id
|
185 |
+
metric2type: dump/raw/overall_scale/metric2type
|
186 |
+
metric_pad_value: -100
|
187 |
+
token_list: null
|
188 |
+
metric_token_info: data/token_list/metric_500_percentile_overall_scale_w-numerical/tokens.json
|
189 |
+
metric_token_pad_value: 0
|
190 |
+
tokenize_numerical_metric: true
|
191 |
+
init: null
|
192 |
+
model_conf: {}
|
193 |
+
use_ref_audio: true
|
194 |
+
use_ref_text: false
|
195 |
+
use_preprocessor: true
|
196 |
+
token_type: bpe
|
197 |
+
bpemodel: null
|
198 |
+
non_linguistic_symbols: null
|
199 |
+
cleaner: null
|
200 |
+
g2p: null
|
201 |
+
sequential_metric: true
|
202 |
+
randomize_sequential_metric: true
|
203 |
+
frontend: s3prl
|
204 |
+
frontend_conf:
|
205 |
+
frontend_conf:
|
206 |
+
upstream: wavlm_large
|
207 |
+
download_dir: ./hub
|
208 |
+
multilayer_feature: true
|
209 |
+
universa: ar_universa
|
210 |
+
universa_conf:
|
211 |
+
embedding_dim: 256
|
212 |
+
audio_encoder_type: transformer
|
213 |
+
audio_encoder_params:
|
214 |
+
num_blocks: 4
|
215 |
+
attention_heads: 4
|
216 |
+
linear_units: 1024
|
217 |
+
dropout_rate: 0.1
|
218 |
+
positional_dropout_rate: 0.1
|
219 |
+
attention_dropout_rate: 0.1
|
220 |
+
input_layer: conv2d
|
221 |
+
normalize_before: true
|
222 |
+
concat_after: false
|
223 |
+
positionwise_layer_type: linear
|
224 |
+
positionwise_conv_kernel_size: 1
|
225 |
+
layer_drop_rate: 0.1
|
226 |
+
qk_norm: false
|
227 |
+
use_flash_attn: false
|
228 |
+
cross_attention_type: multihead
|
229 |
+
cross_attention_params:
|
230 |
+
n_head: 2
|
231 |
+
dropout_rate: 0.1
|
232 |
+
metric_decoder_params:
|
233 |
+
num_blocks: 4
|
234 |
+
attention_heads: 4
|
235 |
+
linear_units: 1024
|
236 |
+
dropout_rate: 0.1
|
237 |
+
positional_dropout_rate: 0.1
|
238 |
+
src_attention_dropout_rate: 0.1
|
239 |
+
self_attention_dropout_rate: 0.1
|
240 |
+
input_layer: embed
|
241 |
+
normalize_before: true
|
242 |
+
concat_after: false
|
243 |
+
layer_drop_rate: 0.1
|
244 |
+
qk_norm: false
|
245 |
+
use_flash_attn: false
|
246 |
+
use_rope: true
|
247 |
+
lsm_weight: 0.1
|
248 |
+
sym_sos: <sos>
|
249 |
+
sym_eos: <eos>
|
250 |
+
required:
|
251 |
+
- output_dir
|
252 |
+
- metric2id
|
253 |
+
version: '202503'
|
254 |
+
distributed: false
|
255 |
+
```
|
256 |
+
|
257 |
+
</details>
|
258 |
+
|
259 |
+
|
260 |
+
|
261 |
+
### Citing ESPnet
|
262 |
+
|
263 |
+
```bibtex
|
264 |
+
@inproceedings{watanabe2018espnet,
|
265 |
+
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
266 |
+
title={{ESPnet}: End-to-End Speech Processing Toolkit},
|
267 |
+
year={2018},
|
268 |
+
booktitle={Proceedings of Interspeech},
|
269 |
+
pages={2207--2211},
|
270 |
+
doi={10.21437/Interspeech.2018-1456},
|
271 |
+
url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
|
272 |
+
}
|
273 |
+
|
274 |
+
|
275 |
+
|
276 |
+
|
277 |
+
|
278 |
+
|
279 |
+
```
|
280 |
+
|
281 |
+
or arXiv:
|
282 |
+
|
283 |
+
```bibtex
|
284 |
+
@misc{watanabe2018espnet,
|
285 |
+
title={ESPnet: End-to-End Speech Processing Toolkit},
|
286 |
+
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
287 |
+
year={2018},
|
288 |
+
eprint={1804.00015},
|
289 |
+
archivePrefix={arXiv},
|
290 |
+
primaryClass={cs.CL}
|
291 |
+
}
|
292 |
+
```
|
dump/raw/overall_scale/metric2id
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
asr_match_error_rate
|
2 |
+
asvspoof_score
|
3 |
+
audiobox_aesthetics_CE
|
4 |
+
audiobox_aesthetics_CU
|
5 |
+
audiobox_aesthetics_PC
|
6 |
+
audiobox_aesthetics_PQ
|
7 |
+
cer
|
8 |
+
ci_sdr
|
9 |
+
dns_overall
|
10 |
+
dns_p808
|
11 |
+
emotion_similarity
|
12 |
+
f0corr
|
13 |
+
f0rmse
|
14 |
+
language
|
15 |
+
mcd
|
16 |
+
nisqa_col_pred
|
17 |
+
nisqa_dis_pred
|
18 |
+
nisqa_loud_pred
|
19 |
+
nisqa_mos_pred
|
20 |
+
nisqa_noi_pred
|
21 |
+
nisqa_real_mos
|
22 |
+
nomad
|
23 |
+
noresqa_score
|
24 |
+
pam_score
|
25 |
+
pesq
|
26 |
+
plcmos
|
27 |
+
pred_text_length
|
28 |
+
pysepm_c_bak
|
29 |
+
pysepm_cd
|
30 |
+
pysepm_c_ovl
|
31 |
+
pysepm_c_sig
|
32 |
+
pysepm_csii_high
|
33 |
+
pysepm_csii_low
|
34 |
+
pysepm_csii_mid
|
35 |
+
pysepm_fwsegsnr
|
36 |
+
pysepm_llr
|
37 |
+
pysepm_ncm
|
38 |
+
pysepm_wss
|
39 |
+
qwen_channel_type
|
40 |
+
qwen_language
|
41 |
+
qwen_laughter_crying
|
42 |
+
qwen_pitch_range
|
43 |
+
qwen_recording_quality
|
44 |
+
qwen_speaker_age
|
45 |
+
qwen_speaker_count
|
46 |
+
qwen_speaker_gender
|
47 |
+
qwen_speaking_style
|
48 |
+
qwen_speech_background_environment
|
49 |
+
qwen_speech_clarity
|
50 |
+
qwen_speech_emotion
|
51 |
+
qwen_speech_impairment
|
52 |
+
qwen_speech_purpose
|
53 |
+
qwen_speech_rate
|
54 |
+
qwen_speech_register
|
55 |
+
qwen_speech_volume_level
|
56 |
+
qwen_vocabulary_complexity
|
57 |
+
qwen_voice_pitch
|
58 |
+
qwen_voice_type
|
59 |
+
real_language
|
60 |
+
ref_text_length
|
61 |
+
rir_room_size
|
62 |
+
rt60
|
63 |
+
sar
|
64 |
+
scoreq_nr
|
65 |
+
scoreq_ref
|
66 |
+
sdr
|
67 |
+
se_ci_sdr
|
68 |
+
se_sar
|
69 |
+
se_sdr
|
70 |
+
se_si_snr
|
71 |
+
sheet_ssqa
|
72 |
+
singmos
|
73 |
+
si_snr
|
74 |
+
snr_simulation
|
75 |
+
speaking_rate
|
76 |
+
speech_bert
|
77 |
+
speech_bleu
|
78 |
+
speech_token_distance
|
79 |
+
spk_similarity
|
80 |
+
srmr
|
81 |
+
stoi
|
82 |
+
urgent_mos
|
83 |
+
utmos
|
84 |
+
utmosv2
|
85 |
+
visqol
|
86 |
+
voicemos_real_mos
|
87 |
+
wer
|
dump/raw/overall_scale/metric2type
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
asr_match_error_rate numerical
|
2 |
+
asvspoof_score numerical
|
3 |
+
audiobox_aesthetics_CE numerical
|
4 |
+
audiobox_aesthetics_CU numerical
|
5 |
+
audiobox_aesthetics_PC numerical
|
6 |
+
audiobox_aesthetics_PQ numerical
|
7 |
+
cer numerical
|
8 |
+
ci_sdr numerical
|
9 |
+
dns_overall numerical
|
10 |
+
dns_p808 numerical
|
11 |
+
emotion_similarity numerical
|
12 |
+
f0corr numerical
|
13 |
+
f0rmse numerical
|
14 |
+
language categorical
|
15 |
+
mcd numerical
|
16 |
+
nisqa_col_pred numerical
|
17 |
+
nisqa_dis_pred numerical
|
18 |
+
nisqa_loud_pred numerical
|
19 |
+
nisqa_mos_pred numerical
|
20 |
+
nisqa_noi_pred numerical
|
21 |
+
nisqa_real_mos numerical
|
22 |
+
nomad numerical
|
23 |
+
noresqa_score numerical
|
24 |
+
pam_score numerical
|
25 |
+
pesq numerical
|
26 |
+
plcmos numerical
|
27 |
+
pred_text_length numerical
|
28 |
+
pysepm_c_bak numerical
|
29 |
+
pysepm_cd numerical
|
30 |
+
pysepm_c_ovl numerical
|
31 |
+
pysepm_c_sig numerical
|
32 |
+
pysepm_csii_high numerical
|
33 |
+
pysepm_csii_low numerical
|
34 |
+
pysepm_csii_mid numerical
|
35 |
+
pysepm_fwsegsnr numerical
|
36 |
+
pysepm_llr numerical
|
37 |
+
pysepm_ncm numerical
|
38 |
+
pysepm_wss numerical
|
39 |
+
qwen_channel_type categorical
|
40 |
+
qwen_language categorical
|
41 |
+
qwen_laughter_crying categorical
|
42 |
+
qwen_pitch_range categorical
|
43 |
+
qwen_recording_quality categorical
|
44 |
+
qwen_speaker_age categorical
|
45 |
+
qwen_speaker_count numerical
|
46 |
+
qwen_speaker_gender categorical
|
47 |
+
qwen_speaking_style categorical
|
48 |
+
qwen_speech_background_environment categorical
|
49 |
+
qwen_speech_clarity categorical
|
50 |
+
qwen_speech_emotion categorical
|
51 |
+
qwen_speech_impairment categorical
|
52 |
+
qwen_speech_purpose categorical
|
53 |
+
qwen_speech_rate categorical
|
54 |
+
qwen_speech_register categorical
|
55 |
+
qwen_speech_volume_level categorical
|
56 |
+
qwen_vocabulary_complexity categorical
|
57 |
+
qwen_voice_pitch categorical
|
58 |
+
qwen_voice_type categorical
|
59 |
+
real_language categorical
|
60 |
+
ref_text_length numerical
|
61 |
+
rir_room_size categorical
|
62 |
+
rt60 numerical
|
63 |
+
sar numerical
|
64 |
+
scoreq_nr numerical
|
65 |
+
scoreq_ref numerical
|
66 |
+
sdr numerical
|
67 |
+
se_ci_sdr numerical
|
68 |
+
se_sar numerical
|
69 |
+
se_sdr numerical
|
70 |
+
se_si_snr numerical
|
71 |
+
sheet_ssqa numerical
|
72 |
+
singmos numerical
|
73 |
+
si_snr numerical
|
74 |
+
snr_simulation numerical
|
75 |
+
speaking_rate numerical
|
76 |
+
speech_bert numerical
|
77 |
+
speech_bleu numerical
|
78 |
+
speech_token_distance numerical
|
79 |
+
spk_similarity numerical
|
80 |
+
srmr numerical
|
81 |
+
stoi numerical
|
82 |
+
urgent_mos numerical
|
83 |
+
utmos numerical
|
84 |
+
utmosv2 numerical
|
85 |
+
visqol numerical
|
86 |
+
voicemos_real_mos numerical
|
87 |
+
wer numerical
|
exp/universa_universa_ar_overall_scale_token_wavlm/6epoch.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f44093ca75bac69aa1fe7eea7c50c664ffb6e3d31d157c2b4b552d7e681acb57
|
3 |
+
size 2325327716
|
exp/universa_universa_ar_overall_scale_token_wavlm/config.yaml
ADDED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config: conf/train_aruniversa_wavlm.yaml
|
2 |
+
print_config: false
|
3 |
+
log_level: INFO
|
4 |
+
drop_last_iter: false
|
5 |
+
dry_run: false
|
6 |
+
iterator_type: sequence
|
7 |
+
valid_iterator_type: null
|
8 |
+
output_dir: exp/universa_universa_ar_overall_scale_token_wavlm
|
9 |
+
ngpu: 1
|
10 |
+
seed: 777
|
11 |
+
num_workers: 1
|
12 |
+
num_att_plot: 0
|
13 |
+
dist_backend: nccl
|
14 |
+
dist_init_method: env://
|
15 |
+
dist_world_size: null
|
16 |
+
dist_rank: null
|
17 |
+
local_rank: 0
|
18 |
+
dist_master_addr: null
|
19 |
+
dist_master_port: null
|
20 |
+
dist_launcher: null
|
21 |
+
multiprocessing_distributed: false
|
22 |
+
unused_parameters: false
|
23 |
+
sharded_ddp: false
|
24 |
+
use_deepspeed: false
|
25 |
+
deepspeed_config: null
|
26 |
+
gradient_as_bucket_view: true
|
27 |
+
ddp_comm_hook: null
|
28 |
+
cudnn_enabled: true
|
29 |
+
cudnn_benchmark: false
|
30 |
+
cudnn_deterministic: false
|
31 |
+
use_tf32: false
|
32 |
+
collect_stats: false
|
33 |
+
write_collected_feats: false
|
34 |
+
max_epoch: 100
|
35 |
+
patience: null
|
36 |
+
val_scheduler_criterion:
|
37 |
+
- valid
|
38 |
+
- loss
|
39 |
+
early_stopping_criterion:
|
40 |
+
- valid
|
41 |
+
- loss
|
42 |
+
- min
|
43 |
+
best_model_criterion:
|
44 |
+
- - train
|
45 |
+
- loss
|
46 |
+
- min
|
47 |
+
- - valid
|
48 |
+
- loss
|
49 |
+
- min
|
50 |
+
- - train
|
51 |
+
- acc
|
52 |
+
- max
|
53 |
+
- - valid
|
54 |
+
- acc
|
55 |
+
- max
|
56 |
+
keep_nbest_models: 1
|
57 |
+
nbest_averaging_interval: 0
|
58 |
+
grad_clip: -1
|
59 |
+
grad_clip_type: 2.0
|
60 |
+
grad_noise: false
|
61 |
+
accum_grad: 2
|
62 |
+
no_forward_run: false
|
63 |
+
resume: true
|
64 |
+
train_dtype: float32
|
65 |
+
use_amp: false
|
66 |
+
log_interval: 50
|
67 |
+
use_matplotlib: true
|
68 |
+
use_tensorboard: true
|
69 |
+
create_graph_in_tensorboard: false
|
70 |
+
use_wandb: false
|
71 |
+
wandb_project: null
|
72 |
+
wandb_id: null
|
73 |
+
wandb_entity: null
|
74 |
+
wandb_name: null
|
75 |
+
wandb_model_log_interval: -1
|
76 |
+
detect_anomaly: false
|
77 |
+
use_adapter: false
|
78 |
+
adapter: lora
|
79 |
+
save_strategy: all
|
80 |
+
adapter_conf: {}
|
81 |
+
pretrain_path: null
|
82 |
+
init_param: []
|
83 |
+
ignore_init_mismatch: false
|
84 |
+
freeze_param:
|
85 |
+
- frontend.upstream
|
86 |
+
num_iters_per_epoch: null
|
87 |
+
batch_size: 16
|
88 |
+
valid_batch_size: null
|
89 |
+
batch_bins: 1000000
|
90 |
+
valid_batch_bins: null
|
91 |
+
category_sample_size: 10
|
92 |
+
train_shape_file:
|
93 |
+
- exp/universa_stats_overall_scale/train/audio_shape
|
94 |
+
- exp/universa_stats_overall_scale/train/ref_audio_shape
|
95 |
+
valid_shape_file:
|
96 |
+
- exp/universa_stats_overall_scale/valid/audio_shape
|
97 |
+
- exp/universa_stats_overall_scale/valid/ref_audio_shape
|
98 |
+
batch_type: sorted
|
99 |
+
valid_batch_type: null
|
100 |
+
fold_length:
|
101 |
+
- 256000
|
102 |
+
sort_in_batch: descending
|
103 |
+
shuffle_within_batch: false
|
104 |
+
sort_batch: descending
|
105 |
+
multiple_iterator: false
|
106 |
+
chunk_length: 500
|
107 |
+
chunk_shift_ratio: 0.5
|
108 |
+
num_cache_chunks: 1024
|
109 |
+
chunk_excluded_key_prefixes: []
|
110 |
+
chunk_default_fs: null
|
111 |
+
chunk_max_abs_length: null
|
112 |
+
chunk_discard_short_samples: true
|
113 |
+
train_data_path_and_name_and_type:
|
114 |
+
- - dump/raw/overall_scale/wav.scp
|
115 |
+
- audio
|
116 |
+
- kaldi_ark
|
117 |
+
- - dump/raw/overall_scale/metric.scp
|
118 |
+
- metrics
|
119 |
+
- metric
|
120 |
+
- - dump/raw/overall_scale/ref_wav.scp
|
121 |
+
- ref_audio
|
122 |
+
- kaldi_ark
|
123 |
+
valid_data_path_and_name_and_type:
|
124 |
+
- - dump/raw/overall_dev/wav.scp
|
125 |
+
- audio
|
126 |
+
- kaldi_ark
|
127 |
+
- - dump/raw/overall_dev/metric.scp
|
128 |
+
- metrics
|
129 |
+
- metric
|
130 |
+
- - dump/raw/overall_dev/ref_wav.scp
|
131 |
+
- ref_audio
|
132 |
+
- kaldi_ark
|
133 |
+
multi_task_dataset: false
|
134 |
+
allow_variable_data_keys: false
|
135 |
+
max_cache_size: 0.0
|
136 |
+
max_cache_fd: 32
|
137 |
+
allow_multi_rates: false
|
138 |
+
valid_max_cache_size: null
|
139 |
+
exclude_weight_decay: false
|
140 |
+
exclude_weight_decay_conf: {}
|
141 |
+
optim: adamw
|
142 |
+
optim_conf:
|
143 |
+
lr: 0.001
|
144 |
+
scheduler: warmuplr
|
145 |
+
scheduler_conf:
|
146 |
+
warmup_steps: 25000
|
147 |
+
metric2id: dump/raw/overall_scale/metric2id
|
148 |
+
metric2type: dump/raw/overall_scale/metric2type
|
149 |
+
metric_pad_value: -100
|
150 |
+
token_list: null
|
151 |
+
metric_token_info: data/token_list/metric_500_percentile_overall_scale_w-numerical/tokens.json
|
152 |
+
metric_token_pad_value: 0
|
153 |
+
tokenize_numerical_metric: true
|
154 |
+
init: null
|
155 |
+
model_conf: {}
|
156 |
+
use_ref_audio: true
|
157 |
+
use_ref_text: false
|
158 |
+
use_preprocessor: true
|
159 |
+
token_type: bpe
|
160 |
+
bpemodel: null
|
161 |
+
non_linguistic_symbols: null
|
162 |
+
cleaner: null
|
163 |
+
g2p: null
|
164 |
+
sequential_metric: true
|
165 |
+
randomize_sequential_metric: true
|
166 |
+
frontend: s3prl
|
167 |
+
frontend_conf:
|
168 |
+
frontend_conf:
|
169 |
+
upstream: wavlm_large
|
170 |
+
download_dir: ./hub
|
171 |
+
multilayer_feature: true
|
172 |
+
universa: ar_universa
|
173 |
+
universa_conf:
|
174 |
+
embedding_dim: 256
|
175 |
+
audio_encoder_type: transformer
|
176 |
+
audio_encoder_params:
|
177 |
+
num_blocks: 4
|
178 |
+
attention_heads: 4
|
179 |
+
linear_units: 1024
|
180 |
+
dropout_rate: 0.1
|
181 |
+
positional_dropout_rate: 0.1
|
182 |
+
attention_dropout_rate: 0.1
|
183 |
+
input_layer: conv2d
|
184 |
+
normalize_before: true
|
185 |
+
concat_after: false
|
186 |
+
positionwise_layer_type: linear
|
187 |
+
positionwise_conv_kernel_size: 1
|
188 |
+
layer_drop_rate: 0.1
|
189 |
+
qk_norm: false
|
190 |
+
use_flash_attn: false
|
191 |
+
cross_attention_type: multihead
|
192 |
+
cross_attention_params:
|
193 |
+
n_head: 2
|
194 |
+
dropout_rate: 0.1
|
195 |
+
metric_decoder_params:
|
196 |
+
num_blocks: 4
|
197 |
+
attention_heads: 4
|
198 |
+
linear_units: 1024
|
199 |
+
dropout_rate: 0.1
|
200 |
+
positional_dropout_rate: 0.1
|
201 |
+
src_attention_dropout_rate: 0.1
|
202 |
+
self_attention_dropout_rate: 0.1
|
203 |
+
input_layer: embed
|
204 |
+
normalize_before: true
|
205 |
+
concat_after: false
|
206 |
+
layer_drop_rate: 0.1
|
207 |
+
qk_norm: false
|
208 |
+
use_flash_attn: false
|
209 |
+
use_rope: true
|
210 |
+
lsm_weight: 0.1
|
211 |
+
sym_sos: <sos>
|
212 |
+
sym_eos: <eos>
|
213 |
+
required:
|
214 |
+
- output_dir
|
215 |
+
- metric2id
|
216 |
+
version: '202503'
|
217 |
+
distributed: false
|
exp/universa_universa_ar_overall_scale_token_wavlm/images/acc_ar_decoder.png
ADDED
![]() |
exp/universa_universa_ar_overall_scale_token_wavlm/images/backward_time.png
ADDED
![]() |
exp/universa_universa_ar_overall_scale_token_wavlm/images/clip.png
ADDED
![]() |
exp/universa_universa_ar_overall_scale_token_wavlm/images/forward_time.png
ADDED
![]() |
exp/universa_universa_ar_overall_scale_token_wavlm/images/gpu_max_cached_mem_GB.png
ADDED
![]() |
exp/universa_universa_ar_overall_scale_token_wavlm/images/grad_norm.png
ADDED
![]() |
exp/universa_universa_ar_overall_scale_token_wavlm/images/iter_time.png
ADDED
![]() |
exp/universa_universa_ar_overall_scale_token_wavlm/images/loss.png
ADDED
![]() |
exp/universa_universa_ar_overall_scale_token_wavlm/images/loss_ar_decoder.png
ADDED
![]() |
exp/universa_universa_ar_overall_scale_token_wavlm/images/loss_scale.png
ADDED
![]() |
exp/universa_universa_ar_overall_scale_token_wavlm/images/optim0_lr0.png
ADDED
![]() |
exp/universa_universa_ar_overall_scale_token_wavlm/images/optim_step_time.png
ADDED
![]() |
exp/universa_universa_ar_overall_scale_token_wavlm/images/train_time.png
ADDED
![]() |
exp/universa_universa_ar_overall_scale_token_wavlm/images/value_ar_decoder.png
ADDED
![]() |
meta.yaml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
espnet: '202503'
|
2 |
+
files:
|
3 |
+
model_file: exp/universa_universa_ar_overall_scale_token_wavlm/6epoch.pth
|
4 |
+
python: "3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 17:20:25) \n[GCC 12.3.0]"
|
5 |
+
timestamp: 1749800845.586997
|
6 |
+
torch: 2.6.0.dev20241210+cu124
|
7 |
+
yaml_files:
|
8 |
+
train_config: exp/universa_universa_ar_overall_scale_token_wavlm/config.yaml
|