Upload 3 files

Browse files

Files changed (3) hide show

speech_gen_ep2.bin +3 -0
wavtokenizer_large_unify_600_24k.ckpt +3 -0
wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml +93 -0

speech_gen_ep2.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:35328d51bf792bc88a6a1cd06912d8a9e29c204f2fb81b520dffeab9f8248ec8
+size 3239597589

wavtokenizer_large_unify_600_24k.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:72182c1b6bd5ea7f84cf3ec78a0a3244cf42daa660b2e9bce23f5d74064d8205
+size 1759224573

wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml ADDED Viewed

	@@ -0,0 +1,93 @@

+seed_everything: 3407
+data:
+  class_path: decoder.dataset.VocosDataModule
+  init_args:
+    train_params:
+      filelist_path: ./WavTokenizer/data/train/libritts_train
+      sampling_rate: 24000
+      num_samples: 72000
+      batch_size: 40  # 20
+      num_workers: 8
+    val_params:
+      filelist_path: ./WavTokenizer/data/infer/librttts_val
+      sampling_rate: 24000
+      num_samples: 72000
+      batch_size: 5   # 10
+      num_workers: 8
+model:
+  class_path: decoder.experiment.WavTokenizer
+  init_args:
+    sample_rate: 24000
+    initial_learning_rate: 2e-4
+    mel_loss_coeff: 45
+    mrd_loss_coeff: 1.0
+    num_warmup_steps: 0  # Optimizer warm-up steps
+    pretrain_mel_steps: 0  # 0 means GAN objective from the first iteration
+    # automatic evaluation
+    evaluate_utmos: true
+    evaluate_pesq: true
+    evaluate_periodicty: true
+    resume: false
+    resume_config: ./WavTokenizer/configs/wavtokenizer_smalldata_frame40_3s_nq1_code16384_dim512_kmeans800_attn.yaml
+    resume_model: ./version_3/checkpoints/xxx.ckpt
+    feature_extractor:
+      class_path: decoder.feature_extractors.EncodecFeatures
+      init_args:
+        encodec_model: encodec_24khz
+        bandwidths: [6.6, 6.6, 6.6, 6.6]
+        train_codebooks: true
+        num_quantizers: 1
+        dowmsamples: [6, 5, 5, 4]
+        vq_bins: 4096
+        vq_kmeans: 200
+    backbone:
+      class_path: decoder.models.VocosBackbone
+      init_args:
+        input_channels: 512
+        dim: 768
+        intermediate_dim: 2304
+        num_layers: 12
+        adanorm_num_embeddings: 4
+    head:
+      class_path: decoder.heads.ISTFTHead
+      init_args:
+        dim: 768
+        n_fft: 2400
+        hop_length: 600
+        padding: same
+trainer:
+  logger:
+    class_path: pytorch_lightning.loggers.TensorBoardLogger
+    init_args:
+      save_dir: ./WavTokenizer/result/train/wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn/
+  callbacks:
+    - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+    - class_path: pytorch_lightning.callbacks.ModelSummary
+      init_args:
+        max_depth: 2
+    - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+      init_args:
+        monitor: val_loss
+        filename: wavtokenizer_checkpoint_{epoch}_{step}_{val_loss:.4f}
+        save_top_k: 10
+        save_last: true
+    - class_path: decoder.helpers.GradNormCallback
+  # Lightning counts max_steps across all optimizer steps (rather than the number of batches).
+  # With max_steps: 20000000 this equals 10M steps for the generator and 10M for the discriminator.
+  max_steps: 20000000
+  # You might want to limit val batches when evaluating all the metrics, as they are time-consuming
+  limit_val_batches: 200
+  accelerator: gpu
+  strategy: ddp
+  devices: [0,1,2,3,4,5,6,7]
+  log_every_n_steps: 1000