inclusionAI
/

MingTok-Audio

Safetensors

Model card · Files and versions

xet

Community

yongjielv committed on Sep 29

Commit

a1075ac

verified ·

1 Parent(s): b32eaba

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

config.json +148 -0
model.safetensors +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,148 @@

+{
+  "architectures": [
+    "AudioVAE"
+  ],
+  "dec_kwargs": {
+    "backbone": {
+      "_attn_implementation": "flash_attention_2",
+      "attention_dropout": 0.0,
+      "attn_implementation": null,
+      "bos_token_id": 151643,
+      "eos_token_id": 151645,
+      "hidden_act": "silu",
+      "hidden_size": 896,
+      "initializer_range": 0.02,
+      "intermediate_size": 4864,
+      "is_causal": true,
+      "max_position_embeddings": 32768,
+      "max_window_layers": 0,
+      "model_type": "qwen2",
+      "num_attention_heads": 14,
+      "num_hidden_layers": 24,
+      "num_key_value_heads": 2,
+      "rms_norm_eps": 1e-06,
+      "rope_theta": 1000000.0,
+      "sliding_window": 32,
+      "tie_word_embeddings": true,
+      "torch_dtype": "bfloat16",
+      "transformers_version": "4.43.1",
+      "use_cache": false,
+      "use_sliding_window": true,
+      "vocab_size": 1
+    },
+    "latent_dim": 64,
+    "output_dim": 320
+  },
+  "enc_kwargs": {
+    "backbone": {
+      "_attn_implementation": "flash_attention_2",
+      "attention_dropout": 0.0,
+      "attn_implementation": null,
+      "bos_token_id": 151643,
+      "eos_token_id": 151645,
+      "hidden_act": "silu",
+      "hidden_size": 896,
+      "initializer_range": 0.02,
+      "intermediate_size": 4864,
+      "is_causal": true,
+      "max_position_embeddings": 32768,
+      "max_window_layers": 0,
+      "model_type": "qwen2",
+      "num_attention_heads": 14,
+      "num_hidden_layers": 24,
+      "num_key_value_heads": 2,
+      "rms_norm_eps": 1e-06,
+      "rope_theta": 1000000.0,
+      "sliding_window": 32,
+      "tie_word_embeddings": true,
+      "torch_dtype": "bfloat16",
+      "transformers_version": "4.43.1",
+      "use_cache": false,
+      "use_sliding_window": true,
+      "vocab_size": 1
+    },
+    "hop_size": 320,
+    "input_dim": 320,
+    "latent_dim": 64
+  },
+  "hifi_gan_disc_kwargs": {
+    "channel_increasing_factor": 4,
+    "channels": 16,
+    "max_downsample_channels": 512,
+    "periods": [
+      2,
+      3,
+      5,
+      7,
+      11
+    ]
+  },
+  "init_method": "kaiming",
+  "lambda_adv": 1.0,
+  "lambda_disc": 1.0,
+  "lambda_feat_match_loss": 1.0,
+  "lambda_mel_loss": 1.0,
+  "lambda_semantic": 2.0,
+  "patch_size": -1,
+  "semantic_module_kwargs": {
+    "causal": true,
+    "whisper_encoder": {
+      "n_ctx": 1500,
+      "n_head": 20,
+      "n_layer": 32,
+      "n_mels": 128,
+      "n_state": 1280
+    }
+  },
+  "spec_disc_kwargs": {
+    "channels": 32,
+    "downsample_scales": [
+      2,
+      2,
+      2
+    ],
+    "in_channels": 1,
+    "kernel_sizes": [
+      5,
+      3
+    ],
+    "max_downsample_channels": 512,
+    "out_channels": 1,
+    "stft_params": {
+      "fft_sizes": [
+        78,
+        126,
+        206,
+        334,
+        542,
+        876,
+        1418,
+        2296
+      ],
+      "hop_sizes": [
+        39,
+        63,
+        103,
+        167,
+        271,
+        438,
+        709,
+        1148
+      ],
+      "win_lengths": [
+        78,
+        126,
+        206,
+        334,
+        542,
+        876,
+        1418,
+        2296
+      ],
+      "window": "hann_window"
+    },
+    "use_weight_norm": true
+  },
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c36d876de086d13eb1cdcfb9d08e22c3d806cd7893d64fdaf7ea6d30b7d521cd
+size 2700431196