sriting committed on
Commit 293f84c · verified · 1 Parent(s): 130e821

Upload config.json (#40)


- Upload config.json (adb147c3a267df56fe958e16ea49b85a01ce8371)

Files changed (1)
  1. config.json +94 -90
config.json CHANGED
@@ -1,90 +1,94 @@
 {
   "architectures": [
-    "MiniMaxForCausalLM"
+    "MiniMaxText01ForCausalLM"
   ],
   "attention_dropout": 0.0,
-  "layer_types": [
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "full_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "full_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "full_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "full_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "full_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "full_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "full_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "full_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "full_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "linear_attention",
-    "full_attention"
+  "attn_type_list": [
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    1,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    1,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    1,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    1,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    1,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    1,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    1,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    1,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    1,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    1
   ],
+  "auto_map": {
+    "AutoConfig": "configuration_minimax_text_01.MiniMaxText01Config",
+    "AutoModelForCausalLM": "modeling_minimax_text_01.MiniMaxText01ForCausalLM"
+  },
   "bos_token_id": null,
   "eos_token_id": null,
   "head_dim": 128,
@@ -92,14 +96,14 @@
   "hidden_size": 6144,
   "initializer_range": 0.02,
   "intermediate_size": 9216,
-  "full_attn_alpha_factor": 3.5565588200778455,
-  "full_attn_beta_factor": 1.0,
-  "linear_attn_alpha_factor": 3.5565588200778455,
-  "linear_attn_beta_factor": 1.0,
-  "mlp_alpha_factor": 3.5565588200778455,
-  "mlp_beta_factor": 1.0,
+  "layernorm_full_attention_alpha": 3.5565588200778455,
+  "layernorm_full_attention_beta": 1.0,
+  "layernorm_linear_attention_alpha": 3.5565588200778455,
+  "layernorm_linear_attention_beta": 1.0,
+  "layernorm_mlp_alpha": 3.5565588200778455,
+  "layernorm_mlp_beta": 1.0,
   "max_position_embeddings": 10240000,
-  "model_type": "minimax",
+  "model_type": "minimax_text_01",
   "num_attention_heads": 64,
   "num_experts_per_tok": 2,
   "num_hidden_layers": 80,
@@ -119,4 +123,4 @@
   "transformers_version": "4.45.2",
   "use_cache": true,
   "vocab_size": 200064
-}
+}
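
The new "attn_type_list" replaces the string-valued "layer_types": 0 marks a linear-attention layer and 1 a full (softmax) attention layer, so across the 80 hidden layers every eighth layer runs full attention. A minimal Python sketch of that pattern (illustrative only, not code from this repo):

# Layers are numbered 1..80; every 8th layer uses full (softmax)
# attention (1), the remaining layers use linear attention (0).
attn_type_list = [1 if layer % 8 == 0 else 0 for layer in range(1, 81)]

assert len(attn_type_list) == 80   # matches num_hidden_layers
assert sum(attn_type_list) == 10   # ten full-attention layers in total
assert attn_type_list[7] == 1      # layer 8 is the first full-attention layer

With "model_type" now the custom "minimax_text_01" and "auto_map" pointing at configuration_minimax_text_01 / modeling_minimax_text_01, loading goes through transformers' remote-code path rather than a built-in model class. A sketch, assuming the repo id is MiniMaxAI/MiniMax-Text-01:

from transformers import AutoConfig, AutoModelForCausalLM

# trust_remote_code=True lets transformers resolve the classes named in
# "auto_map" from the Python files shipped alongside this config.
repo_id = "MiniMaxAI/MiniMax-Text-01"  # assumed repo id
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)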