antoine-dandi committed on
Commit
4dabcc9
·
verified ·
1 Parent(s): 0349b87

Upload folder using huggingface_hub

Browse files
added_tokens.json CHANGED
@@ -1,3 +1,3 @@
1
  {
2
- "<image>": 257152
3
  }
 
1
  {
2
+ "<image_soft_token>": 262144
3
  }
chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
config.json CHANGED
@@ -1,280 +1,343 @@
1
  {
2
- "architectures": [
3
- "InferenceModel"
4
- ],
5
- "dtype": "bfloat16",
6
- "model_args_dict": {
7
- "action_dim": 7,
8
- "action_head_args": {
9
- "action_head_type": "LINEAR"
10
- },
11
- "action_horizon": 10,
12
- "action_time_encoder_args": {
13
- "action_time_encoder_type": "EARLY_FUSE"
 
 
 
 
 
14
  },
15
- "action_token_id": 1000000,
16
- "adapter_args": {
17
- "activation_class": "GELU",
18
- "adapter_type": "FFN",
19
- "hidden_dim": "output_dim",
20
- "input_norm": false,
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  "norm_instance": {
22
  "norm_class": "RMS",
23
  "norm_eps": 1e-05
24
  },
25
- "output_norm": false,
26
- "pooling_factor_per_dim": 1,
27
- "pre_project_norm": false,
28
- "use_biases": true,
29
- "use_gating": false
30
  },
31
- "decoder_args": {
32
- "activation_class": "SILU",
33
- "checkpoint": false,
34
- "dim": 256,
35
- "dropout": 0.0,
36
- "head_dim": 32,
37
- "hidden_dim": 768,
38
- "n_heads": 16,
39
- "n_kv_heads": 8,
40
- "n_layers": 4,
41
- "norm_args": {
42
- "attention_norm": true,
43
- "ffn_norm": true,
44
- "norm_instance": {
45
- "norm_class": "RMS",
46
- "norm_eps": 1e-05
47
- },
48
- "post_attention_norm": false,
49
- "post_ffn_norm": false,
50
- "qk_norm": false
51
- },
52
- "pos_embed": "ROPE_1D",
53
- "rope_freqs_split": null,
54
- "rope_theta": 1000000.0,
55
- "total_parameter_count": 3147776,
56
- "use_attn_wk_biases": false,
57
- "use_attn_wo_biases": false,
58
- "use_attn_wq_biases": false,
59
- "use_attn_wv_biases": false,
60
- "use_ffn_biases": false,
61
- "use_ffn_gating": false
62
- },
63
- "env_processor_args": {
64
- "action_space": {
65
- "format": "default",
66
- "horizon": 10,
67
- "names": null,
68
- "tensors": {
69
- "action": {
70
  "dtype": "float32",
71
  "high": Infinity,
72
  "low": -Infinity,
73
  "names": [
74
- "action_0",
75
- "action_1",
76
- "action_2",
77
- "action_3",
78
- "action_4",
79
- "action_5",
80
- "action_6"
81
  ]
 
 
 
 
82
  }
83
- },
84
- "video_info": {
85
- "has_audio": false,
86
- "video.codec": "h264",
87
- "video.fps": 10.0,
88
- "video.is_depth_map": false,
89
- "video.pix_fmt": "yuv420p"
90
- }
91
  },
92
- "input_action_transforms_config": [
93
- {
94
- "norm_type": "MIN_MAX",
95
- "predefined_stats": null,
96
- "train_only": false,
97
- "type": "normalize_action"
98
- }
99
- ],
100
- "input_observation_transforms_config": [
101
- {
102
- "norm_type": "MIN_MAX",
103
- "train_only": false,
104
- "type": "normalize_observation"
105
- }
106
- ],
107
- "observation_space": {
108
- "format": "default",
109
- "images": {
110
- "observation.images.image": {
111
- "height": 256,
112
- "n_channels": 3,
113
- "video_info": {
114
- "has_audio": false,
115
- "video.codec": "h264",
116
- "video.fps": 10.0,
117
- "video.is_depth_map": false,
118
- "video.pix_fmt": "yuv420p"
119
- },
120
- "width": 256
 
 
 
 
 
 
121
  },
122
- "observation.images.wrist_image": {
123
- "height": 256,
124
- "n_channels": 3,
125
- "video_info": {
126
- "has_audio": false,
127
- "video.codec": "h264",
128
- "video.fps": 10.0,
129
- "video.is_depth_map": false,
130
- "video.pix_fmt": "yuv420p"
131
- },
132
- "width": 256
133
- }
134
  },
135
- "names": null,
136
- "state": {
137
- "observation.state": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  "dtype": "float64",
139
  "high": Infinity,
140
  "low": -Infinity,
141
  "names": [
142
- "observation.state_0",
143
- "observation.state_1",
144
- "observation.state_2",
145
- "observation.state_3",
146
- "observation.state_4",
147
- "observation.state_5",
148
- "observation.state_6",
149
- "observation.state_7"
150
  ]
 
 
 
 
151
  }
152
- },
153
- "video_info": {
154
- "has_audio": false,
155
- "video.codec": "h264",
156
- "video.fps": 10.0,
157
- "video.is_depth_map": false,
158
- "video.pix_fmt": "yuv420p"
159
- }
160
  },
161
- "output_action_transforms_config": [
162
- {
163
- "norm_type": "MIN_MAX",
164
- "predefined_stats": null,
165
- "train_only": false,
166
- "type": "denormalize_action"
167
- }
168
- ],
169
- "output_observation_transforms_config": [
170
- {
171
- "norm_type": "MIN_MAX",
172
- "train_only": false,
173
- "type": "denormalize_observation"
174
- }
175
- ]
176
  },
177
- "image_encoder_args": {
178
- "image_pooling_args": {
179
- "pooling_type": "NONE"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  "max_image_size": 224,
182
- "output_norm": true,
183
- "patch_norm": false,
184
  "patch_size": 16,
185
- "total_parameter_count": null,
186
- "transformer_args": {
187
- "activation_class": "GELU",
188
- "checkpoint": false,
189
- "dim": 384,
190
- "dropout": 0.0,
191
- "head_dim": 64,
192
- "hidden_dim": 1536,
193
- "n_heads": 6,
194
- "n_kv_heads": 6,
195
- "n_layers": 12,
196
- "norm_args": {
197
- "attention_norm": true,
198
- "ffn_norm": true,
199
- "norm_instance": {
200
- "norm_class": "LN",
201
- "norm_eps": 1e-05
202
- },
203
- "post_attention_norm": false,
204
- "post_ffn_norm": false,
205
- "qk_norm": false
206
- },
207
- "pos_embed": "ROPE_2D_PIXTRAL",
208
- "rope_freqs_split": null,
209
- "rope_theta": 100.0,
210
- "total_parameter_count": null,
211
- "use_attn_wk_biases": false,
212
- "use_attn_wo_biases": true,
213
- "use_attn_wq_biases": true,
214
- "use_attn_wv_biases": true,
215
- "use_ffn_biases": true,
216
- "use_ffn_gating": false
217
- },
218
- "use_patch_conv_biases": true
219
  },
220
- "image_token_id": 257152,
221
- "img_encoding_tokens": [
222
- 257152
223
- ],
224
- "model_parallel": 1,
225
- "model_type": "vla_flow_matching",
226
- "processor_args": {
227
- "action_tokenizer_args": {
228
- "fault_tolerant_decoding": true,
229
- "max_action": 2.0,
230
- "min_action": -2.0,
231
- "n_bins": null,
232
- "tokenizer_type": "CONTINUOUS"
233
- },
234
- "annotation_formatter_args": {
235
- "formatter_type": "COMPACT_DISCRETE"
236
- },
237
- "decoder_attention_args": {
238
- "pattern": "PREFIX_LM"
239
- },
240
- "image_preprocess_args": {
241
- "interpolation_mode": "BICUBIC",
242
- "max_image_size": 224,
243
- "patch_size": 14,
244
- "pooling_factor_per_dim": 1,
245
- "resize_mode": "FIXED",
246
- "rgb_normalization": "SIGLIP"
247
- },
248
- "instruct_template_args": {
249
- "template_type": "GENESIS"
250
- },
251
- "tokenizer_args": {
252
- "tokenizer_type": "PALIGEMMA"
253
- }
254
  },
255
- "return_logits": false,
 
 
 
 
 
256
  "stats": {
257
- "max": {
258
- "action": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gIAAAAAAAAO4/AAAAAAAA7j8AAAAAAADuP2JFkd4FxNY/AAAAAAAA2D8AAAAAAADYPwAAAAAAAPA/",
259
- "observation.state": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gILhg3iCw68o/k6sW4NQK2T8G2+PgLdv1P/Zh3l8UXw1AzENkYDZ8DED1a0Zhci72PyIXLgCMraU/d7WDnxpWVj8="
260
- },
261
- "mean": {
262
- "action": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gIGsI8fVzErA/gXzoBjM7tj+4siNZsCK3v5NCzU4OuEE/HFafSIMddz8Q28zXG2t1v2RY5imCaqm/",
263
- "observation.state": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gIEPofz1P0ae/quqrzBCeoT+6ZvLNNnfoP1TiOsYVxwdAnCrnpFo4zL8UkQZc/BLAvxh7L75oj5s/STJpW+bXm78="
264
- },
265
- "min": {
266
- "action": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gIAAAAAAAAO6/AAAAAAAA7r8AAAAAAADuvyU7ujqVhtC/AAAAAAAA2L/2sg7DHoXXvwAAAAAAAPC/",
267
- "observation.state": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gIPwSlh2H5t6/2aTsPhHV1L8pQDXagaWAP9p1yv7Uk9Y/OhZGQKYhDcCcIwJA23v9v5qKfZeMQla/TLXkK2CGpb8="
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  },
269
- "std": {
270
- "action": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gILniXnA4edU/93RUuXk42D+pIibybnbcP2UvwrG6F6Q/jCP/YYU6sD9U86g63PWzP0k2sobm9e8/",
271
- "observation.state": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gIATcHHyb3bo/fyFYExNtwz8IIZwmnjnYP1uca3eTCNY/Grp+ZrUF7T8HiDiMONPUP7WEIRdBCI0/Zf0kRujKjD8="
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  }
273
- },
274
- "total_parameter_count": null,
275
- "use_output_biases": false,
276
- "vocab_size": 257216
277
  },
278
- "model_type": "inference_model",
279
- "transformers_version": "4.57.0"
 
280
  }
 
1
  {
2
+ "action_dim": 7,
3
+ "action_head_args": {
4
+ "action_head_type": "LINEAR"
5
+ },
6
+ "action_horizon": 10,
7
+ "action_time_encoder_args": {
8
+ "action_time_encoder_type": "EARLY_FUSE"
9
+ },
10
+ "action_token_id": 1000000,
11
+ "adapter_args": {
12
+ "activation_class": "GELU",
13
+ "adapter_type": "FFN",
14
+ "hidden_dim": "output_dim",
15
+ "input_norm": false,
16
+ "norm_instance": {
17
+ "norm_class": "RMS",
18
+ "norm_eps": 1e-05
19
  },
20
+ "output_norm": false,
21
+ "pooling_factor_per_dim": 1,
22
+ "pre_project_norm": false,
23
+ "use_biases": true,
24
+ "use_gating": false
25
+ },
26
+ "decoder_args": {
27
+ "activation_class": "SILU",
28
+ "checkpoint": false,
29
+ "dim": 256,
30
+ "dropout": 0.0,
31
+ "head_dim": 32,
32
+ "hidden_dim": 768,
33
+ "n_heads": 16,
34
+ "n_kv_heads": 8,
35
+ "n_layers": 4,
36
+ "norm_args": {
37
+ "attention_norm": true,
38
+ "ffn_norm": true,
39
  "norm_instance": {
40
  "norm_class": "RMS",
41
  "norm_eps": 1e-05
42
  },
43
+ "post_attention_norm": false,
44
+ "post_ffn_norm": false,
45
+ "qk_norm": false
 
 
46
  },
47
+ "pos_embed": "ROPE_1D",
48
+ "rope_freqs_split": null,
49
+ "rope_theta": 1000000.0,
50
+ "total_parameter_count": 3147776,
51
+ "use_attn_wk_biases": false,
52
+ "use_attn_wo_biases": false,
53
+ "use_attn_wq_biases": false,
54
+ "use_attn_wv_biases": false,
55
+ "use_ffn_biases": false,
56
+ "use_ffn_gating": false
57
+ },
58
+ "env_processor_args": {
59
+ "action_space": {
60
+ "format": "default",
61
+ "horizon": 10,
62
+ "names": null,
63
+ "tensors": {
64
+ "action": [
65
+ {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  "dtype": "float32",
67
  "high": Infinity,
68
  "low": -Infinity,
69
  "names": [
70
+ "action_{i}",
71
+ "action_{i}",
72
+ "action_{i}",
73
+ "action_{i}",
74
+ "action_{i}",
75
+ "action_{i}",
76
+ "action_{i}"
77
  ]
78
+ },
79
+ {
80
+ "tensor_decoder_type": "default",
81
+ "type": "gs_vla.third_party.libero.data.LiberoAction"
82
  }
83
+ ]
 
 
 
 
 
 
 
84
  },
85
+ "video_info": {
86
+ "has_audio": false,
87
+ "video.codec": "h264",
88
+ "video.fps": 10.0,
89
+ "video.is_depth_map": false,
90
+ "video.pix_fmt": "yuv420p"
91
+ }
92
+ },
93
+ "input_action_transforms_config": [
94
+ {
95
+ "norm_type": "MIN_MAX",
96
+ "predefined_stats": null,
97
+ "train_only": false,
98
+ "type": "normalize_action"
99
+ }
100
+ ],
101
+ "input_observation_transforms_config": [
102
+ {
103
+ "norm_type": "MIN_MAX",
104
+ "train_only": false,
105
+ "type": "normalize_observation"
106
+ }
107
+ ],
108
+ "observation_space": {
109
+ "format": "default",
110
+ "images": {
111
+ "observation.images.image": {
112
+ "height": 256,
113
+ "n_channels": 3,
114
+ "video_info": {
115
+ "has_audio": false,
116
+ "video.codec": "h264",
117
+ "video.fps": 10.0,
118
+ "video.is_depth_map": false,
119
+ "video.pix_fmt": "yuv420p"
120
  },
121
+ "width": 256
 
 
 
 
 
 
 
 
 
 
 
122
  },
123
+ "observation.images.wrist_image": {
124
+ "height": 256,
125
+ "n_channels": 3,
126
+ "video_info": {
127
+ "has_audio": false,
128
+ "video.codec": "h264",
129
+ "video.fps": 10.0,
130
+ "video.is_depth_map": false,
131
+ "video.pix_fmt": "yuv420p"
132
+ },
133
+ "width": 256
134
+ }
135
+ },
136
+ "names": null,
137
+ "state": {
138
+ "observation.state": [
139
+ {
140
  "dtype": "float64",
141
  "high": Infinity,
142
  "low": -Infinity,
143
  "names": [
144
+ "observation.state_{i}",
145
+ "observation.state_{i}",
146
+ "observation.state_{i}",
147
+ "observation.state_{i}",
148
+ "observation.state_{i}",
149
+ "observation.state_{i}",
150
+ "observation.state_{i}",
151
+ "observation.state_{i}"
152
  ]
153
+ },
154
+ {
155
+ "tensor_decoder_type": "default",
156
+ "type": "gs_vla.third_party.libero.data.LiberoObservation"
157
  }
158
+ ]
 
 
 
 
 
 
 
159
  },
160
+ "video_info": {
161
+ "has_audio": false,
162
+ "video.codec": "h264",
163
+ "video.fps": 10.0,
164
+ "video.is_depth_map": false,
165
+ "video.pix_fmt": "yuv420p"
166
+ }
 
 
 
 
 
 
 
 
167
  },
168
+ "output_action_transforms_config": [
169
+ {
170
+ "norm_type": "MIN_MAX",
171
+ "predefined_stats": null,
172
+ "train_only": false,
173
+ "type": "denormalize_action"
174
+ }
175
+ ],
176
+ "output_observation_transforms_config": [
177
+ {
178
+ "norm_type": "MIN_MAX",
179
+ "train_only": false,
180
+ "type": "denormalize_observation"
181
+ }
182
+ ]
183
+ },
184
+ "image_encoder_args": {
185
+ "image_pooling_args": {
186
+ "pooling_type": "NONE"
187
+ },
188
+ "max_image_size": 224,
189
+ "num_register_tokens": 4,
190
+ "output_norm": true,
191
+ "patch_norm": false,
192
+ "patch_size": 16,
193
+ "total_parameter_count": null,
194
+ "transformer_args": {
195
+ "activation_class": "GELU",
196
+ "checkpoint": false,
197
+ "dim": 384,
198
+ "dropout": 0.0,
199
+ "head_dim": 64,
200
+ "hidden_dim": 1536,
201
+ "n_heads": 6,
202
+ "n_kv_heads": 6,
203
+ "n_layers": 12,
204
+ "norm_args": {
205
+ "attention_norm": true,
206
+ "ffn_norm": true,
207
+ "norm_instance": {
208
+ "norm_class": "LN",
209
+ "norm_eps": 1e-05
210
+ },
211
+ "post_attention_norm": false,
212
+ "post_ffn_norm": false,
213
+ "qk_norm": false
214
  },
215
+ "pos_embed": "ROPE_2D_DINO",
216
+ "rope_freqs_split": null,
217
+ "rope_theta": 100.0,
218
+ "total_parameter_count": null,
219
+ "use_attn_wk_biases": false,
220
+ "use_attn_wo_biases": true,
221
+ "use_attn_wq_biases": true,
222
+ "use_attn_wv_biases": true,
223
+ "use_ffn_biases": true,
224
+ "use_ffn_gating": false
225
+ },
226
+ "use_cls_token": true,
227
+ "use_patch_conv_biases": true
228
+ },
229
+ "image_token_id": 262144,
230
+ "img_encoding_tokens": [
231
+ 262144,
232
+ 255999,
233
+ 256000
234
+ ],
235
+ "model_parallel": 1,
236
+ "model_type": "vla_flow_matching",
237
+ "processor_args": {
238
+ "action_tokenizer_args": {
239
+ "fault_tolerant_decoding": true,
240
+ "max_action": 2.0,
241
+ "min_action": -2.0,
242
+ "n_bins": null,
243
+ "tokenizer_type": "CONTINUOUS"
244
+ },
245
+ "annotation_formatter_args": {
246
+ "formatter_type": "COMPACT_DISCRETE"
247
+ },
248
+ "bottleneck": null,
249
+ "decoder_attention_args": {
250
+ "pattern": "BIDIR_ON_IMAGE"
251
+ },
252
+ "image_preprocess_args": {
253
+ "interpolation_mode": "BICUBIC",
254
  "max_image_size": 224,
 
 
255
  "patch_size": 16,
256
+ "pooling_factor_per_dim": 1,
257
+ "resize_mode": "FIXED",
258
+ "rgb_normalization": "IMAGENET"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  },
260
+ "instruct_template_args": {
261
+ "template_type": "GENESIS"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  },
263
+ "tokenizer_args": {
264
+ "tokenizer_type": "GEMMA3"
265
+ }
266
+ },
267
+ "return_logits": false,
268
+ "stats": {
269
  "stats": {
270
+ "action": {
271
+ "max": {
272
+ "batch_dims": [],
273
+ "decoder": {
274
+ "tensor_decoder_type": "default",
275
+ "type": "gs_vla.third_party.libero.data.LiberoAction"
276
+ },
277
+ "tensor": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gIAAAAAAAAO4/AAAAAAAA7j8AAAAAAADuP2JFkd4FxNY/AAAAAAAA2D8AAAAAAADYPwAAAAAAAPA/"
278
+ },
279
+ "mean": {
280
+ "batch_dims": [],
281
+ "decoder": {
282
+ "tensor_decoder_type": "default",
283
+ "type": "gs_vla.third_party.libero.data.LiberoAction"
284
+ },
285
+ "tensor": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gIGsI8fVzErA/gXzoBjM7tj+4siNZsCK3v5NCzU4OuEE/HFafSIMddz8Q28zXG2t1v2RY5imCaqm/"
286
+ },
287
+ "min": {
288
+ "batch_dims": [],
289
+ "decoder": {
290
+ "tensor_decoder_type": "default",
291
+ "type": "gs_vla.third_party.libero.data.LiberoAction"
292
+ },
293
+ "tensor": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gIAAAAAAAAO6/AAAAAAAA7r8AAAAAAADuvyU7ujqVhtC/AAAAAAAA2L/2sg7DHoXXvwAAAAAAAPC/"
294
+ },
295
+ "std": {
296
+ "batch_dims": [],
297
+ "decoder": {
298
+ "tensor_decoder_type": "default",
299
+ "type": "gs_vla.third_party.libero.data.LiberoAction"
300
+ },
301
+ "tensor": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gILniXnA4edU/93RUuXk42D+pIibybnbcP2UvwrG6F6Q/jCP/YYU6sD9U86g63PWzP0k2sobm9e8/"
302
+ }
303
  },
304
+ "observation.state": {
305
+ "max": {
306
+ "batch_dims": [],
307
+ "decoder": {
308
+ "tensor_decoder_type": "default",
309
+ "type": "gs_vla.third_party.libero.data.LiberoObservation"
310
+ },
311
+ "tensor": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gILhg3iCw68o/k6sW4NQK2T8G2+PgLdv1P/Zh3l8UXw1AzENkYDZ8DED1a0Zhci72PyIXLgCMraU/d7WDnxpWVj8="
312
+ },
313
+ "mean": {
314
+ "batch_dims": [],
315
+ "decoder": {
316
+ "tensor_decoder_type": "default",
317
+ "type": "gs_vla.third_party.libero.data.LiberoObservation"
318
+ },
319
+ "tensor": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gIEPofz1P0ae/quqrzBCeoT+6ZvLNNnfoP1TiOsYVxwdAnCrnpFo4zL8UkQZc/BLAvxh7L75oj5s/STJpW+bXm78="
320
+ },
321
+ "min": {
322
+ "batch_dims": [],
323
+ "decoder": {
324
+ "tensor_decoder_type": "default",
325
+ "type": "gs_vla.third_party.libero.data.LiberoObservation"
326
+ },
327
+ "tensor": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gIPwSlh2H5t6/2aTsPhHV1L8pQDXagaWAP9p1yv7Uk9Y/OhZGQKYhDcCcIwJA23v9v5qKfZeMQla/TLXkK2CGpb8="
328
+ },
329
+ "std": {
330
+ "batch_dims": [],
331
+ "decoder": {
332
+ "tensor_decoder_type": "default",
333
+ "type": "gs_vla.third_party.libero.data.LiberoObservation"
334
+ },
335
+ "tensor": "OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gIATcHHyb3bo/fyFYExNtwz8IIZwmnjnYP1uca3eTCNY/Grp+ZrUF7T8HiDiMONPUP7WEIRdBCI0/Zf0kRujKjD8="
336
+ }
337
  }
338
+ }
 
 
 
339
  },
340
+ "total_parameter_count": null,
341
+ "use_output_biases": false,
342
+ "vocab_size": 262208
343
  }
consolidated.00.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:754d25c36a092cbad6607cc91bea6fb4169c8c5ebb6539875cd95c004d433ee4
3
- size 356783550
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a641c01209f255dcf4373e1a2d962cbed6544c9a3a8746d0d399530e4a65b612
3
+ size 361903222
model_args.yaml CHANGED
@@ -54,17 +54,19 @@ env_processor_args:
54
  names: null
55
  tensors:
56
  action:
57
- dtype: float32
58
  high: .inf
59
  low: -.inf
60
  names:
61
- - action_0
62
- - action_1
63
- - action_2
64
- - action_3
65
- - action_4
66
- - action_5
67
- - action_6
 
 
68
  video_info:
69
  has_audio: false
70
  video.codec: h264
@@ -106,18 +108,20 @@ env_processor_args:
106
  names: null
107
  state:
108
  observation.state:
109
- dtype: float64
110
  high: .inf
111
  low: -.inf
112
  names:
113
- - observation.state_0
114
- - observation.state_1
115
- - observation.state_2
116
- - observation.state_3
117
- - observation.state_4
118
- - observation.state_5
119
- - observation.state_6
120
- - observation.state_7
 
 
121
  video_info:
122
  has_audio: false
123
  video.codec: h264
@@ -137,6 +141,7 @@ image_encoder_args:
137
  image_pooling_args:
138
  pooling_type: NONE
139
  max_image_size: 224
 
140
  output_norm: true
141
  patch_norm: false
142
  patch_size: 16
@@ -160,7 +165,7 @@ image_encoder_args:
160
  post_attention_norm: false
161
  post_ffn_norm: false
162
  qk_norm: false
163
- pos_embed: ROPE_2D_PIXTRAL
164
  rope_freqs_split: null
165
  rope_theta: 100.0
166
  total_parameter_count: null
@@ -170,10 +175,13 @@ image_encoder_args:
170
  use_attn_wv_biases: true
171
  use_ffn_biases: true
172
  use_ffn_gating: false
 
173
  use_patch_conv_biases: true
174
- image_token_id: 257152
175
  img_encoding_tokens:
176
- - 257152
 
 
177
  model_parallel: 1
178
  model_type: vla_flow_matching
179
  processor_args:
@@ -185,33 +193,73 @@ processor_args:
185
  tokenizer_type: CONTINUOUS
186
  annotation_formatter_args:
187
  formatter_type: COMPACT_DISCRETE
 
188
  decoder_attention_args:
189
- pattern: PREFIX_LM
190
  image_preprocess_args:
191
  interpolation_mode: BICUBIC
192
  max_image_size: 224
193
- patch_size: 14
194
  pooling_factor_per_dim: 1
195
  resize_mode: FIXED
196
- rgb_normalization: SIGLIP
197
  instruct_template_args:
198
  template_type: GENESIS
199
  tokenizer_args:
200
- tokenizer_type: PALIGEMMA
201
  return_logits: false
202
  stats:
203
- max:
204
- action: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gIAAAAAAAAO4/AAAAAAAA7j8AAAAAAADuP2JFkd4FxNY/AAAAAAAA2D8AAAAAAADYPwAAAAAAAPA/
205
- observation.state: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gILhg3iCw68o/k6sW4NQK2T8G2+PgLdv1P/Zh3l8UXw1AzENkYDZ8DED1a0Zhci72PyIXLgCMraU/d7WDnxpWVj8=
206
- mean:
207
- action: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gIGsI8fVzErA/gXzoBjM7tj+4siNZsCK3v5NCzU4OuEE/HFafSIMddz8Q28zXG2t1v2RY5imCaqm/
208
- observation.state: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gIEPofz1P0ae/quqrzBCeoT+6ZvLNNnfoP1TiOsYVxwdAnCrnpFo4zL8UkQZc/BLAvxh7L75oj5s/STJpW+bXm78=
209
- min:
210
- action: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gIAAAAAAAAO6/AAAAAAAA7r8AAAAAAADuvyU7ujqVhtC/AAAAAAAA2L/2sg7DHoXXvwAAAAAAAPC/
211
- observation.state: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gIPwSlh2H5t6/2aTsPhHV1L8pQDXagaWAP9p1yv7Uk9Y/OhZGQKYhDcCcIwJA23v9v5qKfZeMQla/TLXkK2CGpb8=
212
- std:
213
- action: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gILniXnA4edU/93RUuXk42D+pIibybnbcP2UvwrG6F6Q/jCP/YYU6sD9U86g63PWzP0k2sobm9e8/
214
- observation.state: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gIATcHHyb3bo/fyFYExNtwz8IIZwmnjnYP1uca3eTCNY/Grp+ZrUF7T8HiDiMONPUP7WEIRdBCI0/Zf0kRujKjD8=
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  total_parameter_count: null
216
  use_output_biases: false
217
- vocab_size: 257216
 
54
  names: null
55
  tensors:
56
  action:
57
+ - dtype: float32
58
  high: .inf
59
  low: -.inf
60
  names:
61
+ - action_{i}
62
+ - action_{i}
63
+ - action_{i}
64
+ - action_{i}
65
+ - action_{i}
66
+ - action_{i}
67
+ - action_{i}
68
+ - tensor_decoder_type: default
69
+ type: gs_vla.third_party.libero.data.LiberoAction
70
  video_info:
71
  has_audio: false
72
  video.codec: h264
 
108
  names: null
109
  state:
110
  observation.state:
111
+ - dtype: float64
112
  high: .inf
113
  low: -.inf
114
  names:
115
+ - observation.state_{i}
116
+ - observation.state_{i}
117
+ - observation.state_{i}
118
+ - observation.state_{i}
119
+ - observation.state_{i}
120
+ - observation.state_{i}
121
+ - observation.state_{i}
122
+ - observation.state_{i}
123
+ - tensor_decoder_type: default
124
+ type: gs_vla.third_party.libero.data.LiberoObservation
125
  video_info:
126
  has_audio: false
127
  video.codec: h264
 
141
  image_pooling_args:
142
  pooling_type: NONE
143
  max_image_size: 224
144
+ num_register_tokens: 4
145
  output_norm: true
146
  patch_norm: false
147
  patch_size: 16
 
165
  post_attention_norm: false
166
  post_ffn_norm: false
167
  qk_norm: false
168
+ pos_embed: ROPE_2D_DINO
169
  rope_freqs_split: null
170
  rope_theta: 100.0
171
  total_parameter_count: null
 
175
  use_attn_wv_biases: true
176
  use_ffn_biases: true
177
  use_ffn_gating: false
178
+ use_cls_token: true
179
  use_patch_conv_biases: true
180
+ image_token_id: 262144
181
  img_encoding_tokens:
182
+ - 262144
183
+ - 255999
184
+ - 256000
185
  model_parallel: 1
186
  model_type: vla_flow_matching
187
  processor_args:
 
193
  tokenizer_type: CONTINUOUS
194
  annotation_formatter_args:
195
  formatter_type: COMPACT_DISCRETE
196
+ bottleneck: null
197
  decoder_attention_args:
198
+ pattern: BIDIR_ON_IMAGE
199
  image_preprocess_args:
200
  interpolation_mode: BICUBIC
201
  max_image_size: 224
202
+ patch_size: 16
203
  pooling_factor_per_dim: 1
204
  resize_mode: FIXED
205
+ rgb_normalization: IMAGENET
206
  instruct_template_args:
207
  template_type: GENESIS
208
  tokenizer_args:
209
+ tokenizer_type: GEMMA3
210
  return_logits: false
211
  stats:
212
+ stats:
213
+ action:
214
+ max:
215
+ batch_dims: []
216
+ decoder:
217
+ tensor_decoder_type: default
218
+ type: gs_vla.third_party.libero.data.LiberoAction
219
+ tensor: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gIAAAAAAAAO4/AAAAAAAA7j8AAAAAAADuP2JFkd4FxNY/AAAAAAAA2D8AAAAAAADYPwAAAAAAAPA/
220
+ mean:
221
+ batch_dims: []
222
+ decoder:
223
+ tensor_decoder_type: default
224
+ type: gs_vla.third_party.libero.data.LiberoAction
225
+ tensor: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gIGsI8fVzErA/gXzoBjM7tj+4siNZsCK3v5NCzU4OuEE/HFafSIMddz8Q28zXG2t1v2RY5imCaqm/
226
+ min:
227
+ batch_dims: []
228
+ decoder:
229
+ tensor_decoder_type: default
230
+ type: gs_vla.third_party.libero.data.LiberoAction
231
+ tensor: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gIAAAAAAAAO6/AAAAAAAA7r8AAAAAAADuvyU7ujqVhtC/AAAAAAAA2L/2sg7DHoXXvwAAAAAAAPC/
232
+ std:
233
+ batch_dims: []
234
+ decoder:
235
+ tensor_decoder_type: default
236
+ type: gs_vla.third_party.libero.data.LiberoAction
237
+ tensor: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbN10sImRhdGFfb2Zmc2V0cyI6WzAsNTZdfX0gILniXnA4edU/93RUuXk42D+pIibybnbcP2UvwrG6F6Q/jCP/YYU6sD9U86g63PWzP0k2sobm9e8/
238
+ observation.state:
239
+ max:
240
+ batch_dims: []
241
+ decoder:
242
+ tensor_decoder_type: default
243
+ type: gs_vla.third_party.libero.data.LiberoObservation
244
+ tensor: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gILhg3iCw68o/k6sW4NQK2T8G2+PgLdv1P/Zh3l8UXw1AzENkYDZ8DED1a0Zhci72PyIXLgCMraU/d7WDnxpWVj8=
245
+ mean:
246
+ batch_dims: []
247
+ decoder:
248
+ tensor_decoder_type: default
249
+ type: gs_vla.third_party.libero.data.LiberoObservation
250
+ tensor: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gIEPofz1P0ae/quqrzBCeoT+6ZvLNNnfoP1TiOsYVxwdAnCrnpFo4zL8UkQZc/BLAvxh7L75oj5s/STJpW+bXm78=
251
+ min:
252
+ batch_dims: []
253
+ decoder:
254
+ tensor_decoder_type: default
255
+ type: gs_vla.third_party.libero.data.LiberoObservation
256
+ tensor: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gIPwSlh2H5t6/2aTsPhHV1L8pQDXagaWAP9p1yv7Uk9Y/OhZGQKYhDcCcIwJA23v9v5qKfZeMQla/TLXkK2CGpb8=
257
+ std:
258
+ batch_dims: []
259
+ decoder:
260
+ tensor_decoder_type: default
261
+ type: gs_vla.third_party.libero.data.LiberoObservation
262
+ tensor: OAAAAAAAAAB7IiI6eyJkdHlwZSI6IkY2NCIsInNoYXBlIjpbOF0sImRhdGFfb2Zmc2V0cyI6WzAsNjRdfX0gIATcHHyb3bo/fyFYExNtwz8IIZwmnjnYP1uca3eTCNY/Grp+ZrUF7T8HiDiMONPUP7WEIRdBCI0/Zf0kRujKjD8=
263
  total_parameter_count: null
264
  use_output_biases: false
265
+ vocab_size: 262208
special_tokens_map.json CHANGED
@@ -1,7 +1,5 @@
1
  {
2
- "additional_special_tokens": [
3
- "<image>"
4
- ],
5
  "bos_token": {
6
  "content": "<bos>",
7
  "lstrip": false,
@@ -9,6 +7,7 @@
9
  "rstrip": false,
10
  "single_word": false
11
  },
 
12
  "eos_token": {
13
  "content": "<eos>",
14
  "lstrip": false,
@@ -16,6 +15,7 @@
16
  "rstrip": false,
17
  "single_word": false
18
  },
 
19
  "pad_token": {
20
  "content": "<pad>",
21
  "lstrip": false,
 
1
  {
2
+ "boi_token": "<start_of_image>",
 
 
3
  "bos_token": {
4
  "content": "<bos>",
5
  "lstrip": false,
 
7
  "rstrip": false,
8
  "single_word": false
9
  },
10
+ "eoi_token": "<end_of_image>",
11
  "eos_token": {
12
  "content": "<eos>",
13
  "lstrip": false,
 
15
  "rstrip": false,
16
  "single_word": false
17
  },
18
+ "image_token": "<image_soft_token>",
19
  "pad_token": {
20
  "content": "<pad>",
21
  "lstrip": false,
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d20a85d1cb83be179f5892faf2fdaf9044d94bbcbd668f4d03cecefd7789483
3
- size 34387120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
tokenizer.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8986bb4f423f07f8c7f70d0dbe3526fb2316056c17bae71b1ea975e77a168fc6
3
- size 4264023
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff