from transformers import PretrainedConfig
from transformers.models.dinov2.configuration_dinov2 import Dinov2Config


class EmbodiedMAEConfig(PretrainedConfig):
    """Configuration for an EmbodiedMAE model.

    The defaults correspond to a ViT-Base-sized backbone (hidden_size=768,
    12 layers, 12 attention heads) with a lightweight MAE decoder.
    """

    model_type = "EmbodiedMAE"

    def __init__(
        self,
        hidden_size: int = 768,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        mlp_ratio: int = 4,
        hidden_dropout_prob: float = 0.0,
        attention_probs_dropout_prob: float = 0.0,
        initializer_range: float = 0.02,
        qkv_bias: bool = True,
        apply_layernorm: bool = True,
        attn_implementation: str = "eager",
        layerscale_value: float = 1.0,
        drop_path_rate: float = 0.0,
        layer_norm_eps: float = 1e-6,
        hidden_act: str = "gelu",
        use_swiglu_ffn: bool = False,
        image_size: int = 224,
        patch_size: int = 16,
        num_pc_centers: int = 196,
        num_pc_knn: int = 64,
        dirichlet_alpha: float = 1.0,
        unmask_sz: int = 98,
        decoder_hidden_size: int = 512,
        decoder_num_hidden_layers: int = 4,
        decoder_num_attention_heads: int = 8,
        norm_pix_loss: bool = False,
        **kwargs,
    ):
        # Backbone transformer hyperparameters.
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.mlp_ratio = mlp_ratio
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range

        self.image_size = image_size
        self.patch_size = patch_size
        self.qkv_bias = qkv_bias
        self.apply_layernorm = apply_layernorm
        self.num_pc_centers = num_pc_centers
        self.num_pc_knn = num_pc_knn
        self.dirichlet_alpha = dirichlet_alpha
        self.unmask_sz = unmask_sz

        # Attention implementation and layer options.
        self._attn_implementation = attn_implementation
        self.layerscale_value = layerscale_value
        self.drop_path_rate = drop_path_rate
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.use_swiglu_ffn = use_swiglu_ffn

        # MAE decoder hyperparameters.
        self.decoder_hidden_size = decoder_hidden_size
        self.decoder_num_hidden_layers = decoder_num_hidden_layers
        self.decoder_num_attention_heads = decoder_num_attention_heads

        # Whether to normalize pixel targets in the reconstruction loss.
        self.norm_pix_loss = norm_pix_loss

        super().__init__(**kwargs)


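# Illustrative usage (a sketch, for reference only): the config can be built
# directly and round-tripped like any other `PretrainedConfig`; the
# "./embodied_mae_cfg" directory below is just a placeholder path.
#
#     config = EmbodiedMAEConfig(hidden_size=1024, num_hidden_layers=24)
#     config.save_pretrained("./embodied_mae_cfg")
#     config = EmbodiedMAEConfig.from_pretrained("./embodied_mae_cfg")

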
# Fields copied from a pretrained Dinov2Config into EmbodiedMAEConfig when
# building a config from a DINOv2 backbone (see get_embodied_mae_config below).
BACKBONE_KWARGS = {
    "hidden_size",
    "num_hidden_layers",
    "num_attention_heads",
    "mlp_ratio",
    "hidden_dropout_prob",
    "attention_probs_dropout_prob",
    "initializer_range",
    "qkv_bias",
    "apply_layernorm",
    "attn_implementation",
    "layerscale_value",
    "drop_path_rate",
    "layer_norm_eps",
    "hidden_act",
    "use_swiglu_ffn",
}


def get_embodied_mae_config(size: str = "base") -> EmbodiedMAEConfig:
    """Build an EmbodiedMAEConfig from the matching pretrained DINOv2 backbone.

    Backbone hyperparameters are copied from `facebook/dinov2-{size}`;
    `norm_pix_loss` is enabled only for the "giant" variant.
    """
    backbone_config = Dinov2Config.from_pretrained(f"facebook/dinov2-{size}")
    kwargs = {k: v for k, v in backbone_config.to_dict().items() if k in BACKBONE_KWARGS}
    norm_pix_loss = (size == "giant")
    return EmbodiedMAEConfig(**kwargs, norm_pix_loss=norm_pix_loss)


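# Illustrative usage (a sketch): the resulting config mirrors the requested
# DINOv2 backbone. Note that `Dinov2Config.from_pretrained` fetches the backbone
# config from the Hugging Face Hub (or a local cache).
#
#     config = get_embodied_mae_config("large")   # uses facebook/dinov2-large
#     assert config.hidden_size == 1024
#     assert config.norm_pix_loss is False

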
__all__ = ["EmbodiedMAEConfig", "get_embodied_mae_config"]