import os
from typing import Literal, Optional, Union

from transformers import SiglipVisionConfig
from transformers.configuration_utils import PretrainedConfig


class UnivaVisionTowerConfig(PretrainedConfig):
    """Configuration for a Univa vision tower: selects the vision backbone,
    the multimodal projector, and an optional shortcut projector."""

    model_type = "univa_vision_tower"

    def __init__(
        self,
        vision_tower_type: Literal["siglip"] = "siglip",
        vision_tower_config: Optional[Union[str, dict]] = None,
        mm_projector_type: str = "mlp2x_gelu",
        feature_select_layer: int = -1,
        output_hidden_size: int = 1152,
        shortcut_projector_type: Optional[str] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.vision_tower_type = vision_tower_type
        self.mm_projector_type = mm_projector_type
        self.feature_select_layer = feature_select_layer
        self.output_hidden_size = output_hidden_size
        self.shortcut_projector_type = shortcut_projector_type

        # Resolve the config class for the requested vision tower backbone.
        if vision_tower_type == "siglip":
            config_cls = SiglipVisionConfig
        else:
            raise ValueError(f"Unknown vision tower type: {vision_tower_type}")

        # Build the nested vision tower config from a local path, a dict of
        # keyword arguments, or the backbone's defaults. Rejecting any other
        # type avoids silently leaving `vision_tower_config` unset.
        if vision_tower_config is None:
            self.vision_tower_config = config_cls()
        elif isinstance(vision_tower_config, str):
            if not os.path.exists(vision_tower_config):
                raise FileNotFoundError(
                    f"Vision tower config file not found: {vision_tower_config}"
                )
            self.vision_tower_config = config_cls.from_pretrained(vision_tower_config)
        elif isinstance(vision_tower_config, dict):
            self.vision_tower_config = config_cls(**vision_tower_config)
        else:
            raise TypeError(
                "vision_tower_config must be a path, a dict, or None; "
                f"got {type(vision_tower_config).__name__}"
            )
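

# A minimal usage sketch, not part of the original file. The parameter values
# below are hypothetical and only illustrate the accepted forms of
# `vision_tower_config` (local path, dict, or None).
if __name__ == "__main__":
    # Build from a dict of SigLIP vision parameters.
    config = UnivaVisionTowerConfig(
        vision_tower_type="siglip",
        vision_tower_config={"hidden_size": 1152, "num_hidden_layers": 27},
        feature_select_layer=-2,
    )
    print(type(config.vision_tower_config).__name__)  # -> SiglipVisionConfig

    # Omitting vision_tower_config falls back to the backbone's defaults.
    default_config = UnivaVisionTowerConfig()
    print(default_config.output_hidden_size)  # -> 1152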