from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

from .configuration_nemotron_h import NemotronHConfig
from .configuration_radio import RADIOConfig

logger = logging.get_logger(__name__)


class NemotronH_Nano_VL_V2_Config(PretrainedConfig):
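    """Configuration for the NemotronH Nano VL V2 vision-language model.

    Composes a RADIO vision-encoder config (`RADIOConfig`) with a NemotronH
    language-model config (`NemotronHConfig`), alongside projector and
    image/video preprocessing options; see `__init__` for the individual fields.
    """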

    model_type = 'NemotronH_Nano_VL_V2'
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        ps_version='v1',
        image_tag_type="internvl",
        projector_hidden_size=4096,
        vit_hidden_size=1280,
        attn_implementation="flash_attention_2",
        video_pruning_rate: float = 0.0,
        **kwargs
    ):
        super().__init__(**kwargs)
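
        # The sub-configurations typically arrive as plain dicts when this config
        # is loaded from a checkpoint; rebuild the dedicated config objects here
        # and fall back to their defaults when nothing is provided.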
        if vision_config is not None:
            self.vision_config = RADIOConfig(**vision_config)
        else:
            self.vision_config = RADIOConfig()

        if llm_config is not None:
            self.llm_config = NemotronHConfig(**llm_config)
        else:
            self.llm_config = NemotronHConfig()

        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.ps_version = ps_version
        self.image_tag_type = image_tag_type
        self.projector_hidden_size = projector_hidden_size
        self.vit_hidden_size = vit_hidden_size
        self.video_pruning_rate = video_pruning_rate
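
        # Propagate the requested attention implementation into both sub-configs.
        # The RADIO vision config exposes only a boolean flash-attention switch,
        # while the NemotronH config takes the implementation string directly.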
        self._attn_implementation = attn_implementation
        self.vision_config.use_flash_attn = (
            self._attn_implementation is not None
            and "flash_attention" in self._attn_implementation
        )
        self.llm_config._attn_implementation = self._attn_implementation