import torch
from transformers import Qwen2_5OmniThinkerTextModel, Qwen2_5OmniThinkerForConditionalGeneration
from transformers.cache_utils import Cache
from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import Qwen2_5OmniThinkerConfig
class BidirectQwen2_5OmniThinkerTextModel(Qwen2_5OmniThinkerTextModel):
    """Qwen2.5-Omni thinker text model with bidirectional (non-causal) self-attention."""

    def __init__(self, config):
        super().__init__(config)
        # Disable causal masking inside every self-attention layer.
        for layer in self.layers:
            layer.self_attn.is_causal = False

    # Override the _update_causal_mask method to generate bi-directional attention.
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        calculated_attention_mask = super()._update_causal_mask(
            attention_mask,
            input_tensor,
            cache_position,
            past_key_values,
            output_attentions,
        )
        if calculated_attention_mask is None:
            return None
        if self.config._attn_implementation == "flash_attention_2":
            # Flash attention expects the 2D padding mask directly when padding is present.
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
        # Expand the 2D padding mask into a 4D additive mask without the causal triangle.
        causal_mask = _prepare_4d_attention_mask(
            attention_mask,
            dtype=input_tensor.dtype,
        )
        return causal_mask
class NVOmniEmbedConfig(Qwen2_5OmniThinkerConfig):
    model_type = "nvomniembed"


class NVOmniEmbedModel(Qwen2_5OmniThinkerForConditionalGeneration):
    config_class = NVOmniEmbedConfig

    def __init__(self, config):
        super().__init__(config)
        # Swap the causal text model for the bidirectional variant defined above.
        self.model = BidirectQwen2_5OmniThinkerTextModel._from_config(
            config.text_config, attn_implementation=config._attn_implementation
        )
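

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original model code).
# It assumes a text-only forward pass and a hypothetical local checkpoint path;
# mean pooling over the last hidden state is one common way to obtain a single
# embedding per input once the causal mask has been disabled.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    model = NVOmniEmbedModel.from_pretrained("path/to/nvomniembed-checkpoint")  # hypothetical path
    model.eval()

    input_ids = torch.tensor([[1, 2, 3, 4]])      # toy token ids
    attention_mask = torch.ones_like(input_ids)   # no padding in this toy batch

    with torch.no_grad():
        # Call the bidirectional text model directly for text-only inputs.
        outputs = model.model(input_ids=input_ids, attention_mask=attention_mask)

    # With bidirectional attention, every token sees the full sequence, so mean
    # pooling the last hidden state yields a sequence-level embedding.
    embeddings = outputs.last_hidden_state.mean(dim=1)
    print(embeddings.shape)  # (batch_size, hidden_size)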