|
|
|
|
|
|
|
from transformers.configuration_utils import PretrainedConfig |
|
from transformers.modeling_rope_utils import rope_config_validation |
|
|
|
|
|
class Qwen2_5_VLVisionConfig(PretrainedConfig):
    """Configuration of the Qwen2.5-VL vision tower.

    Every constructor argument is stored unchanged as an attribute of the same
    name; remaining keyword arguments are forwarded to `PretrainedConfig`.

    Args:
        depth (`int`, *optional*, defaults to 32):
            Number of `Qwen2_5_VLVisionBlock` layers built by the vision model.
        hidden_size (`int`, *optional*, defaults to 3584):
            Width of the patch embedding and of every vision block.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Key looked up in `ACT2FN` by the vision MLP.
        intermediate_size (`int`, *optional*, defaults to 3420):
            Inner width of the vision MLP.
        num_heads (`int`, *optional*, defaults to 16):
            Number of attention heads in each vision block.
        in_channels (`int`, *optional*, defaults to 3):
            Input channels of the patch-embedding convolution.
        patch_size (`int`, *optional*, defaults to 14):
            Spatial kernel/stride of the patch-embedding convolution.
        spatial_merge_size (`int`, *optional*, defaults to 2):
            The patch merger groups `spatial_merge_size ** 2` patches together.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            Temporal kernel/stride of the patch-embedding convolution.
        tokens_per_second (`int`, *optional*, defaults to 4):
            Stored as-is; not consumed by the vision modules visible in this
            file section (NOTE(review): presumably used for temporal position
            ids elsewhere -- confirm).
        window_size (`int`, *optional*, defaults to 112):
            Window extent used to partition patches for windowed attention.
        out_hidden_size (`int`, *optional*, defaults to 3584):
            Output width of the patch merger.
        fullatt_block_indexes (`list[int]`, *optional*, defaults to `[7, 15, 23, 31]`):
            Indices of the blocks that attend over whole images/frames instead
            of windows.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Stored as-is (NOTE(review): presumably the weight-init std --
            confirm against the model's `_init_weights`).
    """

    model_type = "qwen2_5_vl"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=32,
        hidden_size=3584,
        hidden_act="silu",
        intermediate_size=3420,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=112,
        out_hidden_size=3584,
        fullatt_block_indexes=[7, 15, 23, 31],
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        # Store a private copy: the signature default is a single list object
        # shared by every call, so keeping a reference to it would let a
        # mutation on one config leak into all other default-constructed ones
        # (and into the caller's own list when one is passed in).
        self.fullatt_block_indexes = (
            list(fullatt_block_indexes) if fullatt_block_indexes is not None else None
        )
        self.out_hidden_size = out_hidden_size
        self.initializer_range = initializer_range
|
|
|
|
|
class Qwen2_5_VLTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen2_5_VLTextModel`]. It is used to instantiate a
    Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 152064):
            Vocabulary size of the Qwen2_5_VL model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`Qwen2_5_VLModel`]
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 29568):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 80):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed explicitly, it falls back to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 80):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly. A legacy `type` key is also accepted: it is copied to `rope_type`, and the value `"mrope"` is
            rewritten to `"default"`. This normalization edits the dictionary that was passed in.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        image_token_id (`int`, *optional*):
            Token index used as placeholder for image embeddings.
        video_token_id (`int`, *optional*):
            Token index used as placeholder for video embeddings.

    ```python
    >>> from transformers import Qwen2_5_VLTextModel, Qwen2_5_VLTextConfig

    >>> # Initializing a Qwen2_5_VL text configuration
    >>> configuration = Qwen2_5_VLTextConfig()

    >>> # Initializing a text model from that configuration
    >>> model = Qwen2_5_VLTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen2_5_vl_text"
    base_config_key = "text_config"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Tensor-parallel sharding style per projection (module-name pattern -> style).
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    # Pipeline-parallel plan: module name -> (input names, output names).
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        rope_scaling=None,
        image_token_id=None,
        video_token_id=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # An explicit `None` means "one key/value head per attention head".
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        # Legacy-key handling: when the dict carries a `type` entry, mirror it
        # into `rope_type`, first rewriting "mrope" to "default" so that the
        # validator below sees a rope type it knows. The dict is edited in
        # place, i.e. the object handed in by the caller is modified.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        # `mrope_section` is excluded from the validator's key checks.
        rope_config_validation(self, ignore_keys={"mrope_section"})
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
|
|
|
|
|
class Qwen2_5_VLConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen2_5_VLModel`]. It is used to instantiate a
    Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen2_5_VLTextConfig`):
            The config object or dictionary of the text backbone. When omitted, the text config is built from the
            remaining keyword arguments.
        vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen2_5_VLVisionConfig`):
            The config object or dictionary of the vision backbone.
        image_token_id (`int`, *optional*, defaults to 151655):
            The image token index to encode the image prompt.
        video_token_id (`int`, *optional*, defaults to 151656):
            The video token index to encode the video prompt.

    ```python
    >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig

    >>> # Initializing a Qwen2_5_VL style configuration
    >>> configuration = Qwen2_5_VLConfig()

    >>> # Initializing a model from the Qwen2-VL-7B style configuration
    >>> model = Qwen2_5_VLForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen2_5_vl"
    sub_configs = {"vision_config": Qwen2_5_VLVisionConfig, "text_config": Qwen2_5_VLTextConfig}
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=151655,
        video_token_id=151656,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            # An already-built config object (allowed by the documented type)
            # is used as-is; previously this case left the attribute unset.
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            # No nested text config given: build it from the flat keyword
            # arguments, which are also forwarded to the base class below.
            self.text_config = self.sub_configs["text_config"](**kwargs)
        else:
            # Same as for `vision_config`: accept a ready-made config object.
            self.text_config = text_config

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id

        super().__init__(**kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import math |
|
from dataclasses import dataclass |
|
from typing import Any, Dict, List, Optional, Tuple, Union |
|
|
|
import torch |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
|
|
from transformers.activations import ACT2FN |
|
from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache |
|
from transformers.generation import GenerationMixin |
|
from transformers.modeling_attn_mask_utils import AttentionMaskConverter |
|
from transformers.modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available |
|
from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput |
|
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update |
|
from transformers.modeling_utils import PreTrainedModel |
|
from transformers.utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging |
|
|
|
|
|
if is_flash_attn_available(): |
|
from transformers.modeling_flash_attention_utils import apply_rotary_emb, flash_attn_varlen_func |
|
|
|
|
|
if is_flash_attn_available(): |
|
from transformers.modeling_flash_attention_utils import _flash_attention_forward |
|
|
|
if is_torch_flex_attn_available(): |
|
from torch.nn.attention.flex_attention import BlockMask |
|
|
|
from transformers.integrations.flex_attention import make_flex_block_causal_mask |
|
|
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
|
|
class Qwen2_5_VLMLP(nn.Module):
    """Gated feed-forward block: ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``.

    Layer widths come from ``config.hidden_size`` / ``config.intermediate_size``
    and the activation is looked up in ``ACT2FN`` by ``config.hidden_act``.
    ``bias`` toggles the bias term of all three linear layers.
    """

    def __init__(self, config, bias: bool = False):
        super().__init__()
        model_width = config.hidden_size
        inner_width = config.intermediate_size
        self.hidden_size = model_width
        self.intermediate_size = inner_width
        # Creation order (gate, up, down) is kept so parameter registration
        # and random initialisation order stay the same.
        self.gate_proj = nn.Linear(model_width, inner_width, bias=bias)
        self.up_proj = nn.Linear(model_width, inner_width, bias=bias)
        self.down_proj = nn.Linear(inner_width, model_width, bias=bias)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_state):
        """Apply the gated MLP to ``hidden_state`` and return the projection."""
        gated = self.act_fn(self.gate_proj(hidden_state))
        lifted = self.up_proj(hidden_state)
        return self.down_proj(gated * lifted)
|
|
|
|
|
class Qwen2_5_VisionPatchEmbed(nn.Module):
    """Project flattened patches to ``embed_dim`` with a bias-free 3D convolution.

    The convolution's kernel and stride are both
    ``(temporal_patch_size, patch_size, patch_size)``, so each input patch
    yields exactly one embedding vector.
    """

    def __init__(
        self,
        patch_size: int = 14,
        temporal_patch_size: int = 2,
        in_channels: int = 3,
        embed_dim: int = 1152,
    ) -> None:
        super().__init__()
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.in_channels = in_channels
        self.embed_dim = embed_dim

        patch_extent = (temporal_patch_size, patch_size, patch_size)
        self.proj = nn.Conv3d(
            in_channels,
            embed_dim,
            kernel_size=patch_extent,
            stride=patch_extent,
            bias=False,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return a ``(num_patches, embed_dim)`` tensor of patch embeddings."""
        # Recover the (channels, time, height, width) layout of every patch.
        patches = hidden_states.view(
            -1,
            self.in_channels,
            self.temporal_patch_size,
            self.patch_size,
            self.patch_size,
        )
        # Match the convolution weights' dtype before projecting.
        patches = patches.to(dtype=self.proj.weight.dtype)
        return self.proj(patches).view(-1, self.embed_dim)
|
|
|
|
|
class Qwen2_5_VisionRotaryEmbedding(nn.Module):
    """Table of rotary angles ``position * inv_freq`` for the vision tower."""

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim
        inverse_frequencies = 1.0 / (theta**exponents)
        # Non-persistent: rebuilt on construction, never stored in checkpoints.
        self.register_buffer("inv_freq", inverse_frequencies, persistent=False)

    def forward(self, seqlen: int) -> torch.Tensor:
        """Return the ``(seqlen, dim // 2)`` outer product of positions and ``inv_freq``."""
        positions = torch.arange(
            seqlen,
            device=self.inv_freq.device,
            dtype=self.inv_freq.dtype,
        )
        return torch.outer(positions, self.inv_freq)
|
|
|
|
|
class Qwen2RMSNorm(nn.Module):
    """Root-mean-square layer normalisation with a learned per-feature scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Qwen2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalise over the last dimension in float32, then rescale by ``weight``."""
        incoming_dtype = hidden_states.dtype
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalised = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        # Cast back to the input dtype before applying the learned scale.
        return self.weight * normalised.to(incoming_dtype)

    def extra_repr(self):
        """Show the weight shape and epsilon in the module's repr."""
        shape = tuple(self.weight.shape)
        return f"{shape}, eps={self.variance_epsilon}"
|
|
|
|
|
class Qwen2_5_VLPatchMerger(nn.Module):
    """Normalise patch features, group ``spatial_merge_size ** 2`` of them, and project to ``dim``."""

    def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
        super().__init__()
        merged_width = context_dim * (spatial_merge_size**2)
        self.hidden_size = merged_width
        self.ln_q = Qwen2RMSNorm(context_dim, eps=1e-6)
        projection_layers = [
            nn.Linear(merged_width, merged_width),
            nn.GELU(),
            nn.Linear(merged_width, dim),
        ]
        self.mlp = nn.Sequential(*projection_layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the merged features, one row per group of patches."""
        normalised = self.ln_q(x)
        # Consecutive rows are concatenated into one row of width `hidden_size`.
        grouped = normalised.view(-1, self.hidden_size)
        return self.mlp(grouped)
|
|
|
|
|
def apply_rotary_pos_emb_flashatt(
    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Rotate ``q`` and ``k`` with `apply_rotary_emb`, computing in float32.

    Only the first half of the last dimension of ``cos``/``sin`` is passed on.
    Each result is cast back to the dtype of its own input.
    """
    cos_half = cos.chunk(2, dim=-1)[0].contiguous().float()
    sin_half = sin.chunk(2, dim=-1)[0].contiguous().float()
    rotated_q = apply_rotary_emb(q.float(), cos_half, sin_half)
    rotated_k = apply_rotary_emb(k.float(), cos_half, sin_half)
    return rotated_q.type_as(q), rotated_k.type_as(k)
|
|
|
|
|
class Qwen2_5_VLVisionFlashAttention2(nn.Module):
    """Vision self-attention evaluated with `flash_attn_varlen_func`."""

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: Optional[torch.Tensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> torch.Tensor:
        """Attend within each segment delimited by ``cu_seqlens``.

        ``position_embeddings`` is a ``(cos, sin)`` pair; when it is ``None``
        the pair is derived from ``rotary_pos_emb`` and a one-time deprecation
        warning is logged.
        """
        seq_length = hidden_states.shape[0]
        fused = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1)
        q, k, v = fused.permute(1, 0, 2, 3).unbind(0)

        if position_embeddings is not None:
            cos, sin = position_embeddings
        else:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            angles = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
            cos, sin = angles.cos(), angles.sin()

        # The rotary helper is called with a leading singleton dim, removed again afterwards.
        q, k = apply_rotary_pos_emb_flashatt(q.unsqueeze(0), k.unsqueeze(0), cos, sin)
        q, k = q.squeeze(0), k.squeeze(0)

        segment_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
        max_seqlen = segment_lengths.max().item()
        attended = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen)
        attended = attended.reshape(seq_length, -1)
        return self.proj(attended)
|
|
|
|
|
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    midpoint = x.shape[-1] // 2
    front, back = x[..., :midpoint], x[..., midpoint:]
    # Negated back part first, followed by the front part.
    return torch.cat((-back, front), dim=-1)
|
|
|
|
|
def apply_rotary_pos_emb_vision(
    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply rotary position embeddings to ``q`` and ``k``.

    The arithmetic is done in float32; ``cos``/``sin`` receive an extra
    second-to-last dimension so they broadcast against ``q``/``k``. Each
    output is cast back to the dtype of its own input.
    """
    q_dtype, k_dtype = q.dtype, k.dtype
    cos = cos.unsqueeze(-2).float()
    sin = sin.unsqueeze(-2).float()
    q_fp32 = q.float()
    k_fp32 = k.float()
    rotated_q = (q_fp32 * cos) + (rotate_half(q_fp32) * sin)
    rotated_k = (k_fp32 * cos) + (rotate_half(k_fp32) * sin)
    return rotated_q.to(q_dtype), rotated_k.to(k_dtype)
|
|
|
|
|
class Qwen2_5_VLVisionAttention(nn.Module):
    """Eager (explicit matmul + softmax) vision self-attention."""

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: Optional[torch.Tensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> torch.Tensor:
        """Attend within each segment delimited by ``cu_seqlens``.

        ``position_embeddings`` is a ``(cos, sin)`` pair; when it is ``None``
        the pair is derived from ``rotary_pos_emb`` and a one-time deprecation
        warning is logged.
        """
        seq_length = hidden_states.shape[0]
        fused = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1)
        q, k, v = fused.permute(1, 0, 2, 3).unbind(0)

        if position_embeddings is not None:
            cos, sin = position_embeddings
        else:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            angles = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
            cos, sin = angles.cos(), angles.sin()
        q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)

        # Additive mask: the dtype's most negative value everywhere except the
        # diagonal blocks, so positions only see their own segment.
        blocked = torch.finfo(q.dtype).min
        attention_mask = torch.full(
            [1, seq_length, seq_length], blocked, device=q.device, dtype=q.dtype
        )
        for boundary in range(1, len(cu_seqlens)):
            start, stop = cu_seqlens[boundary - 1], cu_seqlens[boundary]
            attention_mask[..., start:stop, start:stop] = 0

        # Move heads to the front: (heads, seq, head_dim).
        q, k, v = q.transpose(0, 1), k.transpose(0, 1), v.transpose(0, 1)
        scores = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
        scores = scores + attention_mask
        # Softmax in float32, then back to the query dtype.
        probabilities = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(q.dtype)
        context = torch.matmul(probabilities, v)
        context = context.transpose(0, 1).reshape(seq_length, -1)
        return self.proj(context)
|
|
|
|
|
class Qwen2_5_VLVisionSdpaAttention(nn.Module):
    """Vision self-attention evaluated with `F.scaled_dot_product_attention`."""

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: Optional[torch.Tensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> torch.Tensor:
        """Attend within each segment delimited by ``cu_seqlens``.

        ``position_embeddings`` is a ``(cos, sin)`` pair; when it is ``None``
        the pair is derived from ``rotary_pos_emb`` and a one-time deprecation
        warning is logged.
        """
        seq_length = hidden_states.shape[0]
        fused = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1)
        q, k, v = fused.permute(1, 0, 2, 3).unbind(0)

        if position_embeddings is not None:
            cos, sin = position_embeddings
        else:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            angles = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
            cos, sin = angles.cos(), angles.sin()
        q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)

        # Boolean mask: True only inside the diagonal blocks, so positions
        # attend solely to their own segment.
        attention_mask = torch.zeros(
            [1, seq_length, seq_length], device=q.device, dtype=torch.bool
        )
        for boundary in range(1, len(cu_seqlens)):
            start, stop = cu_seqlens[boundary - 1], cu_seqlens[boundary]
            attention_mask[..., start:stop, start:stop] = True

        # Heads first, plus a leading singleton dim for the SDPA call.
        q_heads = q.transpose(0, 1).unsqueeze(0)
        k_heads = k.transpose(0, 1).unsqueeze(0)
        v_heads = v.transpose(0, 1).unsqueeze(0)
        context = F.scaled_dot_product_attention(
            q_heads, k_heads, v_heads, attention_mask, dropout_p=0.0
        )
        context = context.squeeze(0).transpose(0, 1).reshape(seq_length, -1)
        return self.proj(context)
|
|
|
|
|
# Registry mapping an attention-implementation name to the vision attention
# class that provides it; `Qwen2_5_VLVisionBlock` indexes this dict with its
# `attn_implementation` argument.
QWEN2_5_VL_VISION_ATTENTION_CLASSES = {
    "eager": Qwen2_5_VLVisionAttention,
    "flash_attention_2": Qwen2_5_VLVisionFlashAttention2,
    "sdpa": Qwen2_5_VLVisionSdpaAttention,
}
|
|
|
|
|
class Qwen2_5_VLVisionBlock(nn.Module):
    """Pre-norm vision transformer layer: attention and MLP, each with a residual add."""

    def __init__(self, config, attn_implementation: str = "sdpa") -> None:
        super().__init__()
        width = config.hidden_size
        self.norm1 = Qwen2RMSNorm(width, eps=1e-6)
        self.norm2 = Qwen2RMSNorm(width, eps=1e-6)
        attention_cls = QWEN2_5_VL_VISION_ATTENTION_CLASSES[attn_implementation]
        self.attn = attention_cls(width, num_heads=config.num_heads)
        self.mlp = Qwen2_5_VLMLP(config, bias=True)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: Optional[torch.Tensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> torch.Tensor:
        """Run the layer; positional arguments are passed through to the attention module."""
        attention_out = self.attn(
            self.norm1(hidden_states),
            cu_seqlens=cu_seqlens,
            rotary_pos_emb=rotary_pos_emb,
            position_embeddings=position_embeddings,
        )
        hidden_states = hidden_states + attention_out
        mlp_out = self.mlp(self.norm2(hidden_states))
        return hidden_states + mlp_out
|
|
|
|
|
@auto_docstring
class Qwen2_5_VLPreTrainedModel(PreTrainedModel):
    # Shared base class: declares capability flags and weight initialisation.
    config_class = Qwen2_5_VLConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_static_cache = False

    def _init_weights(self, module):
        """Initialise ``module`` in place according to its type.

        Linear/Conv3d and Embedding weights are drawn from a zero-mean normal
        whose std is the text config's ``initializer_range``; biases and the
        padding row are zeroed; `Qwen2RMSNorm` weights are set to one. Other
        module types are left untouched.
        """
        std = self.config.get_text_config().initializer_range

        if isinstance(module, (nn.Linear, nn.Conv3d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            pad_row = module.padding_idx
            if pad_row is not None:
                module.weight.data[pad_row].zero_()
            return

        if isinstance(module, Qwen2RMSNorm):
            module.weight.data.fill_(1.0)
|
|
|
|
|
class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel): |
|
config_class = Qwen2_5_VLVisionConfig |
|
_no_split_modules = ["Qwen2_5_VLVisionBlock"] |
|
|
|
def __init__(self, config, *inputs, **kwargs) -> None:
    """Build the vision tower: patch embedding, rotary table, blocks, merger."""
    super().__init__(config, *inputs, **kwargs)

    # Plain settings copied from the config.
    merge_size = config.spatial_merge_size
    self.spatial_merge_size = merge_size
    self.patch_size = config.patch_size
    self.fullatt_block_indexes = config.fullatt_block_indexes
    self.window_size = config.window_size
    self.spatial_merge_unit = merge_size * merge_size

    # Sub-modules are created in the same order as before so parameter
    # registration and random initialisation are unaffected.
    self.patch_embed = Qwen2_5_VisionPatchEmbed(
        patch_size=config.patch_size,
        temporal_patch_size=config.temporal_patch_size,
        in_channels=config.in_channels,
        embed_dim=config.hidden_size,
    )

    per_head_width = config.hidden_size // config.num_heads
    self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(per_head_width // 2)

    vision_layers = [
        Qwen2_5_VLVisionBlock(config, config._attn_implementation)
        for _ in range(config.depth)
    ]
    self.blocks = nn.ModuleList(vision_layers)
    self.merger = Qwen2_5_VLPatchMerger(
        dim=config.out_hidden_size,
        context_dim=config.hidden_size,
        spatial_merge_size=merge_size,
    )
    self.gradient_checkpointing = False
|
|
|
def rot_pos_emb(self, grid_thw):
    """Return rotary angles for every patch described by ``grid_thw``.

    For each ``(t, h, w)`` row a (row, column) index pair is produced per
    patch, ordered so that the patches of one ``spatial_merge_size`` x
    ``spatial_merge_size`` group are adjacent, and repeated ``t`` times.
    The pairs index the rotary table and the two lookups are flattened
    into one vector per patch.
    """
    merge = self.spatial_merge_size

    def _group_by_merge_window(ids, height, width):
        # (h, w) grid -> merge-window-major flat order.
        tiled = ids.reshape(height // merge, merge, width // merge, merge)
        return tiled.permute(0, 2, 1, 3).flatten()

    per_item_ids = []
    for t, h, w in grid_thw:
        row_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
        col_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
        hw_ids = torch.stack(
            [
                _group_by_merge_window(row_ids, h, w),
                _group_by_merge_window(col_ids, h, w),
            ],
            dim=-1,
        )
        per_item_ids.append(hw_ids.repeat(t, 1))

    pos_ids = torch.cat(per_item_ids, dim=0)
    # One table large enough for the biggest height/width in the batch.
    max_grid_size = grid_thw[:, 1:].max()
    angle_table = self.rotary_pos_emb(max_grid_size)
    return angle_table[pos_ids].flatten(1)
|
|
|
def get_window_index(self, grid_thw):
    """Compute the window-major ordering of merged patches and window boundaries.

    Args:
        grid_thw: iterable of ``(t, h, w)`` triples, one per image/video
            (elements are expected to support ``.item()``, i.e. tensors).

    Returns:
        A tuple ``(window_index, cu_window_seqlens)``: ``window_index`` is a
        1-D tensor of merged-patch indices reordered window by window, and
        ``cu_window_seqlens`` is a Python list of cumulative window lengths
        (scaled by ``spatial_merge_unit``) that starts at 0.
    """
    window_index: list = []
    cu_window_seqlens: list = [0]
    # Offset added to each item's indices so they are unique across items.
    window_index_id = 0
    # Window side length expressed in merged patches.
    vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size

    for grid_t, grid_h, grid_w in grid_thw:
        # Grid size after spatial merging.
        llm_grid_h, llm_grid_w = (
            grid_h // self.spatial_merge_size,
            grid_w // self.spatial_merge_size,
        )
        index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
        # Pad up to a multiple of the window size. When a side is already an
        # exact multiple, the padding is one full window, which yields windows
        # containing only padding (length 0 in `seqlens` below).
        pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
        pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
        num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
        num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
        # -100 marks padded positions so they can be filtered out afterwards.
        index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
        index_padded = index_padded.reshape(
            grid_t,
            num_windows_h,
            vit_merger_window_size,
            num_windows_w,
            vit_merger_window_size,
        )
        # Bring the two window axes together: (t, windows, win_h, win_w).
        index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
            grid_t,
            num_windows_h * num_windows_w,
            vit_merger_window_size,
            vit_merger_window_size,
        )
        # Number of real (non-padding) entries in every window.
        seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
        index_padded = index_padded.reshape(-1)
        index_new = index_padded[index_padded != -100]
        window_index.append(index_new + window_index_id)
        # Cumulative lengths in un-merged patch units, continued from the previous item.
        cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
        cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
        window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
    window_index = torch.cat(window_index, dim=0)

    return window_index, cu_window_seqlens
|
|
|
def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
    """
    Run the vision tower: patch embedding, window re-ordering, the attention blocks and the merger.

    Args:
        hidden_states (`torch.Tensor`):
            Input passed to `self.patch_embed`. The embedded result must be 2-D, `(seq_len, hidden_size)`.
        grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
            The temporal, height and width grid sizes of each image or video.

    Returns:
        `torch.Tensor`: the output of `self.merger`, with rows put back into the original (pre-window) order.
    """
    hidden_states = self.patch_embed(hidden_states)
    rotary_pos_emb = self.rot_pos_emb(grid_thw)
    window_index, window_boundaries = self.get_window_index(grid_thw)

    # While tracing, the cumulative lengths keep the dtype of `grid_thw`; otherwise they are int32.
    seqlen_dtype = grid_thw.dtype if torch.jit.is_tracing() else torch.int32

    # `get_window_index` may repeat a boundary for windows without real tokens; collapse those repeats.
    cu_window_seqlens = torch.unique_consecutive(
        torch.tensor(window_boundaries, device=hidden_states.device, dtype=seqlen_dtype)
    )

    seq_len, _ = hidden_states.size()
    merge_unit = self.spatial_merge_unit

    def to_window_order(tensor):
        # Permute whole groups of `merge_unit` consecutive rows, then flatten back to `(seq_len, -1)`.
        grouped = tensor.reshape(seq_len // merge_unit, merge_unit, -1)
        return grouped[window_index, :, :].reshape(seq_len, -1)

    hidden_states = to_window_order(hidden_states)
    rotary_pos_emb = to_window_order(rotary_pos_emb)

    doubled = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
    position_embeddings = (doubled.cos(), doubled.sin())

    # Full-attention boundaries: one segment of `h * w` tokens per frame, prefixed with 0.
    tokens_per_frame = grid_thw[:, 1] * grid_thw[:, 2]
    cu_seqlens = torch.repeat_interleave(tokens_per_frame, grid_thw[:, 0]).cumsum(dim=0, dtype=seqlen_dtype)
    cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)

    for layer_num, blk in enumerate(self.blocks):
        # Layers listed in `fullatt_block_indexes` attend over whole frames, all others over windows.
        cu_seqlens_now = cu_seqlens if layer_num in self.fullatt_block_indexes else cu_window_seqlens

        if self.gradient_checkpointing and self.training:
            # NOTE(review): the positional `None` fills the block argument that sits between `cu_seqlens`
            # and `position_embeddings` - confirm against the block's signature.
            hidden_states = self._gradient_checkpointing_func(
                blk.__call__, hidden_states, cu_seqlens_now, None, position_embeddings
            )
        else:
            hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens_now, position_embeddings=position_embeddings)

    merged = self.merger(hidden_states)

    # Undo the window permutation so the output rows follow the original order again.
    restore_order = torch.argsort(window_index)
    return merged[restore_order, :]
|
|
|
|
|
@dataclass
class Qwen2_5_VLModelOutputWithPast(ModelOutput):
    """
    Output container for the Qwen2.5-VL model: last hidden state plus optional key/value cache, per-layer hidden
    states, attentions and rope deltas. Every field defaults to `None`.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.
    """

    last_hidden_state: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    rope_deltas: Optional[torch.LongTensor] = None
|
|
|
|
|
class Qwen2_5_VLRotaryEmbedding(nn.Module):
    """
    Produces the cosine / sine rotary tables for three position axes at once.

    The inverse frequencies and the attention scaling factor come from the `ROPE_INIT_FUNCTIONS` entry selected
    by the rope type found in `config.rope_scaling` (`"default"` when no scaling is configured).
    """

    def __init__(self, config: Qwen2_5_VLTextConfig, device=None):
        super().__init__()

        rope_scaling = getattr(config, "rope_scaling", None)
        if rope_scaling is None:
            self.rope_type = "default"
        else:
            # Prefer the "rope_type" key and fall back to the "type" key.
            self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))

        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        # Non-persistent buffer: it follows the module across devices but is not written to the state dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        """
        Compute the rotary tables for `position_ids`.

        Args:
            x: tensor that only supplies the device type and the output dtype.
            position_ids: rank-3 position tensor; the frequency tensor is expanded to a leading size of 3 and to
                `position_ids.shape[1]` on the second axis, so `(3, batch, seq_len)` is presumably expected -
                TODO confirm with callers.

        Returns:
            `tuple(torch.Tensor, torch.Tensor)`: cosine and sine tables, both multiplied by
            `self.attention_scaling` and cast to `x.dtype`.
        """
        freq_column = self.inv_freq[None, None, :, None].float()
        inv_freq_expanded = freq_column.expand(3, position_ids.shape[1], -1, 1)
        position_row = position_ids[:, :, None, :].float()

        device_type = x.device.type
        if not isinstance(device_type, str) or device_type == "mps":
            device_type = "cpu"

        # Autocast is switched off so the angles are computed in float32.
        with torch.autocast(device_type=device_type, enabled=False):
            angles = (inv_freq_expanded.float() @ position_row.float()).transpose(2, 3)
            table = torch.cat((angles, angles), dim=-1)
            cos = table.cos() * self.attention_scaling
            sin = table.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
|
|
|
|
class Qwen2MLP(nn.Module):
    """
    Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections without bias.

    NOTE(review): `forward` hands `task_label` to every projection, which a plain `nn.Linear` does not accept.
    Presumably the linear layers are swapped for task-aware adapter layers before the model is used - confirm.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        hidden, inner = self.hidden_size, self.intermediate_size
        self.gate_proj = nn.Linear(hidden, inner, bias=False)
        self.up_proj = nn.Linear(hidden, inner, bias=False)
        self.down_proj = nn.Linear(inner, hidden, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x, task_label: Union[str, List[str]]):
        """
        Args:
            x: input tensor forwarded to the gate and up projections.
            task_label (`Union[str, List[str]]`): forwarded unchanged to each of the three projections.

        Returns:
            The output of `down_proj`.
        """
        gated = self.act_fn(self.gate_proj(x, task_label))
        lifted = self.up_proj(x, task_label)
        return self.down_proj(gated * lifted, task_label)
|
|
|
|
|
def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
    """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).

    Explanation:
        Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The rotary tables
        carry three position axes (temporal, height and width) on their leading dimension. The channel dimension
        is cut into chunks according to `mrope_section` (repeated twice), and chunk `i` is taken from axis
        `i % 3`, so the three axes alternate over the channels. For text tokens the three position indices are
        identical, which makes the result equal to ordinary 1D rotary embedding.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding, indexed by position axis on dim 0.
        sin (`torch.Tensor`): The sine part of the rotary embedding, indexed by position axis on dim 0.
        mrope_section(`List(int)`):
            Multimodal rope section is for channel dimension of temporal, height and width in rope calculation.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which the re-assembled `cos` / `sin` tables are unsqueezed so that they broadcast
            against `q` and `k`. Use 1 when `q` and `k` are `[batch_size, heads, seq_len, head_dim]` and 2 when
            they are `[batch_size, seq_len, heads, head_dim]`.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    # The tables hold the frequencies twice along the channel axis, hence the doubled section list.
    sections = mrope_section * 2

    def pick_axes(table):
        chunks = table.split(sections, dim=-1)
        selected = [chunk[position % 3] for position, chunk in enumerate(chunks)]
        return torch.cat(selected, dim=-1).unsqueeze(unsqueeze_dim)

    cos = pick_axes(cos)
    sin = pick_axes(sin)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
|
|
|
|
|
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times, like `torch.repeat_interleave(x, dim=1, repeats=n_rep)`.

    Args:
        hidden_states (`torch.Tensor`): tensor of shape `(batch, num_key_value_heads, seqlen, head_dim)`.
        n_rep (`int`): number of copies of each head.

    Returns:
        `torch.Tensor` of shape `(batch, num_key_value_heads * n_rep, seqlen, head_dim)`. When `n_rep == 1` the
        input tensor itself is returned.
    """
    # Unpacking first means a tensor that is not 4-D is rejected even when no repetition is requested.
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states

    # Insert a repeat axis right after the head axis, broadcast it, then fold it into the head axis.
    repeated = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
    return repeated.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
|
|
|
|
|
class Qwen2_5_VLAttention(nn.Module):
    """
    Eager (matmul + softmax) multi-head attention with grouped key/value heads and multimodal rotary position
    embeddings. Every projection is called with `task_label` in addition to its input.

    NOTE(review): a plain `nn.Linear` does not accept the extra `task_label` argument; presumably the projections
    are replaced by task-aware adapter layers before the model is run - confirm.
    """

    def __init__(self, config: Qwen2_5_VLTextConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.is_causal = True
        self.attention_dropout = config.attention_dropout
        self.rope_scaling = config.rope_scaling

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.rotary_emb = Qwen2_5_VLRotaryEmbedding(config=config)

    def forward(
        self,
        task_label: Union[str, List[str]],
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Args:
            task_label (`Union[str, List[str]]`): forwarded to the q/k/v/o projections.
            hidden_states (`torch.Tensor`): input of shape `(batch, seq_len, hidden_size)`.
            attention_mask (`torch.Tensor`, *optional*): 4-D additive mask; it is cropped on its last dimension
                to the key length and added to the attention scores.
            position_ids, use_cache: accepted for interface compatibility, not read by this implementation.
            past_key_value (`Cache`, *optional*): when given, it is updated with the new keys / values for
                `self.layer_idx` and the returned (full) keys / values are used.
            output_attentions (`bool`): whether to return the attention probabilities.
            cache_position (`torch.LongTensor`, *optional*): forwarded to the cache update.
            position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`): `(cos, sin)` rotary tables. Although the
                default is `None`, the value is unpacked unconditionally, so it must be provided.

        Returns:
            `(attn_output, attn_weights, past_key_value)`; `attn_weights` is `None` unless `output_attentions`
            is set.

        Raises:
            ValueError: if the attention output does not have shape `(batch, num_heads, seq_len, head_dim)`.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states, task_label)
        key_states = self.k_proj(hidden_states, task_label)
        value_states = self.v_proj(hidden_states, task_label)

        # (batch, seq_len, heads * head_dim) -> (batch, heads, seq_len, head_dim)
        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_multimodal_rotary_pos_emb(
            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
        )

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Expand the key/value heads so that every query head has a matching key/value head.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # In float16 the scores can overflow to +/-inf; zero those entries before the softmax.
        if query_states.dtype == torch.float16:
            attn_weights = torch.where(torch.isinf(attn_weights), torch.zeros_like(attn_weights), attn_weights)

        # The softmax runs in float32 and the result is cast back to the query dtype.
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output, task_label)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
|
|
|
|
|
class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
    """
    Qwen2_5_VL flash attention module, following Qwen2_5_VL attention module. This module inherits from `Qwen2_5_VLAttention`
    as the weights of the module stays untouched. The only required change would be on the forward pass
    where it needs to correctly call the public API of flash attention and deal with padding tokens
    in case the input contains any of them. A sliding window is passed to the kernel only when
    `config.use_sliding_window` is set, `config.sliding_window` is not `None` and
    `layer_idx >= config.max_window_layers`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Cached once: whether the installed flash-attention build uses a top-left aligned causal mask.
        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()

    def forward(
        self,
        task_label: Union[str, List[str]],
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ):
        """
        Same contract as `Qwen2_5_VLAttention.forward`, computed through `_flash_attention_forward`.

        Attention probabilities are never materialised on this path, so the second element of the returned
        tuple is always `None`, also when `output_attentions=True`.

        Returns:
            `(attn_output, None, past_key_value)`.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states, task_label)
        key_states = self.k_proj(hidden_states, task_label)
        value_states = self.v_proj(hidden_states, task_label)

        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_multimodal_rotary_pos_emb(
            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
        )

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # Float32 inputs are cast down before the kernel call. The target dtype is, in order of preference:
        # the autocast dtype, the pre-quantization dtype recorded on the config, the dtype of the q_proj weights.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Back to (batch, seq_len, heads, head_dim) for the flash-attention call.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        if (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            sliding_window = self.config.sliding_window
        else:
            sliding_window = None

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=dropout_rate,
            sliding_window=sliding_window,
            is_causal=self.is_causal,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output, task_label)

        # No attention matrix exists on this path. The previous code only assigned `attn_weights` when
        # `output_attentions` was False and therefore raised UnboundLocalError when it was True.
        attn_weights = None

        return attn_output, attn_weights, past_key_value
|
|
|
|
|
class Qwen2_5_VLSdpaAttention(Qwen2_5_VLAttention):
    """
    Attention variant that delegates the core computation to
    `torch.nn.functional.scaled_dot_product_attention`. Weights and constructor are inherited unchanged from
    `Qwen2_5_VLAttention`; only the forward pass differs.
    """

    def forward(
        self,
        task_label: Union[str, List[str]],
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Same contract as `Qwen2_5_VLAttention.forward`.

        When `output_attentions` is set the call is handed to the parent (eager) implementation after a one-time
        warning. Otherwise the returned attention weights are always `None`.
        """
        if output_attentions:
            logger.warning_once(
                "Qwen2_5_VLModel is using Qwen2_5_VLSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                task_label=task_label,
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        bsz, q_len, _ = hidden_states.size()

        def split_heads(projected):
            # (batch, seq_len, heads * head_dim) -> (batch, heads, seq_len, head_dim)
            return projected.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        queries = split_heads(self.q_proj(hidden_states, task_label))
        keys = split_heads(self.k_proj(hidden_states, task_label))
        values = split_heads(self.v_proj(hidden_states, task_label))

        cos, sin = position_embeddings
        queries, keys = apply_multimodal_rotary_pos_emb(queries, keys, cos, sin, self.rope_scaling["mrope_section"])

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            keys, values = past_key_value.update(keys, values, self.layer_idx, cache_kwargs)

        keys = repeat_kv(keys, self.num_key_value_groups)
        values = repeat_kv(values, self.num_key_value_groups)

        # Crop the mask on its last dimension to the number of keys actually attended to.
        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : keys.shape[-2]]

        # On CUDA, inputs are made contiguous whenever an explicit mask is supplied.
        if queries.device.type == "cuda" and attention_mask is not None:
            queries = queries.contiguous()
            keys = keys.contiguous()
            values = values.contiguous()

        # Kept as a separate assignment with this exact conditional form rather than being inlined into the call.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            queries,
            keys,
            values,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output, task_label)

        return attn_output, None, past_key_value
|
|
|
|
|
# Lookup table from attention implementation name to the attention class that provides it.
# The decoder layer indexes this mapping with `config._attn_implementation`.
QWEN2_5_VL_ATTENTION_CLASSES = {
    "eager": Qwen2_5_VLAttention,
    "flash_attention_2": Qwen2_5_VLFlashAttention2,
    "sdpa": Qwen2_5_VLSdpaAttention,
}
|
|
|
|
|
class Qwen2_5_VLDecoderLayer(nn.Module):
    """
    One pre-norm decoder layer: RMSNorm -> self-attention -> residual add, then RMSNorm -> MLP -> residual add.
    The attention class is chosen from `QWEN2_5_VL_ATTENTION_CLASSES` by `config._attn_implementation`.
    """

    def __init__(self, config: Qwen2_5_VLTextConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        if config.use_sliding_window and config._attn_implementation != "flash_attention_2":
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                "unexpected results may be encountered."
            )
        attention_cls = QWEN2_5_VL_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(config, layer_idx)

        self.mlp = Qwen2MLP(config)
        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        task_label: Union[str, List[str]],
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            task_label (`Union[str, List[str]]`):
                Forwarded to the self-attention module and to the MLP.
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                Mask handed unchanged to the self-attention module.
            position_ids (`torch.LongTensor`, *optional*):
                Handed unchanged to the self-attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings, handed to the self-attention module.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model

        Returns:
            A tuple starting with the layer output, followed by the attention weights when `output_attentions`
            is truthy and then by the key/value cache when `use_cache` is truthy.
        """
        # --- self-attention sub-layer ---
        attn_input = self.input_layernorm(hidden_states)
        attn_output, self_attn_weights, present_key_value = self.self_attn(
            task_label=task_label,
            hidden_states=attn_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
        )
        hidden_states = hidden_states + attn_output

        # --- feed-forward sub-layer ---
        mlp_input = self.post_attention_layernorm(hidden_states)
        mlp_output = self.mlp(mlp_input, task_label)
        hidden_states = hidden_states + mlp_output

        outputs = (hidden_states,)
        if output_attentions:
            outputs = outputs + (self_attn_weights,)
        if use_cache:
            outputs = outputs + (present_key_value,)
        return outputs
|
|
|
|
|
@auto_docstring |
|
class Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel): |
|
config_class = Qwen2_5_VLTextConfig |
|
|
|
def __init__(self, config: Qwen2_5_VLTextConfig):
    """
    Build the text decoder stack: token embedding, `config.num_hidden_layers` decoder layers, final RMSNorm and
    the rotary embedding module, then run `post_init()`.
    """
    super().__init__(config)
    self.padding_idx = config.pad_token_id
    self.vocab_size = config.vocab_size

    # Sub-modules are created in the same order as before: embedding, layers, norm, rotary embedding.
    self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
    decoder_layers = [Qwen2_5_VLDecoderLayer(config, index) for index in range(config.num_hidden_layers)]
    self.layers = nn.ModuleList(decoder_layers)
    self._attn_implementation = config._attn_implementation
    self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    self.rotary_emb = Qwen2_5_VLRotaryEmbedding(config=config)

    self.gradient_checkpointing = False

    self.post_init()
|
|
|
def get_input_embeddings(self):
    """Return the token embedding module stored in `self.embed_tokens`."""
    return self.embed_tokens
|
|
|
def set_input_embeddings(self, value):
    """Replace the token embedding module (`self.embed_tokens`) with `value`."""
    self.embed_tokens = value
|
|
|
@auto_docstring
def forward(
    self,
    task_label: Union[str, List[str]],
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
    """
    Args:
        task_label (`Union[str, List[str]]`):
            Task adapter to use for computing embeddings. If string, all batch examples use the same adapter.
            If list of strings, each example uses its corresponding adapter. Must be one of the supported
            task names (e.g., 'retrieval', 'text-matching', 'code').
    """
    # Flags left as None fall back to the values stored on the config.
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if use_cache is None:
        use_cache = self.config.use_cache
    if return_dict is None:
        return_dict = self.config.use_return_dict

    # Exactly one of the two input forms must be supplied.
    if (input_ids is None) == (inputs_embeds is None):
        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

    if self.gradient_checkpointing and self.training and use_cache:
        logger.warning_once(
            "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
        )
        use_cache = False

    # A fresh cache is created on demand, except while tracing.
    if use_cache and past_key_values is None and not torch.jit.is_tracing():
        past_key_values = DynamicCache()

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)

    if cache_position is None:
        past_seen_tokens = 0 if past_key_values is None else past_key_values.get_seq_length()
        cache_position = torch.arange(
            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
        )

    # Position ids carry three axes on dim 0; missing or 2-D ids are broadcast to that layout.
    if position_ids is None:
        position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
    elif position_ids.dim() == 2:
        position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)

    causal_mask = self._update_causal_mask(
        attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
    )

    hidden_states = inputs_embeds

    # The rotary tables are computed once and shared by every decoder layer.
    position_embeddings = self.rotary_emb(hidden_states, position_ids)

    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = None
    # Index of the cache inside a layer's output tuple: it follows the attention weights when those are returned.
    cache_slot = 2 if output_attentions else 1

    for decoder_layer in self.layers:
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if self.gradient_checkpointing and self.training:
            # Positional arguments, in the order of the decoder layer's `forward` signature.
            layer_outputs = self._gradient_checkpointing_func(
                decoder_layer.__call__,
                task_label,
                hidden_states,
                causal_mask,
                position_ids,
                past_key_values,
                output_attentions,
                use_cache,
                cache_position,
                position_embeddings,
            )
        else:
            layer_outputs = decoder_layer(
                task_label=task_label,
                hidden_states=hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        hidden_states = layer_outputs[0]

        if use_cache:
            next_decoder_cache = layer_outputs[cache_slot]

        if output_attentions:
            all_self_attns += (layer_outputs[1],)

    hidden_states = self.norm(hidden_states)

    # The normalised final state is appended as the last hidden-state entry.
    if output_hidden_states:
        all_hidden_states += (hidden_states,)

    next_cache = next_decoder_cache if use_cache else None

    if return_dict:
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
    return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
|
|
|
def _update_causal_mask( |
|
self, |
|
attention_mask: Union[torch.Tensor, "BlockMask"], |
|
input_tensor: torch.Tensor, |
|
cache_position: torch.Tensor, |
|
past_key_values: Cache, |
|
output_attentions: bool = False, |
|
): |
|
if self.config._attn_implementation == "flash_attention_2": |
|
if attention_mask is not None and past_key_values is not None: |
|
is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0] |
|
if is_padding_right: |
|
raise ValueError( |
|
"You are attempting to perform batched generation with padding_side='right'" |
|
" this may lead to unexpected behaviour for Flash Attention version of Qwen2_5_VL. Make sure to " |
|
" call `tokenizer.padding_side = 'left'` before tokenizing the input. " |
|
) |
|
if attention_mask is not None and 0.0 in attention_mask: |
|
return attention_mask |
|
return None |
|
if self.config._attn_implementation == "flex_attention": |
|
if isinstance(attention_mask, torch.Tensor): |
|
attention_mask = make_flex_block_causal_mask(attention_mask) |
|
return attention_mask |
|
|
|
|
|
|
|
|
|
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 |
|
using_static_cache = isinstance(past_key_values, StaticCache) |
|
using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) |
|
|
|
|
|
if ( |
|
self.config._attn_implementation == "sdpa" |
|
and not (using_static_cache or using_sliding_window_cache) |
|
and not output_attentions |
|
): |
|
if AttentionMaskConverter._ignore_causal_mask_sdpa( |
|
attention_mask, |
|
inputs_embeds=input_tensor, |
|
past_key_values_length=past_seen_tokens, |
|
sliding_window=self.config.sliding_window, |
|
is_training=self.training, |
|
): |
|
return None |
|
|
|
dtype = input_tensor.dtype |
|
min_dtype = torch.finfo(dtype).min |
|
sequence_length = input_tensor.shape[1] |
|
|
|
if using_sliding_window_cache or using_static_cache: |
|
target_length = past_key_values.get_max_cache_shape() |
|
|
|
else: |
|
target_length = ( |
|
attention_mask.shape[-1] |
|
if isinstance(attention_mask, torch.Tensor) |
|
else past_seen_tokens + sequence_length + 1 |
|
) |
|
|
|
|
|
causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( |
|
attention_mask, |
|
sequence_length=sequence_length, |
|
target_length=target_length, |
|
dtype=dtype, |
|
cache_position=cache_position, |
|
batch_size=input_tensor.shape[0], |
|
config=self.config, |
|
past_key_values=past_key_values, |
|
) |
|
|
|
if ( |
|
self.config._attn_implementation == "sdpa" |
|
and attention_mask is not None |
|
and attention_mask.device.type in ["cuda", "xpu", "npu"] |
|
and not output_attentions |
|
): |
|
|
|
|
|
|
|
causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) |
|
|
|
return causal_mask |
|
|
|
@staticmethod |
|
def _prepare_4d_causal_attention_mask_with_cache_position( |
|
attention_mask: torch.Tensor, |
|
sequence_length: int, |
|
target_length: int, |
|
dtype: torch.dtype, |
|
cache_position: torch.Tensor, |
|
batch_size: int, |
|
config: Qwen2_5_VLConfig, |
|
past_key_values: Cache, |
|
): |
|
""" |
|
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape |
|
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. |
|
|
|
Args: |
|
attention_mask (`torch.Tensor`): |
|
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. |
|
sequence_length (`int`): |
|
The sequence length being processed. |
|
target_length (`int`): |
|
The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. |
|
dtype (`torch.dtype`): |
|
The dtype to use for the 4D attention mask. |
|
cache_position (`torch.Tensor`): |
|
Indices depicting the position of the input sequence tokens in the sequence. |
|
batch_size (`torch.Tensor`): |
|
Batch size. |
|
config (`Qwen2_5_VLConfig`): |
|
The model's configuration class |
|
past_key_values (`Cache`): |
|
The cache class that is being used currently to generate |
|
""" |
|
if attention_mask is not None and attention_mask.dim() == 4: |
|
|
|
causal_mask = attention_mask |
|
else: |
|
min_dtype = torch.finfo(dtype).min |
|
causal_mask = torch.full( |
|
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device |
|
) |
|
diagonal_attend_mask = torch.arange(target_length, device=cache_position.device) > cache_position.reshape( |
|
-1, 1 |
|
) |
|
text_config = config.get_text_config() |
|
if getattr(text_config, "use_sliding_window", True) and text_config.sliding_window is not None: |
|
|
|
|
|
if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: |
|
sliding_attend_mask = torch.arange(target_length, device=cache_position.device) <= ( |
|
cache_position.reshape(-1, 1) - text_config.sliding_window |
|
) |
|
diagonal_attend_mask.bitwise_or_(sliding_attend_mask) |
|
causal_mask *= diagonal_attend_mask |
|
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) |
|
if attention_mask is not None: |
|
causal_mask = causal_mask.clone() |
|
if attention_mask.shape[-1] > target_length: |
|
attention_mask = attention_mask[:, :target_length] |
|
mask_length = attention_mask.shape[-1] |
|
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( |
|
causal_mask.device |
|
) |
|
padding_mask = padding_mask == 0 |
|
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( |
|
padding_mask, min_dtype |
|
) |
|
return causal_mask |
|
|
|
|
|
@auto_docstring
class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
    # Couples the vision encoder (`self.visual`) with the text decoder (`self.language_model`):
    # image/video features are scattered into the token embeddings before decoding.
    base_model_prefix = ""
    _checkpoint_conversion_mapping = {"^model": "language_model"}
    config_class = Qwen2_5_VLConfig
    _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]

    def __init__(self, config):
        super().__init__(config)
        self.visual = Qwen2_5_VisionTransformerPretrainedModel._from_config(config.vision_config)
        self.language_model = Qwen2_5_VLTextModel._from_config(config.text_config)
        # Cached by `forward` each time the rope index is recomputed and reused on later decoding steps.
        self.rope_deltas = None

        # Run the base-class post-construction hook.
        self.post_init()

    def get_input_embeddings(self):
        """Return the input embedding module of the language model."""
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        """Replace the input embedding module of the language model with `value`."""
        self.language_model.set_input_embeddings(value)

    def get_rope_index(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        second_per_grid_ts: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.

        Explanation:
            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.

            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
            Examples:
                input_ids: [T T T T T], here T is for text.
                temporal position_ids: [0, 1, 2, 3, 4]
                height position_ids: [0, 1, 2, 3, 4]
                width position_ids: [0, 1, 2, 3, 4]

            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
            and 1D rotary position embedding for text part.
            Examples:
                Temporal (Time): 3 patches, representing different segments of the video in time.
                Height: 2 patches, dividing each frame vertically.
                Width: 2 patches, dividing each frame horizontally.
                We also have some important parameters:
                fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second.
                tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity.
                temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
                interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will have a difference of 50 in the temporal position IDs.
                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
                vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
                text temporal position_ids: [101, 102, 103, 104, 105]
                text height position_ids: [101, 102, 103, 104, 105]
                text width position_ids: [101, 102, 103, 104, 105]
                Here we calculate the text start position_ids as the max vision position_ids plus 1.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
                it.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
                The temporal, height and width of feature shape of each video in LLM.
            second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
                The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
                Fractional values are honoured; images use 0 and videos default to 1.0 when this is not given.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

        Returns:
            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
            mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
        """
        spatial_merge_size = self.config.vision_config.spatial_merge_size
        image_token_id = self.config.image_token_id
        video_token_id = self.config.video_token_id
        vision_start_token_id = self.config.vision_start_token_id
        tokens_per_second = self.config.vision_config.tokens_per_second
        mrope_position_deltas = []
        if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
            total_input_ids = input_ids
            if attention_mask is None:
                attention_mask = torch.ones_like(total_input_ids)
            position_ids = torch.ones(
                3,
                input_ids.shape[0],
                input_ids.shape[1],
                dtype=input_ids.dtype,
                device=input_ids.device,
            )
            # Images and videos are consumed in order across the whole batch.
            image_index, video_index = 0, 0
            attention_mask = attention_mask.to(total_input_ids.device)
            # `sample_ids` (not `input_ids`) so the parameter is not shadowed inside the loop.
            for i, sample_ids in enumerate(total_input_ids):
                sample_ids = sample_ids[attention_mask[i] == 1]
                image_nums, video_nums = 0, 0
                # The token right after each vision-start marker tells whether it opens an image or a video.
                vision_start_indices = torch.argwhere(sample_ids == vision_start_token_id).squeeze(1)
                vision_tokens = sample_ids[vision_start_indices + 1]
                image_nums = (vision_tokens == image_token_id).sum()
                video_nums = (vision_tokens == video_token_id).sum()
                input_tokens = sample_ids.tolist()
                llm_pos_ids_list: list = []
                st = 0
                remain_images, remain_videos = image_nums, video_nums
                for _ in range(image_nums + video_nums):
                    # Locate the next image / video placeholder; "len + 1" means "none left".
                    if image_token_id in input_tokens and remain_images > 0:
                        ed_image = input_tokens.index(image_token_id, st)
                    else:
                        ed_image = len(input_tokens) + 1
                    if video_token_id in input_tokens and remain_videos > 0:
                        ed_video = input_tokens.index(video_token_id, st)
                    else:
                        ed_video = len(input_tokens) + 1
                    if ed_image < ed_video:
                        t, h, w = (
                            image_grid_thw[image_index][0],
                            image_grid_thw[image_index][1],
                            image_grid_thw[image_index][2],
                        )
                        # Images have no temporal extent.
                        second_per_grid_t = 0
                        image_index += 1
                        remain_images -= 1
                        ed = ed_image

                    else:
                        t, h, w = (
                            video_grid_thw[video_index][0],
                            video_grid_thw[video_index][1],
                            video_grid_thw[video_index][2],
                        )
                        if second_per_grid_ts is not None:
                            second_per_grid_t = second_per_grid_ts[video_index]
                        else:
                            second_per_grid_t = 1.0
                        video_index += 1
                        remain_videos -= 1
                        ed = ed_video
                    llm_grid_t, llm_grid_h, llm_grid_w = (
                        t.item(),
                        h.item() // spatial_merge_size,
                        w.item() // spatial_merge_size,
                    )
                    text_len = ed - st

                    # Text preceding this vision segment: plain 1D positions replicated on all three axes.
                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

                    range_tensor = torch.arange(llm_grid_t).view(-1, 1)
                    expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)

                    # Keep the seconds-per-grid value floating point: casting it to the integer dtype of
                    # `range_tensor` would truncate fractional intervals (e.g. 0.5 -> 0) and collapse every
                    # temporal position onto 0. Truncation happens once, after the multiplication.
                    second_per_grid_t = torch.as_tensor(
                        second_per_grid_t, dtype=torch.float32, device=range_tensor.device
                    )

                    time_tensor = expanded_range * second_per_grid_t * tokens_per_second

                    time_tensor_long = time_tensor.long()
                    t_index = time_tensor_long.flatten()

                    h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
                    w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
                    llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
                    st = ed + llm_grid_t * llm_grid_h * llm_grid_w

                # Trailing text after the last vision segment.
                if st < len(input_tokens):
                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                    text_len = len(input_tokens) - st
                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
                position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
                mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
            mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
            return position_ids, mrope_position_deltas
        else:
            if attention_mask is not None:
                # Text only: positions are the running count of unmasked tokens; masked slots are set to 1.
                position_ids = attention_mask.long().cumsum(-1) - 1
                position_ids.masked_fill_(attention_mask == 0, 1)
                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
                max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
            else:
                position_ids = (
                    torch.arange(input_ids.shape[1], device=input_ids.device)
                    .view(1, 1, -1)
                    .expand(3, input_ids.shape[0], -1)
                )
                mrope_position_deltas = torch.zeros(
                    [input_ids.shape[0], 1],
                    device=input_ids.device,
                    dtype=input_ids.dtype,
                )

            return position_ids, mrope_position_deltas

    def get_video_features(
        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
    ):
        """
        Encodes videos into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values_videos (`torch.FloatTensor`):
                The tensors corresponding to the input videos; cast to `self.visual.dtype` before encoding.
            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
                The temporal, height and width of feature shape of each video in LLM.
        """
        pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
        video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
        return video_embeds

    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor`):
                The tensors corresponding to the input images; cast to `self.visual.dtype` before encoding.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
        """
        pixel_values = pixel_values.type(self.visual.dtype)
        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
        return image_embeds

    @auto_docstring
    def forward(
        self,
        task_label: Union[str, List[str]],
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        second_per_grid_ts: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, Qwen2_5_VLModelOutputWithPast]:
        r"""
        task_label (`Union[str, List[str]]`):
            Task adapter to use for computing embeddings. If string, all batch examples use the same adapter.
            If list of strings, each example uses its corresponding adapter. Must be one of the supported
            task names (e.g., 'retrieval', 'text-matching', 'code').
        pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
            The tensors corresponding to the input videos. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`Qwen2_5_VLImageProcessor.__call__`] for details. [`Qwen2_5_VLProcessor`] uses
            [`Qwen2_5_VLImageProcessor`] for processing videos.
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.
        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.
        second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
            The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)
            if pixel_values is not None:
                image_embeds = self.get_image_features(pixel_values, image_grid_thw)
                # Every image placeholder token must receive exactly one feature row.
                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )

                mask = input_ids == self.config.image_token_id
                mask_unsqueezed = mask.unsqueeze(-1)
                mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
                image_mask = mask_expanded.to(inputs_embeds.device)

                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
                # Same one-to-one requirement for video placeholder tokens.
                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(
                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                    )

                mask = input_ids == self.config.video_token_id
                mask_unsqueezed = mask.unsqueeze(-1)
                mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
                video_mask = mask_expanded.to(inputs_embeds.device)

                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # Position ids are only derived here when the caller gave none and the mask is absent or 2D.
        if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
            # Recompute the rope index at the start of a sequence (or when nothing is cached yet).
            if (
                (cache_position is not None and cache_position[0] == 0)
                or self.rope_deltas is None
                or (past_key_values is None or past_key_values.get_seq_length() == 0)
            ):
                position_ids, rope_deltas = self.get_rope_index(
                    input_ids,
                    image_grid_thw,
                    video_grid_thw,
                    second_per_grid_ts,
                    attention_mask,
                )
                self.rope_deltas = rope_deltas
            # Otherwise shift plain sequential positions by the cached delta.
            else:
                batch_size, seq_length, _ = inputs_embeds.shape
                delta = (
                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
                    if cache_position is not None
                    else 0
                )
                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                if cache_position is not None:
                    # The cached delta may have fewer rows than the current batch; repeat it to match.
                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
                position_ids = position_ids.add(delta)
                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)

        outputs = self.language_model(
            task_label=task_label,
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
        )

        output = Qwen2_5_VLModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            rope_deltas=self.rope_deltas,
        )
        return output if return_dict else output.to_tuple()
|
|
|
|
|
@dataclass
class Qwen2_5_VLCausalLMOutputWithPast(ModelOutput):
    """
    Output container for the Qwen2_5_VL causal (autoregressive) language-modeling head.

    Every field is optional and defaults to `None`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Next-token prediction loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Scores of the language modeling head for each vocabulary token, before SoftMax.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of length `config.n_layers`; each entry is a tuple of 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

            Pre-computed self-attention keys and values that can be passed back as the `past_key_values` input
            to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`: one for the
            embedding output (if the model has an embedding layer) plus one for the output of each layer.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one per layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.
    """

    # Field order is part of the interface (tuple conversion follows declaration order).
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    rope_deltas: Optional[torch.LongTensor] = None
|
|
|
|
|
class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMixin): |
|
_checkpoint_conversion_mapping = { |
|
"^visual": "model.visual", |
|
r"^model(?!\.(language_model|visual))": "model.language_model", |
|
} |
|
_tied_weights_keys = ["lm_head.weight"] |
|
|
|
def __init__(self, config):
    # Wrap the base multimodal model and add a bias-free projection from the text hidden size
    # to the text vocabulary size.
    super().__init__(config)
    self.model = Qwen2_5_VLModel(config)

    text_config = config.text_config
    self.lm_head = nn.Linear(text_config.hidden_size, text_config.vocab_size, bias=False)

    self.post_init()
|
|
|
def get_input_embeddings(self):
    """Return the input embedding module reported by the wrapped `self.model`."""
    wrapped = self.model
    return wrapped.get_input_embeddings()
|
|
|
def set_input_embeddings(self, value):
    """Forward `value` to the wrapped `self.model` as its new input embedding module."""
    wrapped = self.model
    wrapped.set_input_embeddings(value)
|
|
|
def get_output_embeddings(self):
    """Return the output projection stored in `self.lm_head`."""
    head = self.lm_head
    return head
|
|
|
def set_output_embeddings(self, new_embeddings):
    """Store `new_embeddings` as the output projection (`self.lm_head`)."""
    self.lm_head = new_embeddings
|
|
|
def set_decoder(self, decoder):
    """Replace the wrapped model (`self.model`) with `decoder`."""
    self.model = decoder
|
|
|
def get_decoder(self):
    """Return the wrapped model stored in `self.model`."""
    decoder = self.model
    return decoder
|
|
|
|
|
@property
def language_model(self):
    """Shortcut to `self.model.language_model`."""
    wrapped = self.model
    return wrapped.language_model
|
|
|
@property
def visual(self):
    """Shortcut to `self.model.visual`."""
    wrapped = self.model
    return wrapped.visual
|
|
|
@can_return_tuple
@auto_docstring
def forward(
    self,
    task_label: Union[str, List[str]],
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    pixel_values: Optional[torch.Tensor] = None,
    pixel_values_videos: Optional[torch.FloatTensor] = None,
    image_grid_thw: Optional[torch.LongTensor] = None,
    video_grid_thw: Optional[torch.LongTensor] = None,
    rope_deltas: Optional[torch.LongTensor] = None,
    cache_position: Optional[torch.LongTensor] = None,
    second_per_grid_ts: Optional[torch.Tensor] = None,
) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
    r"""
    task_label (`Union[str, List[str]]`):
        Task adapter to use for computing embeddings. If string, all batch examples use the same adapter.
        If list of strings, each example uses its corresponding adapter. Must be one of the supported
        task names (e.g., 'retrieval', 'text-matching', 'code').
    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
        config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
        (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
    pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
        The tensors corresponding to the input videos. Pixel values can be obtained using
        [`AutoImageProcessor`]. See [`Qwen2_5_VLImageProcessor.__call__`] for details. [`Qwen2_5_VLProcessor`] uses
        [`Qwen2_5_VLImageProcessor`] for processing videos.
    image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
        The temporal, height and width of feature shape of each image in LLM.
    video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
        The temporal, height and width of feature shape of each video in LLM.
    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
        The rope index difference between sequence length and multimodal rope.
    second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
        The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.

    Example:

    ```python
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

    >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
    >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

    >>> messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What is shown in this image?"},
            ],
        },
    ]
    >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    >>> inputs = processor(text=[text], images=[image], return_tensors="pt")

    >>> # Generate (`task_label` is a required argument of `forward`)
    >>> generate_ids = model.generate(**inputs, task_label="retrieval", max_length=30)
    >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
    ```"""

    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # NOTE: the `rope_deltas` argument is accepted for interface compatibility but is not forwarded.
    outputs = self.model(
        task_label=task_label,
        input_ids=input_ids,
        pixel_values=pixel_values,
        pixel_values_videos=pixel_values_videos,
        image_grid_thw=image_grid_thw,
        video_grid_thw=video_grid_thw,
        second_per_grid_ts=second_per_grid_ts,
        position_ids=position_ids,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        cache_position=cache_position,
    )

    # Index 0 is the last hidden state for both tuple and dataclass outputs.
    hidden_states = outputs[0]
    logits = self.lm_head(hidden_states)

    loss = None
    if labels is not None:
        # `lm_head` is sized from `config.text_config.vocab_size` in `__init__`; use the same source here
        # so the loss and the logits always agree on the vocabulary size.
        loss = self.loss_function(
            logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size
        )

    if not return_dict:
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output

    return Qwen2_5_VLCausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        rope_deltas=outputs.rope_deltas,
    )
|
|
|
def prepare_inputs_for_generation(
    self,
    input_ids,
    past_key_values=None,
    attention_mask=None,
    inputs_embeds=None,
    cache_position=None,
    position_ids=None,
    use_cache=True,
    pixel_values=None,
    pixel_values_videos=None,
    image_grid_thw=None,
    video_grid_thw=None,
    second_per_grid_ts=None,
    **kwargs,
):
    """
    Build the model inputs for one generation step.

    The heavy lifting is delegated to the parent class; this override then adjusts two things:

    - `position_ids` is always reset to `None` in the returned dict.
    - `pixel_values` and `pixel_values_videos` are dropped whenever the first entry of `cache_position`
      is non-zero, so the pixel inputs are only forwarded on a step that starts at cache position 0.

    Args:
        input_ids: Token ids for the current step.
        past_key_values: Cache from previous steps, if any.
        attention_mask: Attention mask forwarded to the parent implementation.
        inputs_embeds: Optional precomputed embeddings forwarded to the parent implementation.
        cache_position: Positions of the current tokens in the cache. When `None`, the pixel inputs are
            left untouched because it cannot be determined whether this is the first step.
        position_ids: Forwarded to the parent implementation, then overwritten with `None`.
        use_cache: Forwarded to the parent implementation.
        pixel_values / pixel_values_videos / image_grid_thw / video_grid_thw / second_per_grid_ts:
            Vision inputs forwarded to the parent implementation.
        **kwargs: Any additional inputs forwarded unchanged to the parent implementation.

    Returns:
        The dict of model inputs produced by the parent implementation, with the adjustments above.
    """
    model_inputs = super().prepare_inputs_for_generation(
        input_ids,
        past_key_values=past_key_values,
        attention_mask=attention_mask,
        inputs_embeds=inputs_embeds,
        cache_position=cache_position,
        position_ids=position_ids,
        pixel_values=pixel_values,
        pixel_values_videos=pixel_values_videos,
        image_grid_thw=image_grid_thw,
        video_grid_thw=video_grid_thw,
        second_per_grid_ts=second_per_grid_ts,
        use_cache=use_cache,
        **kwargs,
    )

    # NOTE(review): presumably the forward pass recomputes the multimodal position ids itself
    # (it returns `rope_deltas`), which is why externally supplied ones are discarded -- confirm.
    model_inputs["position_ids"] = None

    # `cache_position` defaults to None; indexing it unguarded would raise a TypeError.
    # Only strip the pixel inputs when we positively know this is not the first step.
    if cache_position is not None and cache_position[0] != 0:
        model_inputs["pixel_values"] = None
        model_inputs["pixel_values_videos"] = None

    return model_inputs
|
|
|
def _get_image_nums_and_video_nums(
    self,
    input_ids: Optional[torch.LongTensor],
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Count, per sample, how many images and how many videos the token sequence refers to.

    A visual item is counted when an image (resp. video) token sits directly after a vision-start token.
    The counts are derived from `input_ids` alone, using the token ids stored on `self.config`.

    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

    Returns:
        image_nums (`torch.LongTensor` of shape `(batch_size,)`): number of images in each sample.
        video_nums (`torch.LongTensor` of shape `(batch_size,)`): number of videos in each sample.
    """
    cfg = self.config

    # Shift the vision-start mask one step to the right so it marks the token *following* each
    # vision-start token (the roll is circular along the sequence dimension).
    follows_vision_start = (input_ids == cfg.vision_start_token_id).roll(shifts=1, dims=1)

    image_nums = (follows_vision_start & (input_ids == cfg.image_token_id)).sum(dim=1)
    video_nums = (follows_vision_start & (input_ids == cfg.video_token_id)).sum(dim=1)
    return image_nums, video_nums
|
|
|
def _expand_inputs_for_generation(
    self,
    expand_size: int = 1,
    is_encoder_decoder: bool = False,
    input_ids: Optional[torch.LongTensor] = None,
    **model_kwargs,
) -> Tuple[torch.LongTensor, Dict[str, Any]]:
    """
    Repeat every sample `expand_size` times (consecutively) for generation.

    Regular tensors in `model_kwargs` are expanded with `repeat_interleave` along dim 0. The visual
    entries (`pixel_values`, `image_grid_thw`, `pixel_values_videos`, `video_grid_thw`,
    `second_per_grid_ts`) are handled separately: they are split along dim 0 into one chunk per sample,
    using the per-sample image/video counts derived from `input_ids`, and each chunk is repeated
    `expand_size` times. Visual entries whose value is `None` are left untouched.

    Args:
        expand_size (`int`, defaults to 1): Number of copies of each sample. `1` is a no-op.
        is_encoder_decoder (`bool`, defaults to `False`): When `True`, `encoder_outputs` in
            `model_kwargs` is expanded as well.
        input_ids (`torch.LongTensor`, *optional*): Token ids of shape `(batch_size, sequence_length)`.
            Visual entries are only expanded when this is given and non-empty.
        **model_kwargs: Additional model inputs to expand.

    Returns:
        `(input_ids, model_kwargs)` with the batch dimension expanded.

    Raises:
        TypeError: If `second_per_grid_ts` is given but is not a list.
        ValueError: If `is_encoder_decoder` is `True` and `encoder_outputs` is missing.
    """
    if expand_size == 1:
        return input_ids, model_kwargs

    visual_keys = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw", "second_per_grid_ts"]

    def _repeat_interleave_samples(x, lengths, repeat_times):
        # Split `x` into per-sample chunks along dim 0 and tile each chunk `repeat_times` times.
        samples = torch.split(x, lengths)
        repeat_args = [repeat_times] + [1] * (x.dim() - 1)
        return torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)

    def _patch_rows_per_sample(grid_thw, nums):
        # Rows each sample owns in the flattened pixel tensor: sum of t*h*w over its grids.
        return [int(torch.prod(sample, dim=1).sum()) for sample in torch.split(grid_thw, nums)]

    def _expand_dict_for_generation_visual(dict_to_expand):
        # Capture the grids *before* the loop: the loop may replace them with expanded versions.
        image_grid_thw = model_kwargs.get("image_grid_thw", None)
        video_grid_thw = model_kwargs.get("video_grid_thw", None)
        image_nums, video_nums = self._get_image_nums_and_video_nums(input_ids)
        # Plain Python ints are the documented form for `torch.split` section sizes.
        image_nums = image_nums.tolist()
        video_nums = video_nums.tolist()

        for key in dict_to_expand:
            value = dict_to_expand[key]
            # A visual kwarg explicitly passed as None has nothing to expand.
            if key not in visual_keys or value is None:
                continue

            if key == "pixel_values":
                lengths = _patch_rows_per_sample(image_grid_thw, image_nums)
                dict_to_expand[key] = _repeat_interleave_samples(value, lengths=lengths, repeat_times=expand_size)
            elif key == "image_grid_thw":
                dict_to_expand[key] = _repeat_interleave_samples(
                    value, lengths=image_nums, repeat_times=expand_size
                )
            elif key == "pixel_values_videos":
                lengths = _patch_rows_per_sample(video_grid_thw, video_nums)
                dict_to_expand[key] = _repeat_interleave_samples(value, lengths=lengths, repeat_times=expand_size)
            elif key == "video_grid_thw":
                dict_to_expand[key] = _repeat_interleave_samples(
                    value, lengths=video_nums, repeat_times=expand_size
                )
            elif key == "second_per_grid_ts":
                if not isinstance(value, list):
                    raise TypeError(
                        f"Expected value for key '{key}' to be a list, but got {type(dict_to_expand[key])} instead."
                    )
                # Round-trip through a tensor so the same split/repeat helper can be reused.
                tensor = torch.tensor(value)
                tensor = _repeat_interleave_samples(tensor, lengths=video_nums, repeat_times=expand_size)
                dict_to_expand[key] = tensor.tolist()
        return dict_to_expand

    def _expand_dict_for_generation(dict_to_expand):
        for key in dict_to_expand:
            if (
                key != "cache_position"
                and dict_to_expand[key] is not None
                and isinstance(dict_to_expand[key], torch.Tensor)
                and key not in visual_keys
            ):
                dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
        return dict_to_expand

    # The per-sample visual counts come from `input_ids`, so visual entries can only be expanded
    # when token ids are available; this must happen before `input_ids` itself is expanded.
    if input_ids is not None and input_ids.numel() != 0:
        model_kwargs = _expand_dict_for_generation_visual(model_kwargs)

    if input_ids is not None:
        input_ids = input_ids.repeat_interleave(expand_size, dim=0)

    model_kwargs = _expand_dict_for_generation(model_kwargs)

    if is_encoder_decoder:
        if model_kwargs.get("encoder_outputs") is None:
            raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
        model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])

    return input_ids, model_kwargs
|
|
|
@staticmethod
def _prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask: torch.Tensor,
    sequence_length: int,
    target_length: int,
    dtype: torch.dtype,
    cache_position: torch.Tensor,
    batch_size: int,
    **kwargs,
):
    """
    Build a causal 4D additive mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D
    mask of shape `(batch_size, key_value_length)`. A mask that is already 4D is returned untouched.

    Args:
        attention_mask (`torch.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
            `(batch_size, 1, query_length, key_value_length)`. May be `None`, in which case only the
            causal structure is encoded.
        sequence_length (`int`):
            The sequence length being processed.
        target_length (`int`):
            The target length: when generating with static cache, the mask should be as long as the static
            cache, to account for the 0 padding, the part of the cache that is not filled yet.
        dtype (`torch.dtype`):
            The dtype to use for the 4D attention mask.
        cache_position (`torch.Tensor`):
            Indices depicting the position of the input sequence tokens in the sequence.
        batch_size (`int`):
            Batch size.

    Returns:
        `torch.Tensor`: the 4D mask, holding `0` where attention is allowed and `torch.finfo(dtype).min`
        where it is blocked.
    """
    # Guard clause: a 4D mask is passed through as-is.
    if attention_mask is not None and attention_mask.dim() == 4:
        return attention_mask

    device = cache_position.device
    min_dtype = torch.finfo(dtype).min

    # Start fully blocked, then open up everything that is not strictly in the future.
    mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
    if sequence_length != 1:
        mask = torch.triu(mask, diagonal=1)
    key_positions = torch.arange(target_length, device=device)
    mask *= key_positions > cache_position.reshape(-1, 1)
    mask = mask[None, None, :, :].expand(batch_size, 1, -1, -1)

    if attention_mask is None:
        return mask

    # `expand` returns a non-contiguous view; clone before writing into it.
    mask = mask.clone()
    mask_length = attention_mask.shape[-1]
    covered = mask[:, :, :, :mask_length]
    # A sum of exactly 0 means "causally visible" (0) AND "padding" (0 in the 2D mask).
    is_padding = (covered + attention_mask[:, None, None, :].to(mask.device)) == 0
    mask[:, :, :, :mask_length] = covered.masked_fill(is_padding, min_dtype)
    return mask
|
|
|
|
|
|
|
from typing import List, Optional, Union |
|
|
|
from transformers.feature_extraction_utils import BatchFeature |
|
from transformers.image_utils import ImageInput |
|
from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs |
|
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput |
|
from transformers.video_utils import VideoInput |
|
|
|
|
|
class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
    """Video keyword arguments accepted by the Qwen2.5-VL processor, extending `VideosKwargs` with `fps`."""

    # Either one frame rate applied to every video, or one frame rate per video.
    fps: Union[List[float], float]
|
|
|
|
|
class Qwen2_5_VLImagesKwargs(ImagesKwargs, total=False):
    """
    Image keyword arguments accepted by the Qwen2.5-VL processor, extending `ImagesKwargs`.

    Declared with `total=False`, like the sibling kwargs dictionaries in this file, so that type checkers
    treat every key below as optional rather than required.
    """

    # NOTE(review): the meaning of these keys is defined by the image processor that receives them,
    # which is not visible here; the names suggest pixel-budget bounds and patch/merge sizes.
    min_pixels: Optional[int]
    max_pixels: Optional[int]
    patch_size: Optional[int]
    temporal_patch_size: Optional[int]
    merge_size: Optional[int]
|
|
|
|
|
class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
    """Per-modality keyword arguments for `Qwen2_5_VLProcessor`, together with their default values."""

    images_kwargs: Qwen2_5_VLImagesKwargs
    videos_kwargs: Qwen2_5_VLVideosProcessorKwargs

    # Defaults applied when the caller does not override them: no text padding, videos at 2 fps.
    _defaults = {
        "text_kwargs": {"padding": False},
        "videos_kwargs": {"fps": 2.0},
    }
|
|
|
|
|
class Qwen2_5_VLProcessor(ProcessorMixin):
    r"""
    Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor, a video processor and a Qwen2
    tokenizer into a single processor.
    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
    [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.

    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        video_processor ([`Qwen2_5_VLVideoProcessor`], *optional*):
            The video processor is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer", "video_processor"]
    valid_kwargs = ["chat_template"]

    image_processor_class = "AutoImageProcessor"
    video_processor_class = "AutoVideoProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
        # Prefer the special tokens / ids advertised by the tokenizer; otherwise fall back to the
        # literal pad tokens and look their ids up in the vocabulary.
        self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
        self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
        self.image_token_id = (
            tokenizer.image_token_id
            if getattr(tokenizer, "image_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.image_token)
        )
        self.video_token_id = (
            tokenizer.video_token_id
            if getattr(tokenizer, "video_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.video_token)
        )
        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)

    @staticmethod
    def _expand_vision_tokens(text, token, grid_thw, merge_length):
        """
        Return a new list in which every occurrence of `token` is repeated once per merged patch.

        The k-th occurrence of `token`, counted across the whole batch in order, is replaced by
        `grid_thw[k].prod() // merge_length` copies of `token`. Entries that do not contain `token`
        are passed through unchanged. The text is rebuilt with split/join so that no intermediate
        marker string is needed and nothing else in the user's text can be rewritten by accident.
        """
        expanded = []
        index = 0
        for sample in text:
            if token not in sample:
                expanded.append(sample)
                continue
            pieces = sample.split(token)
            rebuilt = [pieces[0]]
            for piece in pieces[1:]:
                num_tokens = int(grid_thw[index].prod() // merge_length)
                rebuilt.append(token * num_tokens)
                rebuilt.append(piece)
                index += 1
            expanded.append("".join(rebuilt))
        return expanded

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        videos: VideoInput = None,
        **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s), image(s) and video(s). This method
        forwards the `text` and `kwargs` arguments to the tokenizer if `text` is not `None` to encode the text.
        To prepare the vision inputs, it forwards `images` and `kwargs` to the image processor if `images` is
        not `None`, and `videos` and `kwargs` to the video processor if `videos` is not `None`.

        Each image (resp. video) token found in `text` is expanded to as many tokens as the corresponding
        image (resp. video) produces after patch merging.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). When `None`, only the
                vision features are returned.
            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
            - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.

        Raises:
            ValueError: If `fps` is a sequence whose length differs from the number of videos.
        """
        output_kwargs = self._merge_kwargs(
            Qwen2_5_VLProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        # Two distinct dicts: a chained `a = b = {}` would bind both names to the same object.
        image_inputs = {}
        videos_inputs = {}

        if images is not None:
            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
            image_grid_thw = image_inputs["image_grid_thw"]

        if videos is not None:
            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
            video_grid_thw = videos_inputs["video_grid_thw"]

            # Seconds covered by one temporal grid step = frames per temporal patch / frames per second.
            fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
            if isinstance(fps, (int, float)):
                second_per_grid_ts = [self.video_processor.temporal_patch_size / fps] * len(video_grid_thw)
            elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
                second_per_grid_ts = [self.video_processor.temporal_patch_size / tmp for tmp in fps]
            else:
                raise ValueError(
                    f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
                )
            videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})

        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)

        # `text` is optional: without it there is nothing to tokenize and only vision features are returned.
        text_inputs = {}
        if text is not None:
            if not isinstance(text, list):
                text = [text]
            # Work on a copy so the caller's list is never modified.
            text = text.copy()

            if images is not None:
                text = self._expand_vision_tokens(
                    text, self.image_token, image_grid_thw, self.image_processor.merge_size**2
                )
            if videos is not None:
                text = self._expand_vision_tokens(
                    text, self.video_token, video_grid_thw, self.video_processor.merge_size**2
                )

            text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
            self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])

        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(
        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
    ):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode` method.

        Returns:
            `List[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    @property
    def model_input_names(self):
        """Tokenizer and image-processor input names (deduplicated, order preserved) plus `second_per_grid_ts`."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
        return names_from_processor + ["second_per_grid_ts"]
|
|
|
|
|
|
|
# Public names exported by this module.
__all__ = [
    "Qwen2_5_VLForConditionalGeneration",
    "Qwen2_5_VLModel",
    "Qwen2_5_VLTextModel",
    "Qwen2_5_VLVisionConfig",
    "Qwen2_5_VLTextConfig",
    "Qwen2_5_VLPreTrainedModel",
    "Qwen2_5_VLProcessor",
    "Qwen2_5_VLConfig",
]
|
|
|
|