jupyterjazz committed (verified)
Commit da7134d · 1 Parent(s): bc22368

update-qwen-implementation (#14)


- feat: update everything to support transformers 4.52.0+ (f8c79f56aca106798a6ecb49d462a99fc6c46349)
- refactor: minor changes (ea42d55cf66de4e9120930b0b2e9deefcc0009e0)

adapters/adapter_config.json CHANGED
@@ -5,7 +5,7 @@
   "bias": "none",
   "corda_config": null,
   "eva_config": null,
-  "exclude_modules": null,
+  "exclude_modules": ".*visual.*",
   "fan_in_fan_out": false,
   "inference_mode": true,
   "init_lora_weights": "gaussian",
adapters/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7a5cb8cc0f4e10f184ccc10f8864999098b887dbc4107221ec0e400d927f4555
-size 360095344
+oid sha256:c9799872132988d3689a35300538fb97fc5b0e02c1c42f7afd914fd1d8b59a88
+size 360118024
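
Only the Git LFS pointer changes: the re-exported adapter weights have a new object id and grow slightly (360,118,024 bytes vs. 360,095,344). To confirm a local download matches the new pointer, a sketch along these lines should work, assuming the file sits at `adapters/adapter_model.safetensors` in the `jinaai/jina-embeddings-v4` repo (repo id assumed):

```python
import hashlib
from huggingface_hub import hf_hub_download

# Git LFS records the SHA-256 of the blob, so hashing the downloaded file should
# reproduce the oid from the pointer above. Repo id is an assumption.
path = hf_hub_download("jinaai/jina-embeddings-v4", "adapters/adapter_model.safetensors")
sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
assert sha.hexdigest() == "c9799872132988d3689a35300538fb97fc5b0e02c1c42f7afd914fd1d8b59a88"
```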
config.json CHANGED
@@ -37,7 +37,7 @@
   "sliding_window": 32768,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.50.0.dev0",
+  "transformers_version": "4.52.0",
   "use_cache": true,
   "use_sliding_window": false,
   "video_token_id": 151656,
modeling_jina_embeddings_v4.py CHANGED
@@ -11,7 +11,6 @@ import numpy as np
 import torch
 from huggingface_hub import snapshot_download
 from peft import PeftModel, LoraConfig
-from peft.utils.hotswap import hotswap_adapter
 from PIL import Image
 from torch import nn
 from torch.utils.data import DataLoader
@@ -19,7 +18,6 @@ from tqdm import tqdm
 from transformers import BatchFeature
 from .qwen2_5_vl import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
 from .configuration_jina_embeddings_v4 import JinaEmbeddingsV4Config
-import peft
 from .custom_lora_module import MultiAdapterLinear
 
 
@@ -177,8 +175,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
         kwargs["pixel_values"] = torch.cat(
             [pv[:o] for pv, o in zip(kwargs["pixel_values"], offsets)], dim=0
         )
-
-        position_ids, rope_deltas = super().get_rope_index(
+        position_ids, rope_deltas = self.model.get_rope_index(
             input_ids=input_ids,
             image_grid_thw=kwargs.get("image_grid_thw", None),
             attention_mask=attention_mask,
@@ -209,12 +206,12 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
         self.config.multi_vector_projector_dim = config.multi_vector_projector_dim
 
         self.single_vector_projector = nn.Linear(
-            in_features=self.config.hidden_size,
+            in_features=self.config.text_config.hidden_size,
             out_features=self.config.single_vector_projector_dim,
         )
 
         self.multi_vector_projector = nn.Linear(
-            in_features=self.config.hidden_size,
+            in_features=self.config.text_config.hidden_size,
             out_features=self.config.multi_vector_projector_dim,
         )
 
@@ -525,6 +522,8 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
 
         if torch.cuda.is_available() and "attn_implementation" not in kwargs:
             kwargs["attn_implementation"] = "flash_attention_2"
+
+        kwargs["key_mapping"] = super()._checkpoint_conversion_mapping
 
         base_model = super().from_pretrained(
             pretrained_model_name_or_path, *args, **kwargs
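
The wrapper is brought in line with transformers 4.52: the unused `hotswap_adapter` and `peft` imports go away, `get_rope_index` is reached through `self.model` (it now lives on the inner `Qwen2_5_VLModel` rather than the generation wrapper), the projector widths come from the text sub-config, and `from_pretrained` forwards the upstream `_checkpoint_conversion_mapping` as `key_mapping` so checkpoints saved under the old key layout are renamed on load. A short sketch of the config-shape difference the projector change reacts to (model id assumed for illustration):

```python
from transformers import AutoConfig

# With the 4.52 split, the decoder width sits on the text sub-config, which is why
# the projector layers above now read config.text_config.hidden_size.
config = AutoConfig.from_pretrained("jinaai/jina-embeddings-v4", trust_remote_code=True)
print(config.text_config.hidden_size)  # previously exposed as config.hidden_size
```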
preprocessor_config.json CHANGED
@@ -21,6 +21,7 @@
   "processor_class": "JinaEmbeddingsV4Processor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
+  "video_processor_type": "Qwen2VLVideoProcessor",
   "size": {
     "longest_edge": 602112,
     "shortest_edge": 3136
qwen2_5_vl.py CHANGED
@@ -24,6 +24,7 @@ class Qwen2_5_VLVisionConfig(PretrainedConfig):
24
  window_size=112,
25
  out_hidden_size=3584,
26
  fullatt_block_indexes=[7, 15, 23, 31],
 
27
  **kwargs,
28
  ):
29
  super().__init__(**kwargs)
@@ -41,11 +42,12 @@ class Qwen2_5_VLVisionConfig(PretrainedConfig):
41
  self.window_size = window_size
42
  self.fullatt_block_indexes = fullatt_block_indexes
43
  self.out_hidden_size = out_hidden_size
 
44
 
45
 
46
- class Qwen2_5_VLConfig(PretrainedConfig):
47
  r"""
48
- This is the configuration class to store the configuration of a [`Qwen2_5_VLModel`]. It is used to instantiate a
49
  Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
50
  with the defaults will yield a similar configuration to that of
51
  Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).
@@ -53,7 +55,6 @@ class Qwen2_5_VLConfig(PretrainedConfig):
53
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
54
  documentation from [`PretrainedConfig`] for more information.
55
 
56
-
57
  Args:
58
  vocab_size (`int`, *optional*, defaults to 152064):
59
  Vocabulary size of the Qwen2_5_VL model. Defines the number of different tokens that can be represented by the
@@ -96,8 +97,6 @@ class Qwen2_5_VLConfig(PretrainedConfig):
96
  The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
97
  attention_dropout (`float`, *optional*, defaults to 0.0):
98
  The dropout ratio for the attention probabilities.
99
- vision_config (`Dict`, *optional*):
100
- The config for the visual encoder initialization.
101
  rope_scaling (`Dict`, *optional*):
102
  Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
103
  and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
@@ -135,22 +134,26 @@ class Qwen2_5_VLConfig(PretrainedConfig):
135
  Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
136
  `high_freq_factor` (`float`, *optional*):
137
  Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
138
 
139
  ```python
140
- >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig
141
 
142
  >>> # Initializing a Qwen2_5_VL style configuration
143
  >>> configuration = Qwen2_5_VLConfig()
144
 
145
  >>> # Initializing a model from the Qwen2-VL-7B style configuration
146
- >>> model = Qwen2_5_VLForConditionalGeneration(configuration)
147
 
148
  >>> # Accessing the model configuration
149
  >>> configuration = model.config
150
  ```"""
151
 
152
- model_type = "qwen2_5_vl"
153
- sub_configs = {"vision_config": Qwen2_5_VLVisionConfig}
154
  keys_to_ignore_at_inference = ["past_key_values"]
155
  # Default tensor parallel plan for base model `Qwen2_5_VL`
156
  base_model_tp_plan = {
@@ -187,15 +190,11 @@ class Qwen2_5_VLConfig(PretrainedConfig):
187
  sliding_window=4096,
188
  max_window_layers=80,
189
  attention_dropout=0.0,
190
- vision_config=None,
191
  rope_scaling=None,
192
  **kwargs,
193
  ):
194
- if isinstance(vision_config, dict):
195
- self.vision_config = self.sub_configs["vision_config"](**vision_config)
196
- elif vision_config is None:
197
- self.vision_config = self.sub_configs["vision_config"]()
198
-
199
  self.vocab_size = vocab_size
200
  self.max_position_embeddings = max_position_embeddings
201
  self.hidden_size = hidden_size
@@ -221,7 +220,7 @@ class Qwen2_5_VLConfig(PretrainedConfig):
221
 
222
  # Validate the correctness of rotary position embeddings parameters
223
  # BC: if there is a 'type' field, move it to 'rope_type'.
224
- # and change type from 'mrope' to 'default' because `mrope` does defeault RoPE calculations
225
  # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
226
  # TODO: @raushan update config in the hub
227
  if self.rope_scaling is not None and "type" in self.rope_scaling:
@@ -229,10 +228,102 @@ class Qwen2_5_VLConfig(PretrainedConfig):
229
  self.rope_scaling["type"] = "default"
230
  self.rope_scaling["rope_type"] = self.rope_scaling["type"]
231
  rope_config_validation(self, ignore_keys={"mrope_section"})
232
 
233
  super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
234
 
235
 
236
 
237
  import math
238
  from dataclasses import dataclass
@@ -241,42 +332,32 @@ from typing import Any, Dict, List, Optional, Tuple, Union
241
  import torch
242
  import torch.nn as nn
243
  import torch.nn.functional as F
244
- from torch.nn import CrossEntropyLoss
245
 
246
  from transformers.activations import ACT2FN
247
  from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
248
  from transformers.generation import GenerationMixin
249
  from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 
250
  from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
251
- from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
252
  from transformers.modeling_utils import PreTrainedModel
253
- from transformers.utils import (
254
- add_start_docstrings,
255
- add_start_docstrings_to_model_forward,
256
- is_flash_attn_2_available,
257
- is_flash_attn_greater_or_equal_2_10,
258
- logging,
259
- replace_return_docstrings,
260
- )
261
 
262
- if is_flash_attn_2_available():
263
- from flash_attn import flash_attn_varlen_func
264
- from flash_attn.layers.rotary import apply_rotary_emb
265
 
266
- else:
267
- flash_attn_varlen_func = None
268
- apply_rotary_emb = None
269
 
270
 
271
- if is_flash_attn_2_available():
272
  from transformers.modeling_flash_attention_utils import _flash_attention_forward
273
- else:
274
- flash_attn_varlen_func = None
275
 
 
 
276
 
277
- logger = logging.get_logger(__name__)
278
 
279
- _CONFIG_FOR_DOC = "Qwen2_5_VLConfig"
 
280
 
281
 
282
  class Qwen2_5_VLMLP(nn.Module):
@@ -524,8 +605,10 @@ class Qwen2_5_VLVisionSdpaAttention(nn.Module):
524
  q = q.transpose(0, 1)
525
  k = k.transpose(0, 1)
526
  v = v.transpose(0, 1)
527
- attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
528
- attn_output = attn_output.transpose(0, 1)
529
  attn_output = attn_output.reshape(seq_length, -1)
530
  attn_output = self.proj(attn_output)
531
  return attn_output
@@ -565,27 +648,7 @@ class Qwen2_5_VLVisionBlock(nn.Module):
565
  return hidden_states
566
 
567
 
568
- Qwen2_5_VL_START_DOCSTRING = r"""
569
- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
570
- library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
571
- etc.)
572
-
573
- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
574
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
575
- and behavior.
576
-
577
- Parameters:
578
- config ([`Qwen2_5_VLConfig`]):
579
- Model configuration class with all the parameters of the model. Initializing with a config file does not
580
- load the weights associated with the model, only the configuration. Check out the
581
- [`~PreTrainedModel.from_pretrained`] method to load the model weights.
582
- """
583
-
584
-
585
- @add_start_docstrings(
586
- "The bare Qwen2_5_VL Model outputting raw hidden-states without any specific head on top.",
587
- Qwen2_5_VL_START_DOCSTRING,
588
- )
589
  class Qwen2_5_VLPreTrainedModel(PreTrainedModel):
590
  config_class = Qwen2_5_VLConfig
591
  base_model_prefix = "model"
@@ -598,7 +661,7 @@ class Qwen2_5_VLPreTrainedModel(PreTrainedModel):
598
  _supports_static_cache = False # TODO (joao): fix. torch.compile failing probably due to `cache_positions`
599
 
600
  def _init_weights(self, module):
601
- std = self.config.initializer_range
602
  if isinstance(module, (nn.Linear, nn.Conv3d)):
603
  module.weight.data.normal_(mean=0.0, std=std)
604
  if module.bias is not None:
@@ -607,6 +670,8 @@ class Qwen2_5_VLPreTrainedModel(PreTrainedModel):
607
  module.weight.data.normal_(mean=0.0, std=std)
608
  if module.padding_idx is not None:
609
  module.weight.data[module.padding_idx].zero_()
610
 
611
 
612
  class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
@@ -771,8 +836,44 @@ class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
771
  return hidden_states
772
 
773
 
774
  class Qwen2_5_VLRotaryEmbedding(nn.Module):
775
- def __init__(self, config: Qwen2_5_VLConfig, device=None):
776
  super().__init__()
777
  # BC: "rope_type" was originally "type"
778
  if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
@@ -789,45 +890,20 @@ class Qwen2_5_VLRotaryEmbedding(nn.Module):
789
  self.register_buffer("inv_freq", inv_freq, persistent=False)
790
  self.original_inv_freq = self.inv_freq
791
 
792
- def _dynamic_frequency_update(self, position_ids, device):
793
- """
794
- dynamic RoPE layers should recompute `inv_freq` in the following situations:
795
- 1 - growing beyond the cached sequence length (allow scaling)
796
- 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
797
- """
798
- seq_len = torch.max(position_ids) + 1
799
- if seq_len > self.max_seq_len_cached: # growth
800
- inv_freq, self.attention_scaling = self.rope_init_fn(
801
- self.config, device, seq_len=seq_len, **self.rope_kwargs
802
- )
803
- self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
804
- self.max_seq_len_cached = seq_len
805
-
806
- if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
807
- self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
808
- self.max_seq_len_cached = self.original_max_seq_len
809
-
810
  @torch.no_grad()
 
811
  def forward(self, x, position_ids):
812
- if "dynamic" in self.rope_type:
813
- self._dynamic_frequency_update(position_ids, device=x.device)
814
-
815
- # Core RoPE block. In contrast to other models, Qwen2_5_VL has different position ids for thw grids
816
  # So we expand the inv_freq to shape (3, ...)
817
  inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
818
  position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions)
819
- # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
820
- device_type = x.device.type
821
- device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
822
- with torch.autocast(device_type=device_type, enabled=False):
823
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
824
  emb = torch.cat((freqs, freqs), dim=-1)
825
- cos = emb.cos()
826
- sin = emb.sin()
827
-
828
- # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
829
- cos = cos * self.attention_scaling
830
- sin = sin * self.attention_scaling
831
 
832
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
833
 
@@ -844,7 +920,7 @@ class Qwen2MLP(nn.Module):
844
  self.act_fn = ACT2FN[config.hidden_act]
845
 
846
  def forward(self, x, task_label: Union[str, List[str]]):
847
- down_proj = self.down_proj(self.act_fn(self.gate_proj(x, task_label=task_label)) * self.up_proj(x, task_label=task_label), task_label=task_label)
848
  return down_proj
849
 
850
 
@@ -854,7 +930,7 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim
854
  Explanation:
855
  Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
856
  sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
857
- vision embedding part, we apply rotary position embedding on temporal, height and width dimension seperately.
858
  Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
859
  For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
860
  height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
@@ -911,7 +987,7 @@ class Qwen2_5_VLAttention(nn.Module):
911
  and "Generating Long Sequences with Sparse Transformers".
912
  """
913
 
914
- def __init__(self, config: Qwen2_5_VLConfig, layer_idx: Optional[int] = None):
915
  super().__init__()
916
  self.config = config
917
  self.layer_idx = layer_idx
@@ -957,9 +1033,9 @@ class Qwen2_5_VLAttention(nn.Module):
957
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
958
  bsz, q_len, _ = hidden_states.size()
959
 
960
- query_states = self.q_proj(hidden_states, task_label=task_label)
961
- key_states = self.k_proj(hidden_states, task_label=task_label)
962
- value_states = self.v_proj(hidden_states, task_label=task_label)
963
 
964
  query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
965
  key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
@@ -1003,7 +1079,7 @@ class Qwen2_5_VLAttention(nn.Module):
1003
  attn_output = attn_output.transpose(1, 2).contiguous()
1004
  attn_output = attn_output.reshape(bsz, q_len, -1)
1005
 
1006
- attn_output = self.o_proj(attn_output, task_label=task_label)
1007
 
1008
  if not output_attentions:
1009
  attn_weights = None
@@ -1022,10 +1098,11 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
1022
 
1023
  def __init__(self, *args, **kwargs):
1024
  super().__init__(*args, **kwargs)
 
1025
  # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
1026
- # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
1027
  # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
1028
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
1029
 
1030
  def forward(
1031
  self,
@@ -1041,9 +1118,9 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
1041
  ):
1042
  bsz, q_len, _ = hidden_states.size()
1043
 
1044
- query_states = self.q_proj(hidden_states, task_label=task_label)
1045
- key_states = self.k_proj(hidden_states, task_label=task_label)
1046
- value_states = self.v_proj(hidden_states, task_label=task_label)
1047
 
1048
  query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1049
  key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
@@ -1113,8 +1190,8 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
1113
  use_top_left_mask=self._flash_attn_uses_top_left_mask,
1114
  )
1115
 
1116
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
1117
- attn_output = self.o_proj(attn_output, task_label=task_label)
1118
 
1119
  if not output_attentions:
1120
  attn_weights = None
@@ -1149,6 +1226,7 @@ class Qwen2_5_VLSdpaAttention(Qwen2_5_VLAttention):
1149
  'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
1150
  )
1151
  return super().forward(
 
1152
  hidden_states=hidden_states,
1153
  attention_mask=attention_mask,
1154
  position_ids=position_ids,
@@ -1161,9 +1239,9 @@ class Qwen2_5_VLSdpaAttention(Qwen2_5_VLAttention):
1161
 
1162
  bsz, q_len, _ = hidden_states.size()
1163
 
1164
- query_states = self.q_proj(hidden_states, task_label=task_label)
1165
- key_states = self.k_proj(hidden_states, task_label=task_label)
1166
- value_states = self.v_proj(hidden_states, task_label=task_label)
1167
 
1168
  query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1169
  key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
@@ -1207,9 +1285,9 @@ class Qwen2_5_VLSdpaAttention(Qwen2_5_VLAttention):
1207
  )
1208
 
1209
  attn_output = attn_output.transpose(1, 2).contiguous()
1210
- attn_output = attn_output.view(bsz, q_len, self.hidden_size)
1211
 
1212
- attn_output = self.o_proj(attn_output, task_label=task_label)
1213
 
1214
  return attn_output, None, past_key_value
1215
 
@@ -1222,7 +1300,7 @@ QWEN2_5_VL_ATTENTION_CLASSES = {
1222
 
1223
 
1224
  class Qwen2_5_VLDecoderLayer(nn.Module):
1225
- def __init__(self, config: Qwen2_5_VLConfig, layer_idx: int):
1226
  super().__init__()
1227
  self.hidden_size = config.hidden_size
1228
 
@@ -1293,7 +1371,7 @@ class Qwen2_5_VLDecoderLayer(nn.Module):
1293
  # Fully Connected
1294
  residual = hidden_states
1295
  hidden_states = self.post_attention_layernorm(hidden_states)
1296
- hidden_states = self.mlp(hidden_states, task_label=task_label)
1297
  hidden_states = residual + hidden_states
1298
 
1299
  outputs = (hidden_states,)
@@ -1307,12 +1385,11 @@ class Qwen2_5_VLDecoderLayer(nn.Module):
1307
  return outputs
1308
 
1309
 
1310
- @add_start_docstrings(
1311
- "The bare Qwen2_5_VL Model outputting raw hidden-states without any specific head on top.",
1312
- Qwen2_5_VL_START_DOCSTRING,
1313
- )
1314
- class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1315
- def __init__(self, config: Qwen2_5_VLConfig):
1316
  super().__init__(config)
1317
  self.padding_idx = config.pad_token_id
1318
  self.vocab_size = config.vocab_size
@@ -1335,10 +1412,11 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1335
  def set_input_embeddings(self, value):
1336
  self.embed_tokens = value
1337
 
 
1338
  def forward(
1339
  self,
1340
  task_label: Union[str, List[str]],
1341
- input_ids: torch.LongTensor = None,
1342
  attention_mask: Optional[torch.Tensor] = None,
1343
  position_ids: Optional[torch.LongTensor] = None,
1344
  past_key_values: Optional[List[torch.FloatTensor]] = None,
@@ -1349,6 +1427,13 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1349
  return_dict: Optional[bool] = None,
1350
  cache_position: Optional[torch.LongTensor] = None,
1351
  ) -> Union[Tuple, BaseModelOutputWithPast]:
1352
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1353
  output_hidden_states = (
1354
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1407,6 +1492,7 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1407
  if self.gradient_checkpointing and self.training:
1408
  layer_outputs = self._gradient_checkpointing_func(
1409
  decoder_layer.__call__,
 
1410
  hidden_states,
1411
  causal_mask,
1412
  position_ids,
@@ -1456,11 +1542,11 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1456
 
1457
  def _update_causal_mask(
1458
  self,
1459
- attention_mask: torch.Tensor,
1460
  input_tensor: torch.Tensor,
1461
  cache_position: torch.Tensor,
1462
  past_key_values: Cache,
1463
- output_attentions: bool,
1464
  ):
1465
  if self.config._attn_implementation == "flash_attention_2":
1466
  if attention_mask is not None and past_key_values is not None:
@@ -1474,6 +1560,10 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1474
  if attention_mask is not None and 0.0 in attention_mask:
1475
  return attention_mask
1476
  return None
1477
 
1478
  # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
1479
  # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
@@ -1497,7 +1587,7 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1497
  ):
1498
  return None
1499
 
1500
- dtype, device = input_tensor.dtype, input_tensor.device
1501
  min_dtype = torch.finfo(dtype).min
1502
  sequence_length = input_tensor.shape[1]
1503
  # SlidingWindowCache or StaticCache
@@ -1517,7 +1607,6 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1517
  sequence_length=sequence_length,
1518
  target_length=target_length,
1519
  dtype=dtype,
1520
- device=device,
1521
  cache_position=cache_position,
1522
  batch_size=input_tensor.shape[0],
1523
  config=self.config,
@@ -1527,7 +1616,7 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1527
  if (
1528
  self.config._attn_implementation == "sdpa"
1529
  and attention_mask is not None
1530
- and attention_mask.device.type in ["cuda", "xpu"]
1531
  and not output_attentions
1532
  ):
1533
  # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
@@ -1543,7 +1632,6 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1543
  sequence_length: int,
1544
  target_length: int,
1545
  dtype: torch.dtype,
1546
- device: torch.device,
1547
  cache_position: torch.Tensor,
1548
  batch_size: int,
1549
  config: Qwen2_5_VLConfig,
@@ -1562,8 +1650,6 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1562
  The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
1563
  dtype (`torch.dtype`):
1564
  The dtype to use for the 4D attention mask.
1565
- device (`torch.device`):
1566
- The device to plcae the 4D attention mask on.
1567
  cache_position (`torch.Tensor`):
1568
  Indices depicting the position of the input sequence tokens in the sequence.
1569
  batch_size (`torch.Tensor`):
@@ -1579,15 +1665,18 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1579
  else:
1580
  min_dtype = torch.finfo(dtype).min
1581
  causal_mask = torch.full(
1582
- (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
1583
  )
1584
- diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
1585
- if config.sliding_window is not None:
1586
  # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
1587
  # the check is needed to verify is current checkpoint was trained with sliding window or not
1588
  if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
1589
- sliding_attend_mask = torch.arange(target_length, device=device) <= (
1590
- cache_position.reshape(-1, 1) - config.sliding_window
1591
  )
1592
  diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
1593
  causal_mask *= diagonal_attend_mask
@@ -1607,154 +1696,27 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1607
  return causal_mask
1608
 
1609
 
1610
- @dataclass
1611
- class Qwen2_5_VLCausalLMOutputWithPast(ModelOutput):
1612
- """
1613
- Base class for Qwen2_5_VL causal language model (or autoregressive) outputs.
1614
-
1615
- Args:
1616
- loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
1617
- Language modeling loss (for next-token prediction).
1618
- logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
1619
- Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
1620
- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
1621
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
1622
- `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
1623
-
1624
- Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
1625
- `past_key_values` input) to speed up sequential decoding.
1626
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
1627
- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
1628
- one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
1629
-
1630
- Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
1631
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
1632
- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
1633
- sequence_length)`.
1634
-
1635
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
1636
- heads.
1637
- rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
1638
- The rope index difference between sequence length and multimodal rope.
1639
- """
1640
-
1641
- loss: Optional[torch.FloatTensor] = None
1642
- logits: torch.FloatTensor = None
1643
- past_key_values: Optional[List[torch.FloatTensor]] = None
1644
- hidden_states: Optional[Tuple[torch.FloatTensor]] = None
1645
- attentions: Optional[Tuple[torch.FloatTensor]] = None
1646
- rope_deltas: Optional[torch.LongTensor] = None
1647
-
1648
-
1649
- QWEN2_5_VL_INPUTS_DOCSTRING = r"""
1650
- Args:
1651
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1652
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1653
- it.
1654
-
1655
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1656
- [`PreTrainedTokenizer.__call__`] for details.
1657
-
1658
- [What are input IDs?](../glossary#input-ids)
1659
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1660
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1661
-
1662
- - 1 for tokens that are **not masked**,
1663
- - 0 for tokens that are **masked**.
1664
-
1665
- [What are attention masks?](../glossary#attention-mask)
1666
-
1667
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1668
- [`PreTrainedTokenizer.__call__`] for details.
1669
-
1670
- If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
1671
- `past_key_values`).
1672
-
1673
- If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
1674
- and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
1675
- information on the default strategy.
1676
-
1677
- - 1 indicates the head is **not masked**,
1678
- - 0 indicates the head is **masked**.
1679
- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1680
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1681
- config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
1682
- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
1683
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
1684
- `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
1685
- `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
1686
-
1687
- Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1688
- blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
1689
-
1690
- If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
1691
- don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
1692
- `decoder_input_ids` of shape `(batch_size, sequence_length)`.
1693
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1694
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1695
- is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1696
- model's internal embedding lookup matrix.
1697
- use_cache (`bool`, *optional*):
1698
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1699
- `past_key_values`).
1700
- output_attentions (`bool`, *optional*):
1701
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1702
- tensors for more detail.
1703
- output_hidden_states (`bool`, *optional*):
1704
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1705
- more detail.
1706
- return_dict (`bool`, *optional*):
1707
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1708
- pixel_values (`torch.FloatTensor` of shape `(seq_length, num_channels * image_size * image_size)):
1709
- The tensors corresponding to the input images. Pixel values can be obtained using
1710
- [`AutoImageProcessor`]. See [`Qwen2_5_VLImageProcessor.__call__`] for details. [`Qwen2_5_VLProcessor`] uses
1711
- [`Qwen2_5_VLImageProcessor`] for processing images.
1712
- pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)):
1713
- The tensors corresponding to the input videos. Pixel values can be obtained using
1714
- [`AutoImageProcessor`]. See [`Qwen2_5_VLImageProcessor.__call__`] for details. [`Qwen2_5_VLProcessor`] uses
1715
- [`Qwen2_5_VLImageProcessor`] for processing videos.
1716
- image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1717
- The temporal, height and width of feature shape of each image in LLM.
1718
- video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1719
- The temporal, height and width of feature shape of each video in LLM.
1720
- rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
1721
- The rope index difference between sequence length and multimodal rope.
1722
- """
1723
-
1724
-
1725
- class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMixin):
1726
- _tied_weights_keys = ["lm_head.weight"]
1727
  config_class = Qwen2_5_VLConfig
1728
  _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]
1729
 
1730
  def __init__(self, config):
1731
  super().__init__(config)
1732
  self.visual = Qwen2_5_VisionTransformerPretrainedModel._from_config(config.vision_config)
1733
- self.model = Qwen2_5_VLModel(config)
1734
- self.vocab_size = config.vocab_size
1735
- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1736
  self.rope_deltas = None # cache rope_deltas here
1737
 
1738
  # Initialize weights and apply final processing
1739
  self.post_init()
1740
 
1741
  def get_input_embeddings(self):
1742
- return self.model.embed_tokens
1743
 
1744
  def set_input_embeddings(self, value):
1745
- self.model.embed_tokens = value
1746
-
1747
- def get_output_embeddings(self):
1748
- return self.lm_head
1749
-
1750
- def set_output_embeddings(self, new_embeddings):
1751
- self.lm_head = new_embeddings
1752
-
1753
- def set_decoder(self, decoder):
1754
- self.model = decoder
1755
-
1756
- def get_decoder(self):
1757
- return self.model
1758
 
1759
  def get_rope_index(
1760
  self,
@@ -1778,7 +1740,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
1778
  width position_ids: [0, 1, 2, 3, 4]
1779
 
1780
  For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
1781
- and 1D rotary position embeddin for text part.
1782
  Examples:
1783
  Temporal (Time): 3 patches, representing different segments of the video in time.
1784
  Height: 2 patches, dividing each frame vertically.
@@ -1892,6 +1854,11 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
1892
  range_tensor = torch.arange(llm_grid_t).view(-1, 1)
1893
  expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
1894
 
 
 
 
 
 
1895
  time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
1896
 
1897
  time_tensor_long = time_tensor.long()
@@ -1933,8 +1900,37 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
1933
 
1934
  return position_ids, mrope_position_deltas
1935
 
1936
- @add_start_docstrings_to_model_forward(QWEN2_5_VL_INPUTS_DOCSTRING)
1937
- @replace_return_docstrings(output_type=Qwen2_5_VLCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1938
  def forward(
1939
  self,
1940
  task_label: Union[str, List[str]],
@@ -1943,7 +1939,6 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
1943
  position_ids: Optional[torch.LongTensor] = None,
1944
  past_key_values: Optional[List[torch.FloatTensor]] = None,
1945
  inputs_embeds: Optional[torch.FloatTensor] = None,
1946
- labels: Optional[torch.LongTensor] = None,
1947
  use_cache: Optional[bool] = None,
1948
  output_attentions: Optional[bool] = None,
1949
  output_hidden_states: Optional[bool] = None,
@@ -1955,45 +1950,25 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
1955
  rope_deltas: Optional[torch.LongTensor] = None,
1956
  cache_position: Optional[torch.LongTensor] = None,
1957
  second_per_grid_ts: Optional[torch.Tensor] = None,
1958
- ) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
1959
  r"""
1960
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1961
- Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1962
- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1963
- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1964
-
1965
- Returns:
1966
-
1967
- Example:
1968
-
1969
- ```python
1970
- >>> from PIL import Image
1971
- >>> import requests
1972
- >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
1973
-
1974
- >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
1975
- >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
1976
-
1977
- >>> messages = [
1978
- {
1979
- "role": "user",
1980
- "content": [
1981
- {"type": "image"},
1982
- {"type": "text", "text": "What is shown in this image?"},
1983
- ],
1984
- },
1985
- ]
1986
- >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
1987
- >>> image = Image.open(requests.get(url, stream=True).raw)
1988
-
1989
- >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
1990
- >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])
1991
-
1992
- >>> # Generate
1993
- >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1994
- >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1995
- "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
1996
- ```"""
1997
 
1998
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1999
  output_hidden_states = (
@@ -2002,10 +1977,9 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
2002
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2003
 
2004
  if inputs_embeds is None:
2005
- inputs_embeds = self.model.embed_tokens(input_ids)
2006
  if pixel_values is not None:
2007
- pixel_values = pixel_values.type(self.visual.dtype)
2008
- image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
2009
  n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
2010
  n_image_features = image_embeds.shape[0]
2011
  if n_image_tokens != n_image_features:
@@ -2022,8 +1996,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
2022
  inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
2023
 
2024
  if pixel_values_videos is not None:
2025
- pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
2026
- video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
2027
  n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
2028
  n_video_features = video_embeds.shape[0]
2029
  if n_video_tokens != n_video_features:
@@ -2073,7 +2046,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
2073
  position_ids = position_ids.add(delta)
2074
  position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
2075
 
2076
- outputs = self.model(
2077
  task_label=task_label,
2078
  input_ids=None,
2079
  position_ids=position_ids,
@@ -2083,6 +2056,197 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
2083
  use_cache=use_cache,
2084
  output_attentions=output_attentions,
2085
  output_hidden_states=output_hidden_states,
2086
  return_dict=return_dict,
2087
  cache_position=cache_position,
2088
  )
@@ -2092,18 +2256,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
2092
 
2093
  loss = None
2094
  if labels is not None:
2095
- # Upcast to float if we need to compute the loss to avoid potential precision issues
2096
- logits = logits.float()
2097
- # Shift so that tokens < n predict n
2098
- shift_logits = logits[..., :-1, :].contiguous()
2099
- shift_labels = labels[..., 1:].contiguous()
2100
- # Flatten the tokens
2101
- loss_fct = CrossEntropyLoss()
2102
- shift_logits = shift_logits.view(-1, self.config.vocab_size)
2103
- shift_labels = shift_labels.view(-1)
2104
- # Enable model parallelism
2105
- shift_labels = shift_labels.to(shift_logits.device)
2106
- loss = loss_fct(shift_logits, shift_labels)
2107
 
2108
  if not return_dict:
2109
  output = (logits,) + outputs[1:]
@@ -2115,7 +2268,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
2115
  past_key_values=outputs.past_key_values,
2116
  hidden_states=outputs.hidden_states,
2117
  attentions=outputs.attentions,
2118
- rope_deltas=self.rope_deltas,
2119
  )
2120
 
2121
  def prepare_inputs_for_generation(
@@ -2283,20 +2436,86 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
2283
 
2284
  return input_ids, model_kwargs
2285
 
2286
 
2287
- from typing import List, Union
 
2288
 
2289
  from transformers.feature_extraction_utils import BatchFeature
2290
- from transformers.image_utils import ImageInput, VideoInput
2291
- from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
2292
  from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 
2293
 
2294
 
2295
  class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
2296
  fps: Union[List[float], float]
2297
 
2298
 
2299
  class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
 
2300
  videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
2301
  _defaults = {
2302
  "text_kwargs": {
@@ -2316,20 +2535,33 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
2316
  The image processor is a required input.
2317
  tokenizer ([`Qwen2TokenizerFast`], *optional*):
2318
  The tokenizer is a required input.
 
 
2319
  chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
2320
  in a chat into a tokenizable string.
2321
  """
2322
 
2323
- attributes = ["image_processor", "tokenizer"]
2324
  valid_kwargs = ["chat_template"]
2325
 
2326
  image_processor_class = "AutoImageProcessor"
 
2327
  tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
2328
 
2329
- def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
2330
  self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
2331
  self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
2332
- super().__init__(image_processor, tokenizer, chat_template=chat_template)
2333
 
2334
  def __call__(
2335
  self,
@@ -2380,64 +2612,56 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
2380
  tokenizer_init_kwargs=self.tokenizer.init_kwargs,
2381
  **kwargs,
2382
  )
 
 
2383
  if images is not None:
2384
- image_inputs = self.image_processor(images=images, videos=None, **output_kwargs["images_kwargs"])
2385
  image_grid_thw = image_inputs["image_grid_thw"]
2386
- else:
2387
- image_inputs = {}
2388
- image_grid_thw = None
2389
 
2390
  if videos is not None:
2391
- videos_inputs = self.image_processor(images=None, videos=videos, **output_kwargs["images_kwargs"])
2392
  video_grid_thw = videos_inputs["video_grid_thw"]
2393
 
2394
  fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
2395
  if isinstance(fps, (int, float)):
2396
- second_per_grid_ts = [self.image_processor.temporal_patch_size / fps] * len(video_grid_thw)
2397
  elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
2398
- second_per_grid_ts = [self.image_processor.temporal_patch_size / tmp for tmp in fps]
2399
  else:
2400
  raise ValueError(
2401
  f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
2402
  )
2403
  videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
2404
 
2405
- else:
2406
- videos_inputs = {}
2407
- video_grid_thw = None
2408
-
2409
  if not isinstance(text, list):
2410
  text = [text]
2411
 
2412
- if image_grid_thw is not None:
 
2413
  merge_length = self.image_processor.merge_size**2
2414
  index = 0
2415
  for i in range(len(text)):
2416
  while self.image_token in text[i]:
2417
- text[i] = text[i].replace(
2418
- self.image_token,
2419
- "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length),
2420
- 1,
2421
- )
2422
  index += 1
2423
  text[i] = text[i].replace("<|placeholder|>", self.image_token)
2424
 
2425
- if video_grid_thw is not None:
2426
- merge_length = self.image_processor.merge_size**2
2427
  index = 0
2428
  for i in range(len(text)):
2429
  while self.video_token in text[i]:
2430
- text[i] = text[i].replace(
2431
- self.video_token,
2432
- "<|placeholder|>" * (video_grid_thw[index].prod() // merge_length),
2433
- 1,
2434
- )
2435
  index += 1
2436
  text[i] = text[i].replace("<|placeholder|>", self.video_token)
2437
 
 
2438
  text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
 
2439
 
2440
- return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
2441
 
2442
  def batch_decode(self, *args, **kwargs):
2443
  """
@@ -2465,7 +2689,7 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
2465
  or `(sequence_length,)`.
2466
  skip_special_tokens (`bool`, *optional*, defaults to `True`):
2467
  Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
2468
- Clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
2469
  Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
2470
  **kwargs:
2471
  Additional arguments to be passed to the tokenizer's `batch_decode method`.
@@ -2488,5 +2712,6 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
2488
  return names_from_processor + ["second_per_grid_ts"]
2489
 
2490
 
2491
- __all__ = ["Qwen2_5_VLForConditionalGeneration", "Qwen2_5_VLModel", "Qwen2_5_VLPreTrainedModel", "Qwen2_5_VLProcessor", "Qwen2_5_VLConfig"]
 
2492
 
 
24
  window_size=112,
25
  out_hidden_size=3584,
26
  fullatt_block_indexes=[7, 15, 23, 31],
27
+ initializer_range=0.02,
28
  **kwargs,
29
  ):
30
  super().__init__(**kwargs)
 
42
  self.window_size = window_size
43
  self.fullatt_block_indexes = fullatt_block_indexes
44
  self.out_hidden_size = out_hidden_size
45
+ self.initializer_range = initializer_range
46
 
47
 
48
+ class Qwen2_5_VLTextConfig(PretrainedConfig):
49
  r"""
50
+ This is the configuration class to store the configuration of a [`Qwen2_5_VLTextModel`]. It is used to instantiate a
51
  Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
52
  with the defaults will yield a similar configuration to that of
53
  Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).
 
55
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
56
  documentation from [`PretrainedConfig`] for more information.
57
 
 
58
  Args:
59
  vocab_size (`int`, *optional*, defaults to 152064):
60
  Vocabulary size of the Qwen2_5_VL model. Defines the number of different tokens that can be represented by the
 
97
  The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
98
  attention_dropout (`float`, *optional*, defaults to 0.0):
99
  The dropout ratio for the attention probabilities.
 
 
100
  rope_scaling (`Dict`, *optional*):
101
  Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
102
  and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
 
134
  Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
135
  `high_freq_factor` (`float`, *optional*):
136
  Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
137
+ image_token_id (`int`, *optional*):
138
+ Token index used as placeholder for image embeddings.
139
+ video_token_id (`int`, *optional*):
140
+ Token index used as placeholder for video embeddings.
141
 
142
  ```python
143
+ >>> from transformers import Qwen2_5_VLTextModel, Qwen2_5_VLConfig
144
 
145
  >>> # Initializing a Qwen2_5_VL style configuration
146
  >>> configuration = Qwen2_5_VLConfig()
147
 
148
  >>> # Initializing a model from the Qwen2-VL-7B style configuration
149
+ >>> model = Qwen2_5_VLTextModel(configuration)
150
 
151
  >>> # Accessing the model configuration
152
  >>> configuration = model.config
153
  ```"""
154
 
155
+ model_type = "qwen2_5_vl_text"
156
+ base_config_key = "text_config"
157
  keys_to_ignore_at_inference = ["past_key_values"]
158
  # Default tensor parallel plan for base model `Qwen2_5_VL`
159
  base_model_tp_plan = {
 
190
  sliding_window=4096,
191
  max_window_layers=80,
192
  attention_dropout=0.0,
 
193
  rope_scaling=None,
194
+ image_token_id=None,
195
+ video_token_id=None,
196
  **kwargs,
197
  ):
 
198
  self.vocab_size = vocab_size
199
  self.max_position_embeddings = max_position_embeddings
200
  self.hidden_size = hidden_size
 
220
 
221
  # Validate the correctness of rotary position embeddings parameters
222
  # BC: if there is a 'type' field, move it to 'rope_type'.
223
+ # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
224
  # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
225
  # TODO: @raushan update config in the hub
226
  if self.rope_scaling is not None and "type" in self.rope_scaling:
 
228
  self.rope_scaling["type"] = "default"
229
  self.rope_scaling["rope_type"] = self.rope_scaling["type"]
230
  rope_config_validation(self, ignore_keys={"mrope_section"})
231
+ self.image_token_id = image_token_id
232
+ self.video_token_id = video_token_id
233
 
234
  super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
235
 
236
 
237
+ class Qwen2_5_VLConfig(PretrainedConfig):
238
+ r"""
239
+ This is the configuration class to store the configuration of a [`Qwen2_5_VLModel`]. It is used to instantiate a
240
+ Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
241
+ with the defaults will yield a similar configuration to that of
242
+ Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).
243
+
244
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
245
+ documentation from [`PretrainedConfig`] for more information.
246
+
247
+
248
+ Args:
249
+ text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen2_5_VLTextConfig`):
250
+ The config object or dictionary of the text backbone.
251
+ vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen2_5_VLVisionConfig`):
252
+ The config object or dictionary of the vision backbone.
253
+ image_token_id (`int`, *optional*, defaults to 151655):
254
+ The image token index to encode the image prompt.
255
+ video_token_id (`int`, *optional*, defaults to 151656):
256
+ The video token index to encode the image prompt.
257
+
258
+ ```python
259
+ >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig
260
+
261
+ >>> # Initializing a Qwen2_5_VL style configuration
262
+ >>> configuration = Qwen2_5_VLConfig()
263
+
264
+ >>> # Initializing a model from the Qwen2-VL-7B style configuration
265
+ >>> model = Qwen2_5_VLForConditionalGeneration(configuration)
266
+
267
+ >>> # Accessing the model configuration
268
+ >>> configuration = model.config
269
+ ```"""
270
+
271
+ model_type = "qwen2_5_vl"
272
+ sub_configs = {"vision_config": Qwen2_5_VLVisionConfig, "text_config": Qwen2_5_VLTextConfig}
273
+ keys_to_ignore_at_inference = ["past_key_values"]
274
+
275
+ def __init__(
276
+ self,
277
+ text_config=None,
278
+ vision_config=None,
279
+ image_token_id=151655,
280
+ video_token_id=151656,
281
+ **kwargs,
282
+ ):
283
+ if isinstance(vision_config, dict):
284
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
285
+ elif vision_config is None:
286
+ self.vision_config = self.sub_configs["vision_config"]()
287
+
288
+ if isinstance(text_config, dict):
289
+ self.text_config = self.sub_configs["text_config"](**text_config)
290
+ elif text_config is None:
291
+ # For BC use all kwargs to init `TextConfig`
292
+ self.text_config = self.sub_configs["text_config"](**kwargs)
293
+
294
+ self.image_token_id = image_token_id
295
+ self.video_token_id = video_token_id
296
+
297
+ super().__init__(**kwargs)
298
+
299
+
300
+
301
+
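Since the composite config now nests `text_config`/`vision_config`, here is a minimal sketch of the two construction paths handled in `__init__` above. It is shown with the upstream class of the same name (assuming transformers >= 4.52 exports `Qwen2_5_VLConfig`; the copy in this file mirrors it), so treat it as illustrative rather than a test of this exact file.

```python
# Minimal sketch of the two construction paths handled by Qwen2_5_VLConfig.__init__.
from transformers import Qwen2_5_VLConfig

# 1) Explicit sub-config dicts are materialized into the sub-config classes.
cfg = Qwen2_5_VLConfig(
    text_config={"hidden_size": 2048, "num_hidden_layers": 4},
    vision_config={"depth": 4},
)
print(type(cfg.text_config).__name__, cfg.text_config.hidden_size)  # Qwen2_5_VLTextConfig 2048

# 2) Backwards-compat path: flat kwargs (no text_config key) seed the text sub-config.
legacy = Qwen2_5_VLConfig(hidden_size=2048, num_hidden_layers=4)
print(legacy.text_config.hidden_size)  # 2048
```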
302
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
303
+ # This file was automatically generated from src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py.
304
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
305
+ # the file from the modular. If any change should be done, please apply the change to the
306
+ # modular_qwen2_5_vl.py file directly. One of our CI enforces this.
307
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
308
+ # coding=utf-8
309
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
310
+ #
311
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
312
+ # and OPT implementations in this library. It has been modified from its
313
+ # original forms to accommodate minor architectural differences compared
314
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
315
+ #
316
+ # Licensed under the Apache License, Version 2.0 (the "License");
317
+ # you may not use this file except in compliance with the License.
318
+ # You may obtain a copy of the License at
319
+ #
320
+ # http://www.apache.org/licenses/LICENSE-2.0
321
+ #
322
+ # Unless required by applicable law or agreed to in writing, software
323
+ # distributed under the License is distributed on an "AS IS" BASIS,
324
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
325
+ # See the License for the specific language governing permissions and
326
+ # limitations under the License.
327
 
328
  import math
329
  from dataclasses import dataclass
 
332
  import torch
333
  import torch.nn as nn
334
  import torch.nn.functional as F
 
335
 
336
  from transformers.activations import ACT2FN
337
  from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
338
  from transformers.generation import GenerationMixin
339
  from transformers.modeling_attn_mask_utils import AttentionMaskConverter
340
+ from transformers.modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
341
  from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
342
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
343
  from transformers.modeling_utils import PreTrainedModel
344
+ from transformers.utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
 
 
 
 
 
 
 
345
 
 
 
 
346
 
347
+ if is_flash_attn_available():
348
+ from transformers.modeling_flash_attention_utils import apply_rotary_emb, flash_attn_varlen_func
 
349
 
350
 
351
+ if is_flash_attn_available():
352
  from transformers.modeling_flash_attention_utils import _flash_attention_forward
 
 
353
 
354
+ if is_torch_flex_attn_available():
355
+ from torch.nn.attention.flex_attention import BlockMask
356
 
357
+ from transformers.integrations.flex_attention import make_flex_block_causal_mask
358
 
359
+
360
+ logger = logging.get_logger(__name__)
361
 
362
 
363
  class Qwen2_5_VLMLP(nn.Module):
 
605
  q = q.transpose(0, 1)
606
  k = k.transpose(0, 1)
607
  v = v.transpose(0, 1)
608
+ attn_output = F.scaled_dot_product_attention(
609
+ q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0), attention_mask, dropout_p=0.0
610
+ )
611
+ attn_output = attn_output.squeeze(0).transpose(0, 1)
612
  attn_output = attn_output.reshape(seq_length, -1)
613
  attn_output = self.proj(attn_output)
614
  return attn_output
 
648
  return hidden_states
649
 
650
 
651
+ @auto_docstring
652
  class Qwen2_5_VLPreTrainedModel(PreTrainedModel):
653
  config_class = Qwen2_5_VLConfig
654
  base_model_prefix = "model"
 
661
  _supports_static_cache = False # TODO (joao): fix. torch.compile failing probably due to `cache_positions`
662
 
663
  def _init_weights(self, module):
664
+ std = self.config.get_text_config().initializer_range
665
  if isinstance(module, (nn.Linear, nn.Conv3d)):
666
  module.weight.data.normal_(mean=0.0, std=std)
667
  if module.bias is not None:
 
670
  module.weight.data.normal_(mean=0.0, std=std)
671
  if module.padding_idx is not None:
672
  module.weight.data[module.padding_idx].zero_()
673
+ elif isinstance(module, Qwen2RMSNorm):
674
+ module.weight.data.fill_(1.0)
675
 
676
 
677
  class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
 
836
  return hidden_states
837
 
838
 
839
+ @dataclass
840
+ class Qwen2_5_VLModelOutputWithPast(ModelOutput):
841
+ """
842
+ Base class for Qwen2_5_VL model outputs, with hidden states and attentions.
843
+
844
+ Args:
845
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
846
+ Sequence of hidden-states at the output of the last layer of the model.
847
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
848
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
849
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
850
+
851
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
852
+ `past_key_values` input) to speed up sequential decoding.
853
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
854
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
855
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
856
+
857
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
858
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
859
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
860
+ sequence_length)`.
861
+
862
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
863
+ heads.
864
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
865
+ The rope index difference between sequence length and multimodal rope.
866
+ """
867
+
868
+ last_hidden_state: torch.FloatTensor = None
869
+ past_key_values: Optional[List[torch.FloatTensor]] = None
870
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
871
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
872
+ rope_deltas: Optional[torch.LongTensor] = None
873
+
874
+
875
  class Qwen2_5_VLRotaryEmbedding(nn.Module):
876
+ def __init__(self, config: Qwen2_5_VLTextConfig, device=None):
877
  super().__init__()
878
  # BC: "rope_type" was originally "type"
879
  if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
 
890
  self.register_buffer("inv_freq", inv_freq, persistent=False)
891
  self.original_inv_freq = self.inv_freq
892
 
893
  @torch.no_grad()
894
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
895
  def forward(self, x, position_ids):
896
+ # In contrast to other models, Qwen2_5_VL has different position ids for the grids
 
 
 
897
  # So we expand the inv_freq to shape (3, ...)
898
  inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
899
  position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions)
900
+
901
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
902
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
 
903
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
904
  emb = torch.cat((freqs, freqs), dim=-1)
905
+ cos = emb.cos() * self.attention_scaling
906
+ sin = emb.sin() * self.attention_scaling
 
 
 
 
907
 
908
  return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
909
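For readers tracking the shape gymnastics in `forward` above, a standalone toy sketch (plain torch, illustrative sizes) of how the `(3, ...)` expansion yields one cos/sin band per rotary axis:

```python
# Toy shape walk-through of the 3-axis ("mrope") rotary embedding above.
import torch

bs, seq, head_dim = 2, 7, 128
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))  # (head_dim/2,)
position_ids = torch.arange(seq).expand(3, bs, seq)  # (3, bs, seq): temporal / height / width

inv_freq_expanded = inv_freq[None, None, :, None].expand(3, bs, -1, 1)   # (3, bs, head_dim/2, 1)
position_ids_expanded = position_ids[:, :, None, :].float()              # (3, bs, 1, seq)
freqs = (inv_freq_expanded @ position_ids_expanded).transpose(2, 3)      # (3, bs, seq, head_dim/2)
emb = torch.cat((freqs, freqs), dim=-1)
print(emb.cos().shape)  # torch.Size([3, 2, 7, 128]) -> one band per axis, split later by mrope_section
```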
 
 
920
  self.act_fn = ACT2FN[config.hidden_act]
921
 
922
  def forward(self, x, task_label: Union[str, List[str]]):
923
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x, task_label)) * self.up_proj(x, task_label), task_label)
924
  return down_proj
925
 
926
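The extra `task_label` argument threaded through `gate_proj`/`up_proj`/`down_proj` above exists because these projections are swapped for a multi-adapter LoRA linear elsewhere in this repository. The snippet below is only a hypothetical sketch of that calling convention; the class name `TaskRoutedLinear` is made up and the real module may differ in detail.

```python
# Hypothetical sketch of a linear layer that accepts (x, task_label) and selects one
# low-rank adapter per task; NOT the repository's actual implementation.
from typing import List, Union

import torch
import torch.nn as nn


class TaskRoutedLinear(nn.Module):
    def __init__(self, in_features: int, out_features: int, tasks: List[str], rank: int = 8):
        super().__init__()
        self.base = nn.Linear(in_features, out_features, bias=False)
        self.lora_a = nn.ModuleDict({t: nn.Linear(in_features, rank, bias=False) for t in tasks})
        self.lora_b = nn.ModuleDict({t: nn.Linear(rank, out_features, bias=False) for t in tasks})

    def forward(self, x: torch.Tensor, task_label: Union[str, List[str]]) -> torch.Tensor:
        y = self.base(x)
        if isinstance(task_label, str):  # whole batch shares one adapter
            return y + self.lora_b[task_label](self.lora_a[task_label](x))
        rows = [y[i] + self.lora_b[t](self.lora_a[t](x[i])) for i, t in enumerate(task_label)]
        return torch.stack(rows, dim=0)  # per-example adapter selection


layer = TaskRoutedLinear(16, 32, tasks=["retrieval", "text-matching", "code"])
print(layer(torch.randn(2, 5, 16), task_label=["retrieval", "code"]).shape)  # torch.Size([2, 5, 32])
```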
 
 
930
  Explanation:
931
  Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
932
  sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
933
+ vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately.
934
  Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
935
  For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
936
  height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
 
987
  and "Generating Long Sequences with Sparse Transformers".
988
  """
989
 
990
+ def __init__(self, config: Qwen2_5_VLTextConfig, layer_idx: Optional[int] = None):
991
  super().__init__()
992
  self.config = config
993
  self.layer_idx = layer_idx
 
1033
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1034
  bsz, q_len, _ = hidden_states.size()
1035
 
1036
+ query_states = self.q_proj(hidden_states, task_label)
1037
+ key_states = self.k_proj(hidden_states, task_label)
1038
+ value_states = self.v_proj(hidden_states, task_label)
1039
 
1040
  query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1041
  key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
 
1079
  attn_output = attn_output.transpose(1, 2).contiguous()
1080
  attn_output = attn_output.reshape(bsz, q_len, -1)
1081
 
1082
+ attn_output = self.o_proj(attn_output, task_label)
1083
 
1084
  if not output_attentions:
1085
  attn_weights = None
 
1098
 
1099
  def __init__(self, *args, **kwargs):
1100
  super().__init__(*args, **kwargs)
1101
+
1102
  # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
1103
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
1104
  # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
1105
+ self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
1106
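To make the alignment difference described in the comment above concrete, a tiny standalone illustration (1 = attended) for q_len=2 new queries attending over k_len=5 cached-plus-new keys:

```python
# Bottom-right vs. top-left aligned causal masks for q_len != k_len (decode with a KV cache).
import torch

q_len, k_len = 2, 5
i = torch.arange(q_len)[:, None]  # query index within the new chunk
j = torch.arange(k_len)[None, :]  # key index over cached + new tokens

bottom_right = (j <= i + (k_len - q_len)).int()  # flash_attn >= 2.1: queries sit at the end
top_left = (j <= i).int()                        # flash_attn < 2.1 default: wrong for cached keys
print(bottom_right)  # tensor([[1, 1, 1, 1, 0],
                     #         [1, 1, 1, 1, 1]])
print(top_left)      # tensor([[1, 0, 0, 0, 0],
                     #         [1, 1, 0, 0, 0]])
```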
 
1107
  def forward(
1108
  self,
 
1118
  ):
1119
  bsz, q_len, _ = hidden_states.size()
1120
 
1121
+ query_states = self.q_proj(hidden_states, task_label)
1122
+ key_states = self.k_proj(hidden_states, task_label)
1123
+ value_states = self.v_proj(hidden_states, task_label)
1124
 
1125
  query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1126
  key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
 
1190
  use_top_left_mask=self._flash_attn_uses_top_left_mask,
1191
  )
1192
 
1193
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
1194
+ attn_output = self.o_proj(attn_output, task_label)
1195
 
1196
  if not output_attentions:
1197
  attn_weights = None
 
1226
  'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
1227
  )
1228
  return super().forward(
1229
+ task_label=task_label,
1230
  hidden_states=hidden_states,
1231
  attention_mask=attention_mask,
1232
  position_ids=position_ids,
 
1239
 
1240
  bsz, q_len, _ = hidden_states.size()
1241
 
1242
+ query_states = self.q_proj(hidden_states, task_label)
1243
+ key_states = self.k_proj(hidden_states, task_label)
1244
+ value_states = self.v_proj(hidden_states, task_label)
1245
 
1246
  query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1247
  key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
 
1285
  )
1286
 
1287
  attn_output = attn_output.transpose(1, 2).contiguous()
1288
+ attn_output = attn_output.view(bsz, q_len, -1)
1289
 
1290
+ attn_output = self.o_proj(attn_output, task_label)
1291
 
1292
  return attn_output, None, past_key_value
1293
 
 
1300
 
1301
 
1302
  class Qwen2_5_VLDecoderLayer(nn.Module):
1303
+ def __init__(self, config: Qwen2_5_VLTextConfig, layer_idx: int):
1304
  super().__init__()
1305
  self.hidden_size = config.hidden_size
1306
 
 
1371
  # Fully Connected
1372
  residual = hidden_states
1373
  hidden_states = self.post_attention_layernorm(hidden_states)
1374
+ hidden_states = self.mlp(hidden_states, task_label)
1375
  hidden_states = residual + hidden_states
1376
 
1377
  outputs = (hidden_states,)
 
1385
  return outputs
1386
 
1387
 
1388
+ @auto_docstring
1389
+ class Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel):
1390
+ config_class = Qwen2_5_VLTextConfig
1391
+
1392
+ def __init__(self, config: Qwen2_5_VLTextConfig):
 
1393
  super().__init__(config)
1394
  self.padding_idx = config.pad_token_id
1395
  self.vocab_size = config.vocab_size
 
1412
  def set_input_embeddings(self, value):
1413
  self.embed_tokens = value
1414
 
1415
+ @auto_docstring
1416
  def forward(
1417
  self,
1418
  task_label: Union[str, List[str]],
1419
+ input_ids: Optional[torch.LongTensor] = None,
1420
  attention_mask: Optional[torch.Tensor] = None,
1421
  position_ids: Optional[torch.LongTensor] = None,
1422
  past_key_values: Optional[List[torch.FloatTensor]] = None,
 
1427
  return_dict: Optional[bool] = None,
1428
  cache_position: Optional[torch.LongTensor] = None,
1429
  ) -> Union[Tuple, BaseModelOutputWithPast]:
1430
+ """
1431
+ Args:
1432
+ task_label (`Union[str, List[str]]`):
1433
+ Task adapter to use for computing embeddings. If string, all batch examples use the same adapter.
1434
+ If list of strings, each example uses its corresponding adapter. Must be one of the supported
1435
+ task names (e.g., 'retrieval', 'text-matching', 'code').
1436
+ """
1437
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1438
  output_hidden_states = (
1439
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
1492
  if self.gradient_checkpointing and self.training:
1493
  layer_outputs = self._gradient_checkpointing_func(
1494
  decoder_layer.__call__,
1495
+ task_label,
1496
  hidden_states,
1497
  causal_mask,
1498
  position_ids,
 
1542
 
1543
  def _update_causal_mask(
1544
  self,
1545
+ attention_mask: Union[torch.Tensor, "BlockMask"],
1546
  input_tensor: torch.Tensor,
1547
  cache_position: torch.Tensor,
1548
  past_key_values: Cache,
1549
+ output_attentions: bool = False,
1550
  ):
1551
  if self.config._attn_implementation == "flash_attention_2":
1552
  if attention_mask is not None and past_key_values is not None:
 
1560
  if attention_mask is not None and 0.0 in attention_mask:
1561
  return attention_mask
1562
  return None
1563
+ if self.config._attn_implementation == "flex_attention":
1564
+ if isinstance(attention_mask, torch.Tensor):
1565
+ attention_mask = make_flex_block_causal_mask(attention_mask)
1566
+ return attention_mask
1567
 
1568
  # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
1569
  # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
 
1587
  ):
1588
  return None
1589
 
1590
+ dtype = input_tensor.dtype
1591
  min_dtype = torch.finfo(dtype).min
1592
  sequence_length = input_tensor.shape[1]
1593
  # SlidingWindowCache or StaticCache
 
1607
  sequence_length=sequence_length,
1608
  target_length=target_length,
1609
  dtype=dtype,
 
1610
  cache_position=cache_position,
1611
  batch_size=input_tensor.shape[0],
1612
  config=self.config,
 
1616
  if (
1617
  self.config._attn_implementation == "sdpa"
1618
  and attention_mask is not None
1619
+ and attention_mask.device.type in ["cuda", "xpu", "npu"]
1620
  and not output_attentions
1621
  ):
1622
  # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
 
1632
  sequence_length: int,
1633
  target_length: int,
1634
  dtype: torch.dtype,
 
1635
  cache_position: torch.Tensor,
1636
  batch_size: int,
1637
  config: Qwen2_5_VLConfig,
 
1650
  The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
1651
  dtype (`torch.dtype`):
1652
  The dtype to use for the 4D attention mask.
 
 
1653
  cache_position (`torch.Tensor`):
1654
  Indices depicting the position of the input sequence tokens in the sequence.
1655
  batch_size (`torch.Tensor`):
 
1665
  else:
1666
  min_dtype = torch.finfo(dtype).min
1667
  causal_mask = torch.full(
1668
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
1669
  )
1670
+ diagonal_attend_mask = torch.arange(target_length, device=cache_position.device) > cache_position.reshape(
1671
+ -1, 1
1672
+ )
1673
+ text_config = config.get_text_config()
1674
+ if getattr(text_config, "use_sliding_window", True) and text_config.sliding_window is not None:
1675
  # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
1676
  # the check is needed to verify is current checkpoint was trained with sliding window or not
1677
  if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
1678
+ sliding_attend_mask = torch.arange(target_length, device=cache_position.device) <= (
1679
+ cache_position.reshape(-1, 1) - text_config.sliding_window
1680
  )
1681
  diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
1682
  causal_mask *= diagonal_attend_mask
 
1696
  return causal_mask
1697
 
1698
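The sliding-window branch a few lines above combines two boolean conditions before scaling by `min_dtype`; a toy rendering (1 = masked out) for a 6-token prefill with an assumed `sliding_window=3`:

```python
# Toy illustration of the causal + sliding-window masking combined above.
import torch

target_length, sliding_window = 6, 3
cache_position = torch.arange(target_length)  # query positions 0..5 (plain prefill)
arange = torch.arange(target_length)

future = arange > cache_position.reshape(-1, 1)                       # strictly causal part
too_old = arange <= (cache_position.reshape(-1, 1) - sliding_window)  # beyond the sliding window
print((future | too_old).int())
# tensor([[0, 1, 1, 1, 1, 1],
#         [0, 0, 1, 1, 1, 1],
#         [0, 0, 0, 1, 1, 1],
#         [1, 0, 0, 0, 1, 1],
#         [1, 1, 0, 0, 0, 1],
#         [1, 1, 1, 0, 0, 0]])  # each query sees at most the last 3 positions
```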
 
1699
+ @auto_docstring
1700
+ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
1701
+ base_model_prefix = ""
1702
+ _checkpoint_conversion_mapping = {"^model": "language_model"}
 
1703
  config_class = Qwen2_5_VLConfig
1704
  _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]
1705
 
1706
  def __init__(self, config):
1707
  super().__init__(config)
1708
  self.visual = Qwen2_5_VisionTransformerPretrainedModel._from_config(config.vision_config)
1709
+ self.language_model = Qwen2_5_VLTextModel._from_config(config.text_config)
 
 
1710
  self.rope_deltas = None # cache rope_deltas here
1711
 
1712
  # Initialize weights and apply final processing
1713
  self.post_init()
1714
 
1715
  def get_input_embeddings(self):
1716
+ return self.language_model.get_input_embeddings()
1717
 
1718
  def set_input_embeddings(self, value):
1719
+ self.language_model.set_input_embeddings(value)
 
 
 
 
 
 
 
 
 
 
 
 
1720
 
1721
  def get_rope_index(
1722
  self,
 
1740
  width position_ids: [0, 1, 2, 3, 4]
1741
 
1742
  For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
1743
+ and 1D rotary position embedding for text part.
1744
  Examples:
1745
  Temporal (Time): 3 patches, representing different segments of the video in time.
1746
  Height: 2 patches, dividing each frame vertically.
 
1854
  range_tensor = torch.arange(llm_grid_t).view(-1, 1)
1855
  expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
1856
 
1857
+ # Normalize the dtype and move to the same device as range_tensor.
1858
+ second_per_grid_t = torch.as_tensor(
1859
+ second_per_grid_t, dtype=range_tensor.dtype, device=range_tensor.device
1860
+ )
1861
+
1862
  time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
1863
 
1864
  time_tensor_long = time_tensor.long()
 
1900
 
1901
  return position_ids, mrope_position_deltas
1902
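As a hand-worked companion to the `get_rope_index` docstring above, the three position-id rows for a single-frame 2x2 image (4 vision tokens) followed by 3 text tokens look like this. The numbers are derived by hand to illustrate the scheme, not by calling the method:

```python
# Illustrative 3-axis position ids: vision tokens get (t, h, w) indices, text tokens
# continue with identical 1D indices on all three axes.
import torch

t, h, w = 1, 2, 2
temporal = torch.arange(t).repeat_interleave(h * w)      # [0, 0, 0, 0]
height = torch.arange(h).repeat_interleave(w).repeat(t)  # [0, 0, 1, 1]
width = torch.arange(w).repeat(t * h)                    # [0, 1, 0, 1]

text_start = int(torch.stack([temporal, height, width]).max()) + 1
text = torch.arange(text_start, text_start + 3)          # [2, 3, 4] on every axis

position_ids = torch.stack([torch.cat([axis, text]) for axis in (temporal, height, width)])
print(position_ids)
# tensor([[0, 0, 0, 0, 2, 3, 4],
#         [0, 0, 1, 1, 2, 3, 4],
#         [0, 1, 0, 1, 2, 3, 4]])
```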
 
1903
+ def get_video_features(
1904
+ self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
1905
+ ):
1906
+ """
1907
+ Encodes videos into continuous embeddings that can be forwarded to the language model.
1908
+
1909
+ Args:
1910
+ pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1911
+ The tensors corresponding to the input videos.
1912
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1913
+ The temporal, height and width of feature shape of each video in LLM.
1914
+ """
1915
+ pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
1916
+ video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
1917
+ return video_embeds
1918
+
1919
+ def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
1920
+ """
1921
+ Encodes images into continuous embeddings that can be forwarded to the language model.
1922
+
1923
+ Args:
1924
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1925
+ The tensors corresponding to the input images.
1926
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1927
+ The temporal, height and width of feature shape of each image in LLM.
1928
+ """
1929
+ pixel_values = pixel_values.type(self.visual.dtype)
1930
+ image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
1931
+ return image_embeds
1932
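A quick count sketch for the two helpers above: for this model family the processor hands over `pixel_values` already flattened into patch rows, and the vision tower returns one embedding per merged patch group. No model is instantiated here and the grid size is illustrative.

```python
# Rough bookkeeping: rows of pixel_values vs. embeddings returned by get_image_features.
import torch

image_grid_thw = torch.tensor([[1, 36, 36]])  # one image: 1 frame, 36x36 patches
merge_size = 2                                # assumed spatial merge factor of the vision tower

n_patch_rows = int(image_grid_thw.prod())          # 1296 rows in pixel_values
n_image_embeds = n_patch_rows // merge_size**2     # 324 embeddings
print(n_patch_rows, n_image_embeds)
# 1296 324 -> forward() checks that exactly this many image tokens appear in input_ids
```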
+
1933
+ @auto_docstring
1934
  def forward(
1935
  self,
1936
  task_label: Union[str, List[str]],
 
1939
  position_ids: Optional[torch.LongTensor] = None,
1940
  past_key_values: Optional[List[torch.FloatTensor]] = None,
1941
  inputs_embeds: Optional[torch.FloatTensor] = None,
 
1942
  use_cache: Optional[bool] = None,
1943
  output_attentions: Optional[bool] = None,
1944
  output_hidden_states: Optional[bool] = None,
 
1950
  rope_deltas: Optional[torch.LongTensor] = None,
1951
  cache_position: Optional[torch.LongTensor] = None,
1952
  second_per_grid_ts: Optional[torch.Tensor] = None,
1953
+ ) -> Union[Tuple, Qwen2_5_VLModelOutputWithPast]:
1954
  r"""
1955
+ task_label (`Union[str, List[str]]`):
1956
+ Task adapter to use for computing embeddings. If string, all batch examples use the same adapter.
1957
+ If list of strings, each example uses its corresponding adapter. Must be one of the supported
1958
+ task names (e.g., 'retrieval', 'text-matching', 'code').
1959
+ pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)):
1960
+ The tensors corresponding to the input videos. Pixel values can be obtained using
1961
+ [`AutoImageProcessor`]. See [`Qwen2_5_VLImageProcessor.__call__`] for details. [`Qwen2_5_VLProcessor`] uses
1962
+ [`Qwen2_5_VLImageProcessor`] for processing videos.
1963
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1964
+ The temporal, height and width of feature shape of each image in LLM.
1965
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1966
+ The temporal, height and width of feature shape of each video in LLM.
1967
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
1968
+ The rope index difference between sequence length and multimodal rope.
1969
+ second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
1970
+ The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
1971
+ """
 
 
 
1972
 
1973
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1974
  output_hidden_states = (
 
1977
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1978
 
1979
  if inputs_embeds is None:
1980
+ inputs_embeds = self.get_input_embeddings()(input_ids)
1981
  if pixel_values is not None:
1982
+ image_embeds = self.get_image_features(pixel_values, image_grid_thw)
 
1983
  n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
1984
  n_image_features = image_embeds.shape[0]
1985
  if n_image_tokens != n_image_features:
 
1996
  inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
1997
 
1998
  if pixel_values_videos is not None:
1999
+ video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
 
2000
  n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
2001
  n_video_features = video_embeds.shape[0]
2002
  if n_video_tokens != n_video_features:
 
2046
  position_ids = position_ids.add(delta)
2047
  position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
2048
 
2049
+ outputs = self.language_model(
2050
  task_label=task_label,
2051
  input_ids=None,
2052
  position_ids=position_ids,
 
2056
  use_cache=use_cache,
2057
  output_attentions=output_attentions,
2058
  output_hidden_states=output_hidden_states,
2059
+ return_dict=True,
2060
+ cache_position=cache_position,
2061
+ )
2062
+
2063
+ output = Qwen2_5_VLModelOutputWithPast(
2064
+ last_hidden_state=outputs.last_hidden_state,
2065
+ past_key_values=outputs.past_key_values,
2066
+ hidden_states=outputs.hidden_states,
2067
+ attentions=outputs.attentions,
2068
+ rope_deltas=self.rope_deltas,
2069
+ )
2070
+ return output if return_dict else output.to_tuple()
2071
+
2072
+
2073
+ @dataclass
2074
+ class Qwen2_5_VLCausalLMOutputWithPast(ModelOutput):
2075
+ """
2076
+ Base class for Qwen2_5_VL causal language model (or autoregressive) outputs.
2077
+
2078
+ Args:
2079
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
2080
+ Language modeling loss (for next-token prediction).
2081
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
2082
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
2083
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
2084
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
2085
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
2086
+
2087
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
2088
+ `past_key_values` input) to speed up sequential decoding.
2089
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
2090
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
2091
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
2092
+
2093
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
2094
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
2095
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
2096
+ sequence_length)`.
2097
+
2098
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
2099
+ heads.
2100
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
2101
+ The rope index difference between sequence length and multimodal rope.
2102
+ """
2103
+
2104
+ loss: Optional[torch.FloatTensor] = None
2105
+ logits: Optional[torch.FloatTensor] = None
2106
+ past_key_values: Optional[List[torch.FloatTensor]] = None
2107
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
2108
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
2109
+ rope_deltas: Optional[torch.LongTensor] = None
2110
+
2111
+
2112
+ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMixin):
2113
+ _checkpoint_conversion_mapping = {
2114
+ "^visual": "model.visual",
2115
+ r"^model(?!\.(language_model|visual))": "model.language_model",
2116
+ }
2117
+ _tied_weights_keys = ["lm_head.weight"]
2118
+
2119
+ def __init__(self, config):
2120
+ super().__init__(config)
2121
+ self.model = Qwen2_5_VLModel(config)
2122
+ self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
2123
+
2124
+ self.post_init()
2125
+
2126
+ def get_input_embeddings(self):
2127
+ return self.model.get_input_embeddings()
2128
+
2129
+ def set_input_embeddings(self, value):
2130
+ self.model.set_input_embeddings(value)
2131
+
2132
+ def get_output_embeddings(self):
2133
+ return self.lm_head
2134
+
2135
+ def set_output_embeddings(self, new_embeddings):
2136
+ self.lm_head = new_embeddings
2137
+
2138
+ def set_decoder(self, decoder):
2139
+ self.model = decoder
2140
+
2141
+ def get_decoder(self):
2142
+ return self.model
2143
+
2144
+ # Make modules available through the conditional-generation class for BC
2145
+ @property
2146
+ def language_model(self):
2147
+ return self.model.language_model
2148
+
2149
+ @property
2150
+ def visual(self):
2151
+ return self.model.visual
2152
+
2153
+ @can_return_tuple
2154
+ @auto_docstring
2155
+ def forward(
2156
+ self,
2157
+ task_label: Union[str, List[str]],
2158
+ input_ids: torch.LongTensor = None,
2159
+ attention_mask: Optional[torch.Tensor] = None,
2160
+ position_ids: Optional[torch.LongTensor] = None,
2161
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
2162
+ inputs_embeds: Optional[torch.FloatTensor] = None,
2163
+ labels: Optional[torch.LongTensor] = None,
2164
+ use_cache: Optional[bool] = None,
2165
+ output_attentions: Optional[bool] = None,
2166
+ output_hidden_states: Optional[bool] = None,
2167
+ return_dict: Optional[bool] = None,
2168
+ pixel_values: Optional[torch.Tensor] = None,
2169
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
2170
+ image_grid_thw: Optional[torch.LongTensor] = None,
2171
+ video_grid_thw: Optional[torch.LongTensor] = None,
2172
+ rope_deltas: Optional[torch.LongTensor] = None,
2173
+ cache_position: Optional[torch.LongTensor] = None,
2174
+ second_per_grid_ts: Optional[torch.Tensor] = None,
2175
+ ) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
2176
+ r"""
2177
+ task_label (`Union[str, List[str]]`):
2178
+ Task adapter to use for computing embeddings. If string, all batch examples use the same adapter.
2179
+ If list of strings, each example uses its corresponding adapter. Must be one of the supported
2180
+ task names (e.g., 'retrieval', 'text-matching', 'code').
2181
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
2182
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
2183
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
2184
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
2185
+ pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)):
2186
+ The tensors corresponding to the input videos. Pixel values can be obtained using
2187
+ [`AutoImageProcessor`]. See [`Qwen2_5_VLImageProcessor.__call__`] for details. [`Qwen2_5_VLProcessor`] uses
2188
+ [`Qwen2_5_VLImageProcessor`] for processing videos.
2189
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
2190
+ The temporal, height and width of feature shape of each image in LLM.
2191
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
2192
+ The temporal, height and width of feature shape of each video in LLM.
2193
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
2194
+ The rope index difference between sequence length and multimodal rope.
2195
+ second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
2196
+ The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
2197
+
2198
+ Example:
2199
+
2200
+ ```python
2201
+ >>> from PIL import Image
2202
+ >>> import requests
2203
+ >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
2204
+
2205
+ >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
2206
+ >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
2207
+
2208
+ >>> messages = [
2209
+ {
2210
+ "role": "user",
2211
+ "content": [
2212
+ {"type": "image"},
2213
+ {"type": "text", "text": "What is shown in this image?"},
2214
+ ],
2215
+ },
2216
+ ]
2217
+ >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
2218
+ >>> image = Image.open(requests.get(url, stream=True).raw)
2219
+
2220
+ >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
2221
+ >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
2222
+
2223
+ >>> # Generate
2224
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
2225
+ >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
2226
+ "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
2227
+ ```"""
2228
+
2229
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
2230
+ output_hidden_states = (
2231
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
2232
+ )
2233
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2234
+
2235
+ outputs = self.model(
2236
+ task_label=task_label,
2237
+ input_ids=input_ids,
2238
+ pixel_values=pixel_values,
2239
+ pixel_values_videos=pixel_values_videos,
2240
+ image_grid_thw=image_grid_thw,
2241
+ video_grid_thw=video_grid_thw,
2242
+ second_per_grid_ts=second_per_grid_ts,
2243
+ position_ids=position_ids,
2244
+ attention_mask=attention_mask,
2245
+ past_key_values=past_key_values,
2246
+ inputs_embeds=inputs_embeds,
2247
+ use_cache=use_cache,
2248
+ output_attentions=output_attentions,
2249
+ output_hidden_states=output_hidden_states,
2250
  return_dict=return_dict,
2251
  cache_position=cache_position,
2252
  )
 
2256
 
2257
  loss = None
2258
  if labels is not None:
2259
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)
 
 
 
 
 
 
 
 
 
 
 
2260
 
2261
  if not return_dict:
2262
  output = (logits,) + outputs[1:]
 
2268
  past_key_values=outputs.past_key_values,
2269
  hidden_states=outputs.hidden_states,
2270
  attentions=outputs.attentions,
2271
+ rope_deltas=outputs.rope_deltas,
2272
  )
2273
 
2274
  def prepare_inputs_for_generation(
 
2436
 
2437
  return input_ids, model_kwargs
2438
 
2439
+ @staticmethod
2440
+ def _prepare_4d_causal_attention_mask_with_cache_position(
2441
+ attention_mask: torch.Tensor,
2442
+ sequence_length: int,
2443
+ target_length: int,
2444
+ dtype: torch.dtype,
2445
+ cache_position: torch.Tensor,
2446
+ batch_size: int,
2447
+ **kwargs,
2448
+ ):
2449
+ """
2450
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
2451
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
2452
+
2453
+ Args:
2454
+ attention_mask (`torch.Tensor`):
2455
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
2456
+ `(batch_size, 1, query_length, key_value_length)`.
2457
+ sequence_length (`int`):
2458
+ The sequence length being processed.
2459
+ target_length (`int`):
2460
+ The target length: when generating with static cache, the mask should be as long as the static cache,
2461
+ to account for the 0 padding, the part of the cache that is not filled yet.
2462
+ dtype (`torch.dtype`):
2463
+ The dtype to use for the 4D attention mask.
2464
+ cache_position (`torch.Tensor`):
2465
+ Indices depicting the position of the input sequence tokens in the sequence.
2466
+ batch_size (`torch.Tensor`):
2467
+ Batch size.
2468
+ """
2469
+ if attention_mask is not None and attention_mask.dim() == 4:
2470
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
2471
+ causal_mask = attention_mask
2472
+ else:
2473
+ min_dtype = torch.finfo(dtype).min
2474
+ causal_mask = torch.full(
2475
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
2476
+ )
2477
+ if sequence_length != 1:
2478
+ causal_mask = torch.triu(causal_mask, diagonal=1)
2479
+ causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
2480
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
2481
+ if attention_mask is not None:
2482
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
2483
+ mask_length = attention_mask.shape[-1]
2484
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
2485
+ causal_mask.device
2486
+ )
2487
+ padding_mask = padding_mask == 0
2488
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
2489
+ padding_mask, min_dtype
2490
+ )
2491
+
2492
+ return causal_mask
2493
+
2494
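Below is a standalone sketch of the 2D-to-4D conversion this helper's docstring describes, simplified to shapes and the padding fill. The `triu` step is omitted here because `cache_position` already enforces causality in this toy case; values are illustrative.

```python
# 2 new queries (cache positions 2 and 3) over a 4-slot cache; the last key is padding.
import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])  # (batch, key_len)
sequence_length, target_length = 2, 4
cache_position = torch.tensor([2, 3])
dtype = torch.float32
min_dtype = torch.finfo(dtype).min

causal = torch.full((sequence_length, target_length), min_dtype, dtype=dtype)
causal *= torch.arange(target_length) > cache_position.reshape(-1, 1)  # keep min_dtype only on future slots
causal = causal[None, None, :, :].expand(1, 1, -1, -1).clone()
padding = (causal + attention_mask[:, None, None, :].to(dtype)) == 0   # padded keys still at 0
causal = causal.masked_fill(padding, min_dtype)
print(causal.shape)         # torch.Size([1, 1, 2, 4])
print((causal == 0).int())  # 1 where attention is allowed: the padded key is blocked for both queries
```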
 
2495
+
2496
+ from typing import List, Optional, Union
2497
 
2498
  from transformers.feature_extraction_utils import BatchFeature
2499
+ from transformers.image_utils import ImageInput
2500
+ from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
2501
  from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
2502
+ from transformers.video_utils import VideoInput
2503
 
2504
 
2505
  class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
2506
  fps: Union[List[float], float]
2507
 
2508
 
2509
+ class Qwen2_5_VLImagesKwargs(ImagesKwargs):
2510
+ min_pixels: Optional[int]
2511
+ max_pixels: Optional[int]
2512
+ patch_size: Optional[int]
2513
+ temporal_patch_size: Optional[int]
2514
+ merge_size: Optional[int]
2515
+
2516
+
2517
  class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
2518
+ images_kwargs: Qwen2_5_VLImagesKwargs
2519
  videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
2520
  _defaults = {
2521
  "text_kwargs": {
 
2535
  The image processor is a required input.
2536
  tokenizer ([`Qwen2TokenizerFast`], *optional*):
2537
  The tokenizer is a required input.
2538
+ video_processor ([`Qwen2_5_VLVideoProcessor`], *optional*):
2539
+ The video processor is a required input.
2540
  chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
2541
  in a chat into a tokenizable string.
2542
  """
2543
 
2544
+ attributes = ["image_processor", "tokenizer", "video_processor"]
2545
  valid_kwargs = ["chat_template"]
2546
 
2547
  image_processor_class = "AutoImageProcessor"
2548
+ video_processor_class = "AutoVideoProcessor"
2549
  tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
2550
 
2551
+ def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
2552
  self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
2553
  self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
2554
+ self.image_token_id = (
2555
+ tokenizer.image_token_id
2556
+ if getattr(tokenizer, "image_token_id", None)
2557
+ else tokenizer.convert_tokens_to_ids(self.image_token)
2558
+ )
2559
+ self.video_token_id = (
2560
+ tokenizer.video_token_id
2561
+ if getattr(tokenizer, "video_token_id", None)
2562
+ else tokenizer.convert_tokens_to_ids(self.video_token)
2563
+ )
2564
+ super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
2565
 
2566
  def __call__(
2567
  self,
 
2612
  tokenizer_init_kwargs=self.tokenizer.init_kwargs,
2613
  **kwargs,
2614
  )
2615
+
2616
+ image_inputs = videos_inputs = {}
2617
  if images is not None:
2618
+ image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
2619
  image_grid_thw = image_inputs["image_grid_thw"]
 
 
 
2620
 
2621
  if videos is not None:
2622
+ videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
2623
  video_grid_thw = videos_inputs["video_grid_thw"]
2624
 
2625
  fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
2626
  if isinstance(fps, (int, float)):
2627
+ second_per_grid_ts = [self.video_processor.temporal_patch_size / fps] * len(video_grid_thw)
2628
  elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
2629
+ second_per_grid_ts = [self.video_processor.temporal_patch_size / tmp for tmp in fps]
2630
  else:
2631
  raise ValueError(
2632
  f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
2633
  )
2634
  videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
2635
 
 
 
 
 
2636
  if not isinstance(text, list):
2637
  text = [text]
2638
 
2639
+ text = text.copy() # below lines change text in-place
2640
+ if images is not None:
2641
  merge_length = self.image_processor.merge_size**2
2642
  index = 0
2643
  for i in range(len(text)):
2644
  while self.image_token in text[i]:
2645
+ num_image_tokens = image_grid_thw[index].prod() // merge_length
2646
+ text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
 
 
 
2647
  index += 1
2648
  text[i] = text[i].replace("<|placeholder|>", self.image_token)
2649
 
2650
+ if videos is not None:
2651
+ merge_length = self.video_processor.merge_size**2
2652
  index = 0
2653
  for i in range(len(text)):
2654
  while self.video_token in text[i]:
2655
+ num_video_tokens = video_grid_thw[index].prod() // merge_length
2656
+ text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
 
 
 
2657
  index += 1
2658
  text[i] = text[i].replace("<|placeholder|>", self.video_token)
2659
 
2660
+ return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
2661
  text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
2662
+ self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
2663
 
2664
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
2665
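To make the bookkeeping in `__call__` concrete, here are worked numbers for the video branch (grid size is illustrative; `merge_size=2` and `temporal_patch_size=2` are assumed defaults):

```python
# One video whose feature grid is 4 x 24 x 24 after patching, sampled at fps=2.0.
import torch

video_grid_thw = torch.tensor([[4, 24, 24]])
merge_size, temporal_patch_size, fps = 2, 2, 2.0

num_video_tokens = int(video_grid_thw[0].prod()) // merge_size**2
second_per_grid_ts = [temporal_patch_size / fps] * len(video_grid_thw)
print(num_video_tokens)    # 576 <|video_pad|> tokens replace the single video placeholder in the text
print(second_per_grid_ts)  # [1.0] -> each temporal grid spans one second of the source video
```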
 
2666
  def batch_decode(self, *args, **kwargs):
2667
  """
 
2689
  or `(sequence_length,)`.
2690
  skip_special_tokens (`bool`, *optional*, defaults to `True`):
2691
  Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
2692
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
2693
  Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
2694
  **kwargs:
2695
  Additional arguments to be passed to the tokenizer's `batch_decode method`.
 
2712
  return names_from_processor + ["second_per_grid_ts"]
2713
 
2714
 
2715
+
2716
+ __all__ = ["Qwen2_5_VLForConditionalGeneration", "Qwen2_5_VLModel", "Qwen2_5_VLTextModel", "Qwen2_5_VLVisionConfig", "Qwen2_5_VLTextConfig", "Qwen2_5_VLPreTrainedModel", "Qwen2_5_VLProcessor", "Qwen2_5_VLConfig"]
2717