Flourish committed
Commit 1f2f3ed · verified · 1 parent: 4948ef8

Delete Ovis2.5-2B/.ipynb_checkpoints

Ovis2.5-2B/.ipynb_checkpoints/added_tokens-checkpoint.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "</think>": 151668,
3
- "</tool_call>": 151658,
4
- "</tool_response>": 151666,
5
- "<think>": 151667,
6
- "<tool_call>": 151657,
7
- "<tool_response>": 151665,
8
- "<|box_end|>": 151649,
9
- "<|box_start|>": 151648,
10
- "<|endoftext|>": 151643,
11
- "<|file_sep|>": 151664,
12
- "<|fim_middle|>": 151660,
13
- "<|fim_pad|>": 151662,
14
- "<|fim_prefix|>": 151659,
15
- "<|fim_suffix|>": 151661,
16
- "<|im_end|>": 151645,
17
- "<|im_start|>": 151644,
18
- "<|image_pad|>": 151655,
19
- "<|object_ref_end|>": 151647,
20
- "<|object_ref_start|>": 151646,
21
- "<|quad_end|>": 151651,
22
- "<|quad_start|>": 151650,
23
- "<|repo_name|>": 151663,
24
- "<|video_pad|>": 151656,
25
- "<|vision_end|>": 151653,
26
- "<|vision_pad|>": 151654,
27
- "<|vision_start|>": 151652
28
- }
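
This checkpoint duplicates the repository's added_tokens.json, which maps the extra special-token strings (chat, tool, FIM, vision and think markers) to the IDs appended after the base Qwen vocabulary. A minimal sketch of checking the mapping (assuming the full Ovis2.5-2B tokenizer files are available; the local path is illustrative):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("Ovis2.5-2B")  # illustrative local path
    # IDs below come from the added_tokens mapping shown above
    assert tok.convert_tokens_to_ids("<|im_end|>") == 151645
    assert tok.convert_tokens_to_ids("</think>") == 151668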
 
Ovis2.5-2B/.ipynb_checkpoints/chat_template-checkpoint.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "chat_template": "{%- for message in messages %}{{- '<|im_start|>' + message.role + '\n'}}{%- if message.role == 'system' or message.role == 'user' %}{%- if message.content is string %}{{- message.content | replace('<image>', '') | replace('<video>', '') }}{%- else %}{%- for item in message.content %}{%- if item.type == 'text' and 'text' in item %}{{- item.text | replace('<image>', '') | replace('<video>', '') }}{%- elif item.type == 'image' %}{{- '<image>'}}{%- elif item.type == 'video' %}{{- '<video>'}}{%- else %}{{- raise_exception('Invalid content type. Supported types for system and user are text, image, video.')}}{%- endif %}{%- if not loop.last %}{{- '\n'}}{%- endif %}{%- endfor %}{%- endif %}{%- elif message.role == 'assistant' %}{%- set content = '' %}{%- if message.content is string %}{%- set content = message.content | replace('<image>', '') | replace('<video>', '') %}{%- else %}{%- for item in message.content %}{%- if item.type == 'text' and 'text' in item %}{%- set content = content ~ (item.text | replace('<image>', '') | replace('<video>', '')) %}{%- else %}{{- raise_exception('Invalid content type. Supported type for assistant is text.')}}{%- endif %}{%- endfor %}{%- endif %}{%- set content = content.split('</think>')[-1].lstrip('\n') %}{{- content }}{%- else %}{{- raise_exception('Invalid role. Supported roles are system, user, assistant.')}}{%- endif %}{{- '<|im_end|>\n'}}{%- endfor %}{%- if add_generation_prompt %}{{- '<|im_start|>assistant\n' }}{%- if enable_thinking is defined and enable_thinking is false %}{{- '<think>\n\n</think>\n\n' }}{%- endif %}{%- endif %}"
3
- }
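
This is the Jinja chat template rendered by apply_chat_template: system/user content may be a string or a list of text/image/video items, and assistant text after '</think>' is kept while the thinking part is stripped. A minimal sketch of how it is consumed (mirroring preprocess_inputs in the modeling file below; the local path is illustrative, and enable_thinking is forwarded to the template by recent transformers versions):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("Ovis2.5-2B")
    messages = [{"role": "user", "content": [
        {"type": "image", "image": "..."},            # rendered as a literal <image> tag
        {"type": "text", "text": "Describe the picture."},
    ]}]
    prompt = tok.apply_chat_template(messages, tokenize=False,
                                     add_generation_prompt=True, enable_thinking=False)
    # '<|im_start|>user\n<image>\nDescribe the picture.<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n'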
 
Ovis2.5-2B/.ipynb_checkpoints/config-checkpoint.json DELETED
@@ -1,73 +0,0 @@
1
- {
2
- "architectures": [
3
- "Ovis2_5"
4
- ],
5
- "auto_map": {
6
- "AutoConfig": "configuration_ovis2_5.Ovis2_5_Config",
7
- "AutoModelForCausalLM": "modeling_ovis2_5.Ovis2_5"
8
- },
9
- "conversation_formatter_class": "Qwen3ConversationFormatter",
10
- "hidden_size": 2048,
11
- "vocab_size": 151936,
12
- "num_attention_heads": 32,
13
- "max_position_embeddings": 40960,
14
- "llm_config": {
15
- "_attn_implementation_autoset": true,
16
- "_name_or_path": "Qwen/Qwen3-1.7B",
17
- "architectures": [
18
- "Qwen3ForCausalLM"
19
- ],
20
- "attention_bias": false,
21
- "attention_dropout": 0.0,
22
- "bos_token_id": 151643,
23
- "eos_token_id": 151645,
24
- "head_dim": 128,
25
- "hidden_act": "silu",
26
- "hidden_size": 2048,
27
- "initializer_range": 0.02,
28
- "intermediate_size": 6144,
29
- "max_position_embeddings": 40960,
30
- "max_window_layers": 28,
31
- "model_type": "qwen3",
32
- "num_attention_heads": 16,
33
- "num_hidden_layers": 28,
34
- "num_key_value_heads": 8,
35
- "rms_norm_eps": 1e-06,
36
- "rope_scaling": null,
37
- "rope_theta": 1000000,
38
- "sliding_window": null,
39
- "tie_word_embeddings": true,
40
- "torch_dtype": "bfloat16",
41
- "use_cache": true,
42
- "use_sliding_window": false,
43
- "vocab_size": 151936
44
- },
45
- "model_type": "ovis2_5",
46
- "torch_dtype": "bfloat16",
47
- "transformers_version": "4.51.3",
48
- "use_cache": true,
49
- "visual_vocab_size": 65536,
50
- "vit_config": {
51
- "_attn_implementation_autoset": true,
52
- "_name_or_path": "google/siglip2-so400m-patch16-512",
53
- "attention_dropout": 0.0,
54
- "fullatt_block_indexes": null,
55
- "hidden_act": "gelu_pytorch_tanh",
56
- "hidden_size": 1152,
57
- "hidden_stride": 2,
58
- "image_size": 512,
59
- "intermediate_size": 4304,
60
- "layer_norm_eps": 1e-06,
61
- "model_type": "siglip2_navit",
62
- "num_attention_heads": 16,
63
- "num_channels": 3,
64
- "num_hidden_layers": 27,
65
- "num_patches": -1,
66
- "patch_size": 16,
67
- "preserve_original_pe": true,
68
- "temporal_patch_size": 1,
69
- "torch_dtype": "bfloat16",
70
- "use_rope": true,
71
- "window_size": 112
72
- }
73
- }
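
This config nests a Qwen3-1.7B language-model config (llm_config) and a SigLIP2-NaViT vision config (vit_config) under the top-level ovis2_5 model type. A minimal sketch of loading and inspecting it (trust_remote_code is needed so the custom configuration_ovis2_5.py is used; the local path is illustrative):

    from transformers import AutoConfig

    cfg = AutoConfig.from_pretrained("Ovis2.5-2B", trust_remote_code=True)
    print(type(cfg).__name__)                       # Ovis2_5_Config
    print(cfg.llm_config.model_type)                # qwen3
    print(cfg.vit_config.model_type)                # siglip2_navit
    print(cfg.hidden_size, cfg.visual_vocab_size)   # 2048 65536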
 
Ovis2.5-2B/.ipynb_checkpoints/configuration_ovis2_5-checkpoint.py DELETED
@@ -1,96 +0,0 @@
1
- from typing import Any, Optional, List, Union
2
-
3
- from transformers import Qwen3Config
4
- from transformers.configuration_utils import PretrainedConfig
5
-
6
- __all__ = ["Siglip2NavitConfig", "Ovis2_5_Config"]
7
-
8
-
9
- class Siglip2NavitConfig(PretrainedConfig):
10
- """This is the configuration class to store the configuration of an [`AIMv2Model`].
11
-
12
- Instantiating a configuration with the defaults will yield a similar configuration
13
- to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224).
14
-
15
- Args:
16
- hidden_size: Dimension of the hidden representations.
17
- intermediate_size: Dimension of the SwiGLU representations.
18
- num_hidden_layers: Number of hidden layers in the Transformer.
19
- num_attention_heads: Number of attention heads for each attention layer
20
- in the Transformer.
21
- num_channels: Number of input channels.
22
- image_size: Image size.
23
- patch_size: Patch size.
24
- rms_norm_eps: Epsilon value used for the RMS normalization layer.
25
- attention_dropout: Dropout ratio for attention probabilities.
26
- projection_dropout: Dropout ratio for the projection layer after the attention.
27
- qkv_bias: Whether to add a bias to the queries, keys and values.
28
- use_bias: Whether to add a bias in the feed-forward and projection layers.
29
- kwargs: Keyword arguments for the [`PretrainedConfig`].
30
- """
31
-
32
- model_type: str = "siglip2_navit"
33
-
34
- def __init__(
35
- self,
36
- hidden_size: int = 1024,
37
- intermediate_size: int = 4096,
38
- num_hidden_layers: int = 24,
39
- num_attention_heads: int = 16,
40
- num_channels: int = 3,
41
- num_patches: int = -1,
42
- image_size: int = 512,
43
- patch_size: int = 16,
44
- hidden_act: str="gelu_pytorch_tanh",
45
- layer_norm_eps: float = 1e-6,
46
- attention_dropout: float = 0.0,
47
- hidden_stride: int = 2,
48
- window_size: int = 112,
49
- fullatt_block_indexes: Optional[list] = None,
50
- temporal_patch_size: int = 1,
51
- preserve_original_pe: bool = True,
52
- use_rope: bool = True,
53
- **kwargs: Any,
54
- ):
55
- super().__init__(**kwargs)
56
- self.hidden_size = hidden_size
57
- self.intermediate_size = intermediate_size
58
- self.num_hidden_layers = num_hidden_layers
59
- self.num_attention_heads = num_attention_heads
60
- self.num_channels = num_channels
61
- self.num_patches = num_patches
62
- self.patch_size = patch_size
63
- self.image_size = image_size
64
- self.hidden_act = hidden_act
65
- self.attention_dropout = attention_dropout
66
- self.layer_norm_eps = layer_norm_eps
67
- self.hidden_stride = hidden_stride
68
- self.window_size = window_size
69
- self.fullatt_block_indexes = fullatt_block_indexes
70
- self.temporal_patch_size = temporal_patch_size
71
- self.preserve_original_pe = preserve_original_pe
72
- self.use_rope = use_rope
73
-
74
- class Ovis2_5_Config(PretrainedConfig):
75
- model_type = "ovis2_5"
76
- sub_configs = dict(llm_config=Qwen3Config, vit_config=Siglip2NavitConfig)
77
-
78
- def __init__(self,
79
- llm_config: Optional[Union[Qwen3Config, dict]] = None,
80
- vit_config: Optional[Union[Siglip2NavitConfig, dict]] = None,
81
- visual_vocab_size=65536,
82
- hidden_size=None,
83
- **kwargs
84
- ):
85
- super().__init__(**kwargs)
86
- if isinstance(llm_config, dict):
87
- llm_config = Qwen3Config(**llm_config)
88
- self.llm_config = llm_config
89
- if isinstance(vit_config, dict):
90
- vit_config = Siglip2NavitConfig(**vit_config)
91
- self.vit_config = vit_config
92
- self.visual_vocab_size = visual_vocab_size
93
- self.hidden_size = hidden_size
94
- if kwargs.get('attn_implementation'):
95
- self.llm_config._attn_implementation = kwargs['attn_implementation']
96
- self.vit_config._attn_implementation = kwargs['attn_implementation']
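
As the __init__ above shows, Ovis2_5_Config coerces dict sub-configs into Qwen3Config / Siglip2NavitConfig and propagates an attn_implementation keyword to both. A minimal sketch (assuming configuration_ovis2_5.py is importable from the working directory):

    from configuration_ovis2_5 import Ovis2_5_Config

    cfg = Ovis2_5_Config(
        llm_config={"hidden_size": 2048, "num_hidden_layers": 28},
        vit_config={"hidden_size": 1152, "num_hidden_layers": 27},
        hidden_size=2048,
        attn_implementation="flash_attention_2",
    )
    print(type(cfg.llm_config).__name__)         # Qwen3Config
    print(cfg.vit_config._attn_implementation)   # flash_attention_2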
 
Ovis2.5-2B/.ipynb_checkpoints/generation_config-checkpoint.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "bos_token_id": 151643,
3
- "do_sample": true,
4
- "eos_token_id": [
5
- 151645,
6
- 151643
7
- ],
8
- "multimodal_max_length": 8192,
9
- "pad_token_id": 151643,
10
- "repetition_penalty": 1.05,
11
- "temperature": 0.6,
12
- "top_k": 20,
13
- "top_p": 0.95,
14
- "transformers_version": "4.51.3"
15
- }
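
These are the sampling defaults that generate() falls back to when no explicit arguments are passed (multimodal_max_length is a model-specific extra field). A minimal sketch of inspecting them (illustrative local path):

    from transformers import GenerationConfig

    gen = GenerationConfig.from_pretrained("Ovis2.5-2B")
    print(gen.do_sample, gen.temperature, gen.top_p, gen.top_k)   # True 0.6 0.95 20
    print(gen.eos_token_id)                                       # [151645, 151643]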
 
Ovis2.5-2B/.ipynb_checkpoints/modeling_ovis2_5-checkpoint.py DELETED
@@ -1,949 +0,0 @@
1
- import math
2
- from typing import Dict, List, Optional, Tuple, Union
3
-
4
- import PIL.Image
5
- import numpy as np
6
- import torch
7
- from flash_attn import flash_attn_varlen_func
8
- from flash_attn.layers.rotary import apply_rotary_emb
9
- from torch import Tensor, nn
10
- from torch.nn import functional as F
11
- from transformers import (
12
- AutoConfig,
13
- AutoImageProcessor,
14
- AutoModel,
15
- AutoModelForCausalLM,
16
- AutoTokenizer,
17
- )
18
- from transformers.activations import ACT2FN
19
- from transformers.generation.utils import GenerateOutput
20
- from transformers.modeling_outputs import BaseModelOutputWithNoAttention
21
- from transformers.modeling_utils import PreTrainedModel
22
-
23
- from .configuration_ovis2_5 import Siglip2NavitConfig, Ovis2_5_Config
24
-
25
- IMAGE_PLACEHOLDER = "<image>"
26
- IMAGE_PLACEHOLDER_ID = -200
27
- VIDEO_PLACEHOLDER = "<video>"
28
- VIDEO_PLACEHOLDER_ID = -201
29
-
30
- VISUAL_ATOM_ID = -300
31
- INDICATOR_IDS = [-301, -302, -303, -304]
32
-
33
- # copied from qwen2.5-vl
34
- class VisionRotaryEmbedding(nn.Module):
35
- def __init__(self, dim: int, theta: float = 10000.0) -> None:
36
- super().__init__()
37
- inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
38
- self.register_buffer("inv_freq", inv_freq, persistent=False)
39
-
40
- def forward(self, seqlen: int) -> torch.Tensor:
41
- seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
42
- freqs = torch.outer(seq, self.inv_freq)
43
- return freqs
44
-
45
-
46
- class Siglip2VisionEmbeddings(nn.Module):
47
- def __init__(self, config: Siglip2NavitConfig):
48
- super().__init__()
49
- self.config = config
50
- self.embed_dim = config.hidden_size
51
- self.patch_size = config.patch_size
52
- self.image_size = config.image_size
53
- self.num_patches = config.num_patches
54
- self.preserve_original_pe = config.preserve_original_pe
55
- self.hidden_stride = config.hidden_stride
56
-
57
-
58
- # siglip2 naflex
59
- if self.num_patches > 0:
60
- self.patch_embedding = nn.Linear(
61
- in_features=config.num_channels * self.patch_size * self.patch_size,
62
- out_features=self.embed_dim,
63
- )
64
- if self.preserve_original_pe:
65
- self.position_embedding_size = int(self.num_patches**0.5)
66
- self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)
67
-
68
- else:
69
- self.patch_embedding = nn.Conv2d(
70
- in_channels=config.num_channels,
71
- out_channels=self.embed_dim,
72
- kernel_size=self.patch_size,
73
- stride=self.patch_size,
74
- padding="valid",
75
- )
76
- if self.preserve_original_pe:
77
- self.num_patches = (self.image_size // self.patch_size) ** 2
78
- self.position_embedding_size = self.image_size // self.patch_size
79
- self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)
80
-
81
- @staticmethod
82
- def resize_positional_embeddings(
83
- positional_embeddings: torch.Tensor,
84
- spatial_shapes: torch.LongTensor,
85
- max_length: int,
86
- ) -> torch.Tensor:
87
- """
88
- Resize positional embeddings to image-specific size and pad to a fixed size.
89
-
90
- Args:
91
- positional_embeddings (`torch.Tensor`):
92
- Position embeddings of shape (height, width, embed_dim)
93
- spatial_shapes (`torch.LongTensor`):
94
- Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
95
- max_length (`int`):
96
- Maximum length of the positional embeddings to pad resized positional embeddings to
97
-
98
- Returns:
99
- `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
100
- """
101
- batch_size = spatial_shapes.shape[0]
102
- embed_dim = positional_embeddings.shape[-1]
103
- source_dtype = positional_embeddings.dtype
104
-
105
- resulted_positional_embeddings = torch.empty(
106
- (batch_size, max_length, embed_dim),
107
- device=positional_embeddings.device,
108
- dtype=source_dtype,
109
- )
110
-
111
- # (height, width, embed_dim) -> (1, embed_dim, height, width) for interpolation
112
- positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0)
113
-
114
- # Upcast to float32 on CPU because antialias is not supported for bfloat16/float16 on CPU
115
- if positional_embeddings.device.type == "cpu":
116
- positional_embeddings = positional_embeddings.to(torch.float32)
117
-
118
- for i in range(batch_size):
119
- # (1, dim, height, width) -> (1, dim, target_height, target_width)
120
- height, width = spatial_shapes[i]
121
- resized_embeddings = F.interpolate(
122
- positional_embeddings,
123
- size=(height, width),
124
- mode="bilinear",
125
- align_corners=False,
126
- antialias=True,
127
- )
128
-
129
- # (1, dim, target_height, target_width) -> (target_height * target_width, dim)
130
- resized_embeddings = resized_embeddings.reshape(embed_dim, height * width).transpose(0, 1)
131
-
132
- # Cast to original dtype
133
- resized_embeddings = resized_embeddings.to(source_dtype)
134
-
135
- resulted_positional_embeddings[i, : height * width] = resized_embeddings
136
- resulted_positional_embeddings[i, height * width :] = resized_embeddings[0]
137
-
138
- return resulted_positional_embeddings
139
-
140
- def forward(self, pixel_values: torch.FloatTensor,
141
- grid_thws: Optional[torch.LongTensor] = None) -> torch.Tensor:
142
- """
143
- Args:
144
- pixel_values (`torch.FloatTensor`):
145
- Pixel values of shape (num_patches, num_channels * temporal_patch_size * patch_size * patch_size)
146
- grid_thws: (`torch.LongTensor`):
147
- grid shape (num_patches, 3)
148
- """
149
-
150
- # Apply patch embeddings to already patchified pixel values
151
- target_dtype = self.patch_embedding.weight.dtype
152
- if isinstance(self.patch_embedding, nn.Linear):
153
- patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
154
- elif isinstance(self.patch_embedding, nn.Conv2d):
155
- pixel_values = pixel_values.view(-1, self.config.num_channels * self.config.temporal_patch_size, self.patch_size,
156
- self.patch_size)
157
- patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
158
- patch_embeds = patch_embeds.reshape(-1, self.embed_dim)
159
-
160
-
161
- if self.preserve_original_pe:
162
- assert grid_thws is not None
163
- pos_embed_new = torch.zeros_like(patch_embeds)
164
- ori_h = ori_w = self.position_embedding_size
165
- positional_embeddings = self.position_embedding.weight.reshape(
166
- self.position_embedding_size, self.position_embedding_size, -1
167
- ).unsqueeze(0).permute(0,3,1,2)
168
- # pos_embed = self.pos_embed.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2)
169
- cnt = 0
170
- for t, h, w in grid_thws:
171
- thw = t * h * w
172
- pe = F.interpolate(positional_embeddings, size=(h, w), mode='bicubic', align_corners=False)
173
- pe = pe.permute(0, 2, 3, 1).reshape(1, h * w, -1)
174
- pe = pe[0].repeat(t, 1)
175
- pe = pe.reshape(t, h // self.hidden_stride, self.hidden_stride, w // self.hidden_stride,
176
- self.hidden_stride, -1)
177
- pe = pe.permute(0, 1, 3, 2, 4, 5).reshape(thw, -1)
178
- pos_embed_new[cnt:cnt + thw] = pe
179
- cnt += thw
180
- patch_embeds = patch_embeds + pos_embed_new
181
-
182
- return patch_embeds
183
-
184
-
185
- # copied from qwen2.5-vl
186
- def apply_rotary_pos_emb_flashatt(
187
- q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
188
- ) -> Tuple[torch.Tensor, torch.Tensor]:
189
- cos = cos.chunk(2, dim=-1)[0].contiguous()
190
- sin = sin.chunk(2, dim=-1)[0].contiguous()
191
- q_embed = apply_rotary_emb(q.float(), cos.float(), sin.float()).type_as(q)
192
- k_embed = apply_rotary_emb(k.float(), cos.float(), sin.float()).type_as(k)
193
- return q_embed, k_embed
194
-
195
-
196
- class Siglip2Attention(nn.Module):
197
- """Multi-headed attention from 'Attention Is All You Need' paper"""
198
-
199
- def __init__(self, config):
200
- super().__init__()
201
- self.config = config
202
- self.embed_dim = config.hidden_size
203
- self.num_heads = config.num_attention_heads
204
- self.head_dim = self.embed_dim // self.num_heads
205
- if self.head_dim * self.num_heads != self.embed_dim:
206
- raise ValueError(
207
- f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
208
- f" {self.num_heads})."
209
- )
210
- self.scale = self.head_dim**-0.5
211
- self.dropout = config.attention_dropout
212
- self.is_causal = False
213
-
214
- self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
215
- self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
216
- self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
217
- self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
218
-
219
- self.use_rope = config.use_rope
220
-
221
- def forward(
222
- self,
223
- hidden_states: torch.Tensor,
224
- cu_seqlens: torch.Tensor,
225
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
226
- ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
227
- """Input shape: Batch x Time x Channel"""
228
-
229
- seq_length, embed_dim = hidden_states.shape
230
-
231
- queries = self.q_proj(hidden_states)
232
- keys = self.k_proj(hidden_states)
233
- values = self.v_proj(hidden_states)
234
-
235
- queries = queries.view(seq_length, self.num_heads, self.head_dim)
236
- keys = keys.view(seq_length, self.num_heads, self.head_dim)
237
- values = values.view(seq_length, self.num_heads, self.head_dim)
238
-
239
- if self.use_rope:
240
- cos, sin = position_embeddings
241
- queries, keys = apply_rotary_pos_emb_flashatt(queries.unsqueeze(0), keys.unsqueeze(0), cos, sin)
242
- queries = queries.squeeze(0)
243
- keys = keys.squeeze(0)
244
-
245
- max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
246
- attn_output = flash_attn_varlen_func(queries, keys, values, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
247
- seq_length, -1
248
- )
249
- attn_output = self.out_proj(attn_output)
250
- return attn_output
251
-
252
- class Siglip2MLP(nn.Module):
253
- def __init__(self, config):
254
- super().__init__()
255
- self.config = config
256
- self.activation_fn = ACT2FN[config.hidden_act]
257
- self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
258
- self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
259
-
260
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
261
- hidden_states = self.fc1(hidden_states)
262
- hidden_states = self.activation_fn(hidden_states)
263
- hidden_states = self.fc2(hidden_states)
264
- return hidden_states
265
-
266
-
267
- class Siglip2EncoderLayer(nn.Module):
268
- def __init__(self, config: Siglip2NavitConfig):
269
- super().__init__()
270
- self.embed_dim = config.hidden_size
271
- self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
272
- self.self_attn = Siglip2Attention(config)
273
- self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
274
- self.mlp = Siglip2MLP(config)
275
-
276
- def forward(
277
- self,
278
- hidden_states: torch.Tensor,
279
- cu_seqlens: torch.Tensor,
280
- position_embeddings: torch.Tensor
281
- ) -> tuple[torch.FloatTensor]:
282
- """
283
- Args:
284
- hidden_states (`torch.FloatTensor`):
285
- Input to the layer of shape `(batch, seq_len, embed_dim)`.
286
- attention_mask (`torch.FloatTensor`):
287
- Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
288
- output_attentions (`bool`, *optional*, defaults to `False`):
289
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
290
- returned tensors for more detail.
291
- """
292
- residual = hidden_states
293
-
294
- hidden_states = self.layer_norm1(hidden_states)
295
- hidden_states = self.self_attn(
296
- hidden_states=hidden_states,
297
- cu_seqlens=cu_seqlens,
298
- position_embeddings=position_embeddings
299
- )
300
- hidden_states = residual + hidden_states
301
-
302
- residual = hidden_states
303
- hidden_states = self.layer_norm2(hidden_states)
304
- hidden_states = self.mlp(hidden_states)
305
- hidden_states = residual + hidden_states
306
-
307
- return hidden_states
308
-
309
- class Siglip2Encoder(nn.Module):
310
- """
311
- Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
312
- [`Siglip2EncoderLayer`].
313
-
314
- Args:
315
- config: Siglip2NavitConfig
316
- """
317
-
318
- def __init__(self, config: Siglip2NavitConfig):
319
- super().__init__()
320
- self.config = config
321
- self.layers = nn.ModuleList([Siglip2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
322
- self.gradient_checkpointing = False
323
-
324
- self.rotary_pos_emb = VisionRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
325
- self.patch_size = config.patch_size
326
- self.hidden_stride = config.hidden_stride
327
- self.window_size = config.window_size
328
- self.spatial_merge_unit = config.hidden_stride * config.hidden_stride
329
- self.fullatt_block_indexes = None if config.fullatt_block_indexes is None else [int(i) for i in config.fullatt_block_indexes.split('|')]
330
-
331
-
332
- # copied from qwen2.5_vl
333
- def rot_pos_emb(self, grid_thw):
334
- pos_ids = []
335
- for t, h, w in grid_thw:
336
- hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
337
- hpos_ids = hpos_ids.reshape(
338
- h // self.hidden_stride,
339
- self.hidden_stride,
340
- w // self.hidden_stride,
341
- self.hidden_stride,
342
- )
343
- hpos_ids = hpos_ids.permute(0, 2, 1, 3)
344
- hpos_ids = hpos_ids.flatten()
345
-
346
- wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
347
- wpos_ids = wpos_ids.reshape(
348
- h // self.hidden_stride,
349
- self.hidden_stride,
350
- w // self.hidden_stride,
351
- self.hidden_stride,
352
- )
353
- wpos_ids = wpos_ids.permute(0, 2, 1, 3)
354
- wpos_ids = wpos_ids.flatten()
355
- pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
356
- pos_ids = torch.cat(pos_ids, dim=0)
357
- max_grid_size = grid_thw[:, 1:].max()
358
- rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
359
- rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
360
- return rotary_pos_emb
361
-
362
- def get_window_index(self, grid_thw):
363
- window_index: list = []
364
- cu_window_seqlens: list = [0]
365
- window_index_id = 0
366
- vit_merger_window_size = self.window_size // self.hidden_stride // self.patch_size # patch (after merge) number in each window
367
-
368
- for grid_t, grid_h, grid_w in grid_thw:
369
- llm_grid_h, llm_grid_w = (
370
- grid_h // self.hidden_stride, # number of patch after merge
371
- grid_w // self.hidden_stride,
372
- )
373
- index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
374
- pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
375
- pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
376
- num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
377
- num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
378
- index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
379
- index_padded = index_padded.reshape(
380
- grid_t,
381
- num_windows_h,
382
- vit_merger_window_size,
383
- num_windows_w,
384
- vit_merger_window_size,
385
- )
386
- index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
387
- grid_t,
388
- num_windows_h * num_windows_w,
389
- vit_merger_window_size,
390
- vit_merger_window_size,
391
- )
392
- seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
393
- index_padded = index_padded.reshape(-1)
394
- index_new = index_padded[index_padded != -100]
395
- window_index.append(index_new + window_index_id)
396
- cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
397
- cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
398
- window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
399
- window_index = torch.cat(window_index, dim=0)
400
-
401
- return window_index, cu_window_seqlens
402
-
403
- # Ignore copy
404
- def forward(
405
- self,
406
- inputs_embeds,
407
- grid_thws: torch.Tensor,
408
- output_hidden_states: bool = False,
409
- ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, ...]]]:
410
- r"""
411
- Args:
412
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
413
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
414
- This is useful if you want more control over how to convert `input_ids` indices into associated vectors
415
- than the model's internal embedding lookup matrix.
416
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
417
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
418
-
419
- - 1 for tokens that are **not masked**,
420
- - 0 for tokens that are **masked**.
421
-
422
- [What are attention masks?](../glossary#attention-mask)
423
- output_attentions (`bool`, *optional*):
424
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
425
- returned tensors for more detail.
426
- output_hidden_states (`bool`, *optional*):
427
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
428
- for more detail.
429
- return_dict (`bool`, *optional*):
430
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
431
- """
432
-
433
- rotary_pos_emb = self.rot_pos_emb(grid_thws)
434
- window_index, cu_window_seqlens = self.get_window_index(grid_thws)
435
- cu_window_seqlens = torch.tensor(
436
- cu_window_seqlens,
437
- device=inputs_embeds.device,
438
- dtype=grid_thws.dtype if torch.jit.is_tracing() else torch.int32,
439
- )
440
- cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
441
-
442
- seq_len, _ = inputs_embeds.size()
443
- inputs_embeds = inputs_embeds.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
444
- inputs_embeds = inputs_embeds[window_index, :, :]
445
- inputs_embeds = inputs_embeds.reshape(seq_len, -1)
446
- rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
447
- rotary_pos_emb = rotary_pos_emb[window_index, :, :]
448
- rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
449
- emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
450
- position_embeddings = (emb.cos(), emb.sin())
451
-
452
- cu_seqlens = torch.repeat_interleave(grid_thws[:, 1] * grid_thws[:, 2], grid_thws[:, 0]).cumsum(
453
- dim=0,
454
- # Select dtype based on the following factors:
455
- # - FA2 requires that cu_seqlens_q must have dtype int32
456
- # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
457
- # See https://github.com/huggingface/transformers/pull/34852 for more information
458
- dtype=grid_thws.dtype if torch.jit.is_tracing() else torch.int32,
459
- )
460
- cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
461
-
462
- reverse_indices = torch.argsort(window_index)
463
- encoder_states = () if output_hidden_states else None
464
-
465
- hidden_states = inputs_embeds
466
- for index, block in enumerate(self.layers):
467
- if self.fullatt_block_indexes is None or index in self.fullatt_block_indexes:
468
- cu_seqlens_tmp = cu_seqlens
469
- else:
470
- cu_seqlens_tmp = cu_window_seqlens
471
- if self.gradient_checkpointing and self.training:
472
- hidden_states = self._gradient_checkpointing_func(block.__call__, hidden_states, cu_seqlens_tmp, position_embeddings)
473
- else:
474
- hidden_states = block(hidden_states, cu_seqlens_tmp, position_embeddings)
475
- if output_hidden_states:
476
- hidden_states_ = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
477
- encoder_states += (hidden_states_[reverse_indices, :].reshape(seq_len, -1),)
478
- # tokens = self.post_trunk_norm(tokens)
479
- hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
480
- hidden_states = hidden_states[reverse_indices, :].reshape(seq_len, -1)
481
-
482
- return hidden_states, encoder_states
483
-
484
- class Siglip2VisionTransformer(nn.Module):
485
- def __init__(self, config: Siglip2NavitConfig):
486
- super().__init__()
487
- self.config = config
488
- embed_dim = config.hidden_size
489
-
490
- self.embeddings = Siglip2VisionEmbeddings(config)
491
- self.encoder = Siglip2Encoder(config)
492
- self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
493
- self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
494
-
495
- def forward(
496
- self,
497
- pixel_values: torch.FloatTensor,
498
- grid_thws: torch.LongTensor,
499
- output_hidden_states: Optional[bool] = True,
500
- return_dict: Optional[bool] = True,
501
- ) -> Union[
502
- Tuple[torch.Tensor],
503
- Tuple[torch.Tensor, Tuple[torch.Tensor, ...]],
504
- BaseModelOutputWithNoAttention,
505
- ]:
506
- r"""
507
- spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
508
- Tensor containing the spatial dimensions (height, width) of the input images.
509
- """
510
- # output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
511
- # output_hidden_states = (
512
- # output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
513
- # )
514
-
515
- hidden_states = self.embeddings(pixel_values, grid_thws)
516
-
517
- last_hidden_state, hidden_states = self.encoder(hidden_states, grid_thws, output_hidden_states)
518
- last_hidden_state = self.post_layernorm(last_hidden_state)
519
-
520
- if not return_dict:
521
- output = (last_hidden_state,)
522
- output += (hidden_states,) if output_hidden_states else ()
523
- return output
524
-
525
- return BaseModelOutputWithNoAttention(
526
- last_hidden_state=last_hidden_state,
527
- hidden_states=hidden_states
528
- )
529
-
530
- class Siglip2PreTrainedModel(PreTrainedModel):
531
- config_class = Siglip2NavitConfig
532
- base_model_prefix = "siglip2_navit"
533
- supports_gradient_checkpointing = True
534
-
535
- _no_split_modules = [
536
- "Siglip2VisionEmbeddings",
537
- "Siglip2EncoderLayer",
538
- ]
539
- _supports_flash_attn_2 = True
540
- _supports_sdpa = False
541
- _supports_flex_attn = False
542
- _supports_attention_backend = True
543
-
544
-
545
- class Siglip2NavitModel(Siglip2PreTrainedModel):
546
- config_class = Siglip2NavitConfig
547
- main_input_name = "pixel_values"
548
-
549
- def __init__(self, config: Siglip2NavitConfig):
550
- super().__init__(config)
551
-
552
- self.vision_model = Siglip2VisionTransformer(config)
553
-
554
- def get_input_embeddings(self) -> nn.Module:
555
- return self.vision_model.embeddings.patch_embedding
556
-
557
- def forward(
558
- self,
559
- pixel_values: torch.FloatTensor,
560
- grid_thws: torch.LongTensor,
561
- output_hidden_states: Optional[bool] = None,
562
- return_dict: Optional[bool] = None,
563
- ) -> Union[
564
- Tuple[torch.Tensor],
565
- Tuple[torch.Tensor, Tuple[torch.Tensor, ...]],
566
- BaseModelOutputWithNoAttention,
567
- ]:
568
-
569
- if output_hidden_states is None:
570
- output_hidden_states = self.config.output_hidden_states
571
- if return_dict is None:
572
- return_dict = self.config.use_return_dict
573
-
574
- return self.vision_model(
575
- pixel_values=pixel_values,
576
- grid_thws=grid_thws,
577
- output_hidden_states=output_hidden_states,
578
- return_dict=return_dict,
579
- )
580
-
581
- class VisualEmbedding(torch.nn.Embedding):
582
- """
583
- A visual embedding layer that can handle both discrete token IDs (long) and continuous
584
- soft-token probabilities (float).
585
- """
586
-
587
- def forward(self, visual_tokens: Tensor) -> Tensor:
588
- if visual_tokens.dtype in [torch.int8, torch.int16, torch.int32, torch.int64, torch.long]:
589
- return super().forward(visual_tokens)
590
- # Handle soft tokens (probabilities) by matrix multiplication with the embedding weight
591
- return torch.matmul(visual_tokens, self.weight)
592
-
593
-
594
- class VisualTokenizer(torch.nn.Module):
595
- """
596
- Tokenizes images or videos into a sequence of continuous visual tokens.
597
- """
598
-
599
- def __init__(self, vit, visual_vocab_size, image_processor_name_or_path, *args, **kwargs):
600
- super().__init__(*args, **kwargs)
601
- self.vit = vit
602
- self.image_processor = AutoImageProcessor.from_pretrained(image_processor_name_or_path, do_center_crop=False)
603
- head_dim = visual_vocab_size - len(INDICATOR_IDS)
604
- self.head = torch.nn.Sequential(
605
- torch.nn.Linear(self.vit.config.hidden_size * self.vit.config.hidden_stride ** 2, head_dim, bias=False),
606
- torch.nn.LayerNorm(head_dim)
607
- )
608
-
609
- def _encode(self, pixel_values, grid_thws):
610
- output = self.vit(pixel_values, grid_thws, output_hidden_states=True, return_dict=True)
611
- features = output.hidden_states[-1]
612
- seq_len, _ = features.shape
613
- features = features.reshape(seq_len // (self.vit.config.hidden_stride ** 2), -1)
614
- return features
615
-
616
- # Adapted from qwen2_vl
617
- @staticmethod
618
- def smart_resize(
619
- height: int, width: int, factor: int = 28, min_pixels: int = 448 * 448, max_pixels: int = 1344 * 1792
620
- ):
621
- """Rescales the image so that the following conditions are met:
622
- 1. Both dimensions are divisible by 'factor'.
623
- 2. The total number of pixels is within ['min_pixels', 'max_pixels'].
624
- 3. The aspect ratio is maintained as closely as possible.
625
- """
626
- if height < factor or width < factor:
627
- if height < width:
628
- width = round(factor / height * width)
629
- height = factor
630
- else:
631
- height = round(factor / width * height)
632
- width = factor
633
-
634
- elif max(height, width) / min(height, width) > 200:
635
- if height > width:
636
- height = 200 * width
637
- else:
638
- width = 200 * height
639
-
640
- h_bar = round(height / factor) * factor
641
- w_bar = round(width / factor) * factor
642
- if h_bar * w_bar > max_pixels:
643
- beta = math.sqrt((height * width) / max_pixels)
644
- h_bar = math.floor(height / beta / factor) * factor
645
- w_bar = math.floor(width / beta / factor) * factor
646
- elif h_bar * w_bar < min_pixels:
647
- beta = math.sqrt(min_pixels / (height * width))
648
- h_bar = math.ceil(height * beta / factor) * factor
649
- w_bar = math.ceil(width * beta / factor) * factor
650
- return h_bar, w_bar
651
-
652
- def preprocess(
653
- self,
654
- image: Optional[PIL.Image.Image] = None,
655
- video: Optional[List[PIL.Image.Image]] = None,
656
- min_pixels: Optional[int] = None,
657
- max_pixels: Optional[int] = None
658
- ):
659
- patch_size = self.vit.config.patch_size
660
- temporal_patch_size = self.vit.config.temporal_patch_size
661
- hidden_stride = self.vit.config.hidden_stride
662
- assert (image is None) ^ (video is None), "Invalid input: expect either image or video"
663
- if image is not None:
664
- images = [image]
665
- else:
666
- images = video
667
- images = [image.convert("RGB") if image.mode != 'RGB' else image for image in images]
668
- width, height = images[0].size
669
- processed_images = []
670
- for image in images:
671
- resized_height, resized_width = self.smart_resize(
672
- height,
673
- width,
674
- factor=patch_size * hidden_stride,
675
- min_pixels=min_pixels,
676
- max_pixels=max_pixels,
677
- )
678
- new_size = dict(height=resized_height, width=resized_width)
679
- new_image = self.image_processor.preprocess(image, size=new_size, return_tensors="np")['pixel_values'][0]
680
- processed_images.append(new_image)
681
-
682
- patches = np.array(processed_images)
683
- if patches.shape[0] % temporal_patch_size != 0:
684
- repeats = np.repeat(patches[-1][np.newaxis], temporal_patch_size - 1, axis=0)
685
- patches = np.concatenate([patches, repeats], axis=0)
686
- channel = patches.shape[1]
687
- grid_t = patches.shape[0] // temporal_patch_size
688
- grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
689
- grid_thw = torch.tensor([[grid_t, grid_h, grid_w]])
690
-
691
- patches = patches.reshape(
692
- grid_t, temporal_patch_size, channel,
693
- grid_h // hidden_stride, hidden_stride, patch_size,
694
- grid_w // hidden_stride, hidden_stride, patch_size,
695
- )
696
- patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
697
- flatten_patches = patches.reshape(
698
- grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size
699
- )
700
- flatten_patches = torch.tensor(flatten_patches)
701
-
702
- return flatten_patches, grid_thw
703
-
704
- def forward(
705
- self, pixel_values, grid_thws
706
- ) -> torch.Tensor: # [BatchSize, ImageShape] -> [BatchSize, #Token, VocabSize]
707
- features = self._encode(pixel_values, grid_thws)
708
- logits = self.head(features)
709
- tokens = torch.softmax(logits, dim=-1, dtype=torch.float32).to(logits.dtype)
710
-
711
- token_len, _ = tokens.shape
712
- padding_tensor = torch.zeros(size=(token_len, len(INDICATOR_IDS)),
713
- dtype=tokens.dtype,
714
- device=tokens.device,
715
- layout=tokens.layout,
716
- requires_grad=False)
717
- tokens = torch.cat((tokens, padding_tensor), dim=1)
718
- return tokens
719
-
720
-
721
- class OvisPreTrainedModel(PreTrainedModel):
722
- config_class = Ovis2_5_Config
723
- base_model_prefix = "ovis2_5"
724
-
725
-
726
- class Ovis2_5(OvisPreTrainedModel):
727
- _supports_flash_attn_2 = True
728
-
729
- def __init__(self, config: Ovis2_5_Config, *inputs, **kwargs):
730
- super().__init__(config, *inputs, **kwargs)
731
-
732
- self.llm = AutoModelForCausalLM.from_config(self.config.llm_config)
733
- assert self.config.hidden_size == self.llm.config.hidden_size, "hidden size mismatch"
734
- self.text_tokenizer = AutoTokenizer.from_pretrained(self.config.name_or_path)
735
- self.visual_tokenizer = VisualTokenizer(vit=AutoModel.from_config(self.config.vit_config),
736
- visual_vocab_size=self.config.visual_vocab_size,
737
- image_processor_name_or_path=self.config.name_or_path)
738
-
739
- self.vte = VisualEmbedding(self.config.visual_vocab_size, self.config.hidden_size,
740
- device=self.visual_tokenizer.vit.device, dtype=self.visual_tokenizer.vit.dtype)
741
- indicator_token_indices = torch.arange(
742
- self.config.visual_vocab_size - len(INDICATOR_IDS),
743
- self.config.visual_vocab_size,
744
- dtype=torch.long
745
- )
746
- self.register_buffer("indicator_token_indices", indicator_token_indices, persistent=False)
747
-
748
- def _merge_modules(modules_list: tuple):
749
- merged_modules = []
750
- for modules in modules_list:
751
- merged_modules.extend(modules if modules else [])
752
- return merged_modules
753
-
754
- # Standard model configurations for parallelism and device placement
755
- self._no_split_modules = _merge_modules(
756
- (self.llm._no_split_modules, self.visual_tokenizer.vit._no_split_modules))
757
- self._skip_keys_device_placement = self.llm._skip_keys_device_placement
758
- self._keep_in_fp32_modules = _merge_modules(
759
- (self.llm._keep_in_fp32_modules, self.visual_tokenizer.vit._keep_in_fp32_modules))
760
- self.is_parallelizable = all((self.llm.is_parallelizable, self.visual_tokenizer.vit.is_parallelizable))
761
- self.supports_gradient_checkpointing = True
762
-
763
- def tie_weights(self):
764
- self.llm.tie_weights()
765
-
766
- def get_wte(self):
767
- return self.llm.get_input_embeddings()
768
-
769
- def forward(
770
- self,
771
- input_ids: torch.Tensor,
772
- attention_mask: torch.Tensor,
773
- pixel_values: Optional[torch.Tensor],
774
- grid_thws: Optional[torch.Tensor],
775
- labels: Optional[torch.Tensor] = None,
776
- **kwargs
777
- ):
778
- inputs_embeds = self.merge_multimodal(
779
- input_ids=input_ids,
780
- pixel_values=pixel_values,
781
- grid_thws=grid_thws,
782
- )
783
- return self.llm(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels, **kwargs)
784
-
785
- def merge_multimodal(
786
- self,
787
- input_ids: torch.Tensor,
788
- pixel_values: Optional[torch.Tensor],
789
- grid_thws: Optional[torch.Tensor],
790
- ):
791
- placeholder_token_mask = torch.lt(input_ids, 0)
792
- multimodal_embeds = self.get_wte()(torch.masked_fill(input_ids, placeholder_token_mask, 0))
793
-
794
- if pixel_values is not None:
795
- visual_indicator_embeds = self.vte(self.indicator_token_indices).to(
796
- dtype=multimodal_embeds.dtype, device=multimodal_embeds.device
797
- )
798
- visual_tokens = self.visual_tokenizer(pixel_values, grid_thws)
799
- visual_embeds = self.vte(visual_tokens).to(dtype=multimodal_embeds.dtype, device=multimodal_embeds.device)
800
-
801
- for i, indicator_id in enumerate(INDICATOR_IDS):
802
- multimodal_embeds[input_ids == indicator_id] = visual_indicator_embeds[i]
803
- multimodal_embeds[input_ids == VISUAL_ATOM_ID] = visual_embeds
804
-
805
- return multimodal_embeds
806
-
807
- def _merge_inputs(
808
- self, raw_input_ids, placeholder_id, grid_thws, indicator_begin_id, indicator_end_id
809
- ):
810
- input_ids = []
811
- prev_index = 0
812
- placeholder_indexes = [i for i, v in enumerate(raw_input_ids) if v == placeholder_id]
813
- for placeholder_index, grid_thw in zip(placeholder_indexes, grid_thws):
814
- input_ids.extend(raw_input_ids[prev_index:placeholder_index])
815
- num_image_atoms = grid_thw.prod().item()
816
- num_image_atoms //= self.visual_tokenizer.vit.config.hidden_stride ** 2
817
- num_image_atoms //= self.visual_tokenizer.vit.config.temporal_patch_size
818
- input_ids.extend([indicator_begin_id] + [VISUAL_ATOM_ID] * num_image_atoms + [indicator_end_id])
819
- prev_index = placeholder_index + 1
820
- input_ids.extend(raw_input_ids[prev_index:])
821
- return input_ids
822
-
823
- def _tokenize_with_visual_placeholder(self, text):
824
- placeholder = VIDEO_PLACEHOLDER if VIDEO_PLACEHOLDER in text else IMAGE_PLACEHOLDER
825
- placeholder_id = VIDEO_PLACEHOLDER_ID if VIDEO_PLACEHOLDER in text else IMAGE_PLACEHOLDER_ID
826
- chunks = [self.text_tokenizer(chunk, add_special_tokens=False).input_ids for chunk in text.split(placeholder)]
827
- input_ids = chunks[0]
828
- for chunk in chunks[1:]:
829
- input_ids.append(placeholder_id)
830
- input_ids.extend(chunk)
831
- return input_ids
832
-
833
- def preprocess_inputs(
834
- self,
835
- messages: List[Union[str, Dict]],
836
- min_pixels=448 * 448,
837
- max_pixels=1344 * 1792,
838
- add_generation_prompt=True,
839
- enable_thinking=False
840
- ):
841
- text = self.text_tokenizer.apply_chat_template(
842
- messages,
843
- tokenize=False,
844
- add_generation_prompt=add_generation_prompt,
845
- enable_thinking=enable_thinking
846
- )
847
- input_ids = self._tokenize_with_visual_placeholder(text)
848
- images = []
849
- videos = []
850
- for message in messages:
851
- content = message["content"]
852
- if isinstance(content, list):
853
- images.extend([item["image"] for item in content if item.get("image") is not None])
854
- videos.extend([item["video"] for item in content if item.get("video") is not None])
855
- if images and videos:
856
- raise ValueError(
857
- "Multiple visual input data types detected (both image and video provided). "
858
- "This model supports only one type of visual input data at a time. "
859
- "Please provide either image or video, but not both."
860
- )
861
-
862
- pixel_values, grid_thws = None, None
863
- if images:
864
- pixel_values, grid_thws = zip(
865
- *(self.visual_tokenizer.preprocess(image=image, min_pixels=min_pixels, max_pixels=max_pixels)
866
- for image in images)
867
- )
868
- input_ids = self._merge_inputs(
869
- input_ids, IMAGE_PLACEHOLDER_ID, grid_thws, INDICATOR_IDS[0], INDICATOR_IDS[1]
870
- )
871
- pixel_values = torch.cat(pixel_values, dim=0)
872
- grid_thws = torch.cat(grid_thws, dim=0)
873
- elif videos:
874
- assert len(videos) == 1, "only support single video"
875
- pixel_values, grid_thws = self.visual_tokenizer.preprocess(
876
- video=videos[0], min_pixels=min_pixels, max_pixels=max_pixels
877
- )
878
- input_ids = self._merge_inputs(
879
- input_ids, VIDEO_PLACEHOLDER_ID, grid_thws, INDICATOR_IDS[2], INDICATOR_IDS[3]
880
- )
881
-
882
- input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0)
883
-
884
- return input_ids, pixel_values, grid_thws
885
-
886
- def generate(
887
- self,
888
- inputs: Optional[torch.Tensor] = None,
889
- **kwargs,
890
- ) -> Union[GenerateOutput, torch.LongTensor]:
891
- attention_mask = torch.ne(inputs, self.text_tokenizer.pad_token_id).to(device=inputs.device)
892
- inputs_embeds = self.merge_multimodal(
893
- input_ids=inputs,
894
- pixel_values=kwargs.pop('pixel_values', None),
895
- grid_thws=kwargs.pop('grid_thws', None)
896
- )
897
- enable_thinking = kwargs.pop('enable_thinking', False)
898
- enable_thinking_budget = kwargs.pop('enable_thinking_budget', False)
899
- thinking_budget = kwargs.pop('thinking_budget', 1024)
900
-
901
- if enable_thinking and enable_thinking_budget:
902
- actual_max_new_tokens = kwargs['max_new_tokens']
903
- kwargs['max_new_tokens'] = thinking_budget
904
- generated_ids = self.llm.generate(inputs=None, inputs_embeds=inputs_embeds, attention_mask=attention_mask, **kwargs)
905
- output_ids = generated_ids
906
- output_ids_list = generated_ids[0]
907
-
908
- # check if the generation has already finished (151645 is <|im_end|>)
909
- if 151645 not in output_ids_list:
910
- # check if the thinking process has finished (151668 is </think>)
911
- # and prepare the second model input
912
- if 151668 not in output_ids_list:
913
- early_stopping_text = "\n\nConsidering the limited time by the user, I have to give the solution based on the thinking directly now.\n</think>\n\n"
914
- early_stopping_ids = self.text_tokenizer(early_stopping_text, return_tensors="pt", return_attention_mask=False).input_ids.to(inputs.device)
915
- input_ids_appendent = torch.cat([output_ids, early_stopping_ids], dim=-1)
916
- kwargs['streamer'].put(early_stopping_ids) if 'streamer' in kwargs else None
917
- else:
918
- input_ids_appendent = output_ids
919
-
920
-
921
- # second generation
922
- new_inputs = torch.cat([inputs, input_ids_appendent], dim=-1)
923
- attention_mask = torch.ne(new_inputs, self.text_tokenizer.pad_token_id).to(device=inputs.device)
924
- inputs_embeds_appendent = self.merge_multimodal(
925
- input_ids=input_ids_appendent,
926
- pixel_values=None,
927
- grid_thws=None
928
- )
929
- new_inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_appendent], dim=-2)
930
-
931
- kwargs['max_new_tokens'] = inputs_embeds.size(-2) + actual_max_new_tokens - new_inputs_embeds.size(-2)
932
- generated_ids2 = self.llm.generate(inputs=None, inputs_embeds=new_inputs_embeds, attention_mask=attention_mask, **kwargs)
933
- kwargs['streamer'].manual_end() if 'streamer' in kwargs else None
934
- return torch.cat([input_ids_appendent, generated_ids2], dim=-1)
935
-
936
- else:
937
- kwargs['streamer'].manual_end() if 'streamer' in kwargs else None
938
- return generated_ids
939
-
940
- else:
941
- generated_ids = self.llm.generate(inputs=None, inputs_embeds=inputs_embeds, attention_mask=attention_mask, **kwargs)
942
- kwargs['streamer'].manual_end() if 'streamer' in kwargs else None
943
- return generated_ids
944
-
945
-
946
- AutoConfig.register('siglip2_navit', Siglip2NavitConfig)
947
- AutoModel.register(Siglip2NavitConfig, Siglip2NavitModel)
948
- AutoConfig.register("ovis2_5", Ovis2_5_Config)
949
- AutoModelForCausalLM.register(Ovis2_5_Config, Ovis2_5)
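
The class above defines the full inference path: preprocess_inputs() turns a chat-format message list into input_ids, pixel_values and grid_thws, and generate() merges the visual embeddings before delegating to the Qwen3 LLM (with an optional two-stage thinking-budget pass). A minimal usage sketch (assuming a CUDA machine with flash-attn installed, since this file imports flash_attn at module level; the local path and image file are illustrative):

    import torch
    import PIL.Image
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "Ovis2.5-2B", torch_dtype=torch.bfloat16, trust_remote_code=True
    ).cuda()

    messages = [{"role": "user", "content": [
        {"type": "image", "image": PIL.Image.open("example.jpg")},
        {"type": "text", "text": "Describe this image."},
    ]}]
    input_ids, pixel_values, grid_thws = model.preprocess_inputs(messages, add_generation_prompt=True)
    out = model.generate(
        inputs=input_ids.cuda(),
        pixel_values=pixel_values.cuda().to(torch.bfloat16),
        grid_thws=grid_thws.cuda(),
        max_new_tokens=512,
    )
    print(model.text_tokenizer.decode(out[0], skip_special_tokens=True))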
 
Ovis2.5-2B/.ipynb_checkpoints/preprocessor_config-checkpoint.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "do_convert_rgb": null,
3
- "do_normalize": true,
4
- "do_rescale": true,
5
- "do_resize": true,
6
- "image_mean": [
7
- 0.5,
8
- 0.5,
9
- 0.5
10
- ],
11
- "image_processor_type": "SiglipImageProcessor",
12
- "image_std": [
13
- 0.5,
14
- 0.5,
15
- 0.5
16
- ],
17
- "processor_class": "SiglipProcessor",
18
- "resample": 2,
19
- "rescale_factor": 0.00392156862745098,
20
- "size": {
21
- "height": 512,
22
- "width": 512
23
- }
24
- }
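
With rescale_factor = 1/255 and mean = std = 0.5 per channel, this SiglipImageProcessor maps a uint8 pixel value v to (v/255 - 0.5) / 0.5, i.e. into the [-1, 1] range. A one-line sketch of that arithmetic:

    def normalize_pixel(v: int) -> float:
        # rescale to [0, 1], then normalize with mean 0.5 and std 0.5
        return (v / 255.0 - 0.5) / 0.5

    print(normalize_pixel(0), normalize_pixel(255))   # -1.0 1.0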
 
Ovis2.5-2B/.ipynb_checkpoints/tokenizer_config-checkpoint.json DELETED
@@ -1,240 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_prefix_space": false,
4
- "added_tokens_decoder": {
5
- "151643": {
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "151644": {
14
- "content": "<|im_start|>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "151645": {
22
- "content": "<|im_end|>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "151646": {
30
- "content": "<|object_ref_start|>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "151647": {
38
- "content": "<|object_ref_end|>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "151648": {
46
- "content": "<|box_start|>",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
- },
53
- "151649": {
54
- "content": "<|box_end|>",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": true
60
- },
61
- "151650": {
62
- "content": "<|quad_start|>",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": true
68
- },
69
- "151651": {
70
- "content": "<|quad_end|>",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": true
76
- },
77
- "151652": {
78
- "content": "<|vision_start|>",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": true
84
- },
85
- "151653": {
86
- "content": "<|vision_end|>",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": true
92
- },
93
- "151654": {
94
- "content": "<|vision_pad|>",
95
- "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": true
100
- },
101
- "151655": {
102
- "content": "<|image_pad|>",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": true
108
- },
109
- "151656": {
110
- "content": "<|video_pad|>",
111
- "lstrip": false,
112
- "normalized": false,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": true
116
- },
117
- "151657": {
118
- "content": "<tool_call>",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": false
124
- },
125
- "151658": {
126
- "content": "</tool_call>",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": false
132
- },
133
- "151659": {
134
- "content": "<|fim_prefix|>",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": false
140
- },
141
- "151660": {
142
- "content": "<|fim_middle|>",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": false
148
- },
149
- "151661": {
150
- "content": "<|fim_suffix|>",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": false
156
- },
157
- "151662": {
158
- "content": "<|fim_pad|>",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": false
164
- },
165
- "151663": {
166
- "content": "<|repo_name|>",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": false
172
- },
173
- "151664": {
174
- "content": "<|file_sep|>",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": false
180
- },
181
- "151665": {
182
- "content": "<tool_response>",
183
- "lstrip": false,
184
- "normalized": false,
185
- "rstrip": false,
186
- "single_word": false,
187
- "special": false
188
- },
189
- "151666": {
190
- "content": "</tool_response>",
191
- "lstrip": false,
192
- "normalized": false,
193
- "rstrip": false,
194
- "single_word": false,
195
- "special": false
196
- },
197
- "151667": {
198
- "content": "<think>",
199
- "lstrip": false,
200
- "normalized": false,
201
- "rstrip": false,
202
- "single_word": false,
203
- "special": false
204
- },
205
- "151668": {
206
- "content": "</think>",
207
- "lstrip": false,
208
- "normalized": false,
209
- "rstrip": false,
210
- "single_word": false,
211
- "special": false
212
- }
213
- },
214
- "additional_special_tokens": [
215
- "<|im_start|>",
216
- "<|im_end|>",
217
- "<|object_ref_start|>",
218
- "<|object_ref_end|>",
219
- "<|box_start|>",
220
- "<|box_end|>",
221
- "<|quad_start|>",
222
- "<|quad_end|>",
223
- "<|vision_start|>",
224
- "<|vision_end|>",
225
- "<|vision_pad|>",
226
- "<|image_pad|>",
227
- "<|video_pad|>"
228
- ],
229
- "bos_token": null,
230
- "chat_template": "{%- for message in messages %}{{- '<|im_start|>' + message.role + '\n'}}{%- if message.role == 'system' or message.role == 'user' %}{%- if message.content is string %}{{- message.content | replace('<image>', '') | replace('<video>', '') }}{%- else %}{%- for item in message.content %}{%- if item.type == 'text' and 'text' in item %}{{- item.text | replace('<image>', '') | replace('<video>', '') }}{%- elif item.type == 'image' %}{{- '<image>'}}{%- elif item.type == 'video' %}{{- '<video>'}}{%- else %}{{- raise_exception('Invalid content type. Supported types for system and user are text, image, video.')}}{%- endif %}{%- if not loop.last %}{{- '\n'}}{%- endif %}{%- endfor %}{%- endif %}{%- elif message.role == 'assistant' %}{%- set content = '' %}{%- if message.content is string %}{%- set content = message.content | replace('<image>', '') | replace('<video>', '') %}{%- else %}{%- for item in message.content %}{%- if item.type == 'text' and 'text' in item %}{%- set content = content ~ (item.text | replace('<image>', '') | replace('<video>', '')) %}{%- else %}{{- raise_exception('Invalid content type. Supported type for assistant is text.')}}{%- endif %}{%- endfor %}{%- endif %}{%- set content = content.split('</think>')[-1].lstrip('\n') %}{{- content }}{%- else %}{{- raise_exception('Invalid role. Supported roles are system, user, assistant.')}}{%- endif %}{{- '<|im_end|>\n'}}{%- endfor %}{%- if add_generation_prompt %}{{- '<|im_start|>assistant\n' }}{%- if enable_thinking is defined and enable_thinking is false %}{{- '<think>\n\n</think>\n\n' }}{%- endif %}{%- endif %}",
231
- "clean_up_tokenization_spaces": false,
232
- "eos_token": "<|im_end|>",
233
- "errors": "replace",
234
- "extra_special_tokens": {},
235
- "model_max_length": 131072,
236
- "pad_token": "<|endoftext|>",
237
- "split_special_tokens": false,
238
- "tokenizer_class": "Qwen2Tokenizer",
239
- "unk_token": null
240
- }
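
Of the settings above, the eos/pad choices are the ones the modeling code depends on: generate() builds its attention mask from pad_token_id, and the thinking-budget logic checks for the <|im_end|> id. A minimal sketch (illustrative local path):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("Ovis2.5-2B")
    print(tok.eos_token, tok.convert_tokens_to_ids(tok.eos_token))   # <|im_end|> 151645
    print(tok.pad_token, tok.convert_tokens_to_ids(tok.pad_token))   # <|endoftext|> 151643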
 
Ovis2.5-2B/.ipynb_checkpoints/vocab-checkpoint.json DELETED
The diff for this file is too large to render.