diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..90353c76c0cc48ab838bfd4261fe743d8912c32f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +Ovis2.5-2B/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..7fcaa30e3ebf90c97106d294769991062b67f660 --- /dev/null +++ b/LICENSE @@ -0,0 +1,6 @@ +Copyright (C) 2025 AIDC-AI +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and limitations under the License. \ No newline at end of file diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000000000000000000000000000000000000..aee774d4f5982604fd333b34f7b7e10e630645ee --- /dev/null +++ b/NOTICE @@ -0,0 +1,9 @@ +Copyright (C) 2025 AIDC-AI +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. + +This model was trained based on the following models: +1. Ovis2.5-2B ( https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md ), license: Apache-2.0 ( https://choosealicense.com/licenses/apache-2.0/). +2. FLUX.1-schnell ( https://huggingface.co/black-forest-labs/FLUX.1-schnell ), license: Apache-2.0 (https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md ). \ No newline at end of file diff --git a/Ovis2.5-2B/.gitattributes b/Ovis2.5-2B/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..52373fe24473b1aa44333d318f578ae6bf04b49b --- /dev/null +++ b/Ovis2.5-2B/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text 
+*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Ovis2.5-2B/.ipynb_checkpoints/added_tokens-checkpoint.json b/Ovis2.5-2B/.ipynb_checkpoints/added_tokens-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..b54f9135e44c1e81047e8d05cb027af8bc039eed --- /dev/null +++ b/Ovis2.5-2B/.ipynb_checkpoints/added_tokens-checkpoint.json @@ -0,0 +1,28 @@ +{ + "": 151668, + "": 151658, + "": 151666, + "": 151667, + "": 151657, + "": 151665, + "<|box_end|>": 151649, + "<|box_start|>": 151648, + "<|endoftext|>": 151643, + "<|file_sep|>": 151664, + "<|fim_middle|>": 151660, + "<|fim_pad|>": 151662, + "<|fim_prefix|>": 151659, + "<|fim_suffix|>": 151661, + "<|im_end|>": 151645, + "<|im_start|>": 151644, + "<|image_pad|>": 151655, + "<|object_ref_end|>": 151647, + "<|object_ref_start|>": 151646, + "<|quad_end|>": 151651, + "<|quad_start|>": 151650, + "<|repo_name|>": 151663, + "<|video_pad|>": 151656, + "<|vision_end|>": 151653, + "<|vision_pad|>": 151654, + "<|vision_start|>": 151652 +} diff --git a/Ovis2.5-2B/.ipynb_checkpoints/chat_template-checkpoint.json b/Ovis2.5-2B/.ipynb_checkpoints/chat_template-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..76876e335afe0bafd8a0d1fdbba64b0747e48c86 --- /dev/null +++ b/Ovis2.5-2B/.ipynb_checkpoints/chat_template-checkpoint.json @@ -0,0 +1,3 @@ +{ + "chat_template": "{%- for message in messages %}{{- '<|im_start|>' + message.role + '\n'}}{%- if message.role == 'system' or message.role == 'user' %}{%- if message.content is string %}{{- message.content | replace('', '') | replace('')[-1].lstrip('\n') %}{{- content 
}}{%- else %}{{- raise_exception('Invalid role. Supported roles are system, user, assistant.')}}{%- endif %}{{- '<|im_end|>\n'}}{%- endfor %}{%- if add_generation_prompt %}{{- '<|im_start|>assistant\n' }}{%- if enable_thinking is defined and enable_thinking is false %}{{- '\n\n\n\n' }}{%- endif %}{%- endif %}" +} \ No newline at end of file diff --git a/Ovis2.5-2B/.ipynb_checkpoints/config-checkpoint.json b/Ovis2.5-2B/.ipynb_checkpoints/config-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..af67aa96416bed215032fa2d42556ea909c9b22c --- /dev/null +++ b/Ovis2.5-2B/.ipynb_checkpoints/config-checkpoint.json @@ -0,0 +1,73 @@ +{ + "architectures": [ + "Ovis2_5" + ], + "auto_map": { + "AutoConfig": "configuration_ovis2_5.Ovis2_5_Config", + "AutoModelForCausalLM": "modeling_ovis2_5.Ovis2_5" + }, + "conversation_formatter_class": "Qwen3ConversationFormatter", + "hidden_size": 2048, + "vocab_size": 151936, + "num_attention_heads": 32, + "max_position_embeddings": 40960, + "llm_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "Qwen/Qwen3-1.7B", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 + }, + "model_type": "ovis2_5", + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": true, + "visual_vocab_size": 65536, + "vit_config": { + "_attn_implementation_autoset": true, + 
"_name_or_path": "google/siglip2-so400m-patch16-512", + "attention_dropout": 0.0, + "fullatt_block_indexes": null, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "hidden_stride": 2, + "image_size": 512, + "intermediate_size": 4304, + "layer_norm_eps": 1e-06, + "model_type": "siglip2_navit", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 27, + "num_patches": -1, + "patch_size": 16, + "preserve_original_pe": true, + "temporal_patch_size": 1, + "torch_dtype": "bfloat16", + "use_rope": true, + "window_size": 112 + } +} diff --git a/Ovis2.5-2B/.ipynb_checkpoints/configuration_ovis2_5-checkpoint.py b/Ovis2.5-2B/.ipynb_checkpoints/configuration_ovis2_5-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..0a90e12def7a1ae18e663bda3741eb197c44d2e0 --- /dev/null +++ b/Ovis2.5-2B/.ipynb_checkpoints/configuration_ovis2_5-checkpoint.py @@ -0,0 +1,96 @@ +from typing import Any, Optional, List, Union + +from transformers import Qwen3Config +from transformers.configuration_utils import PretrainedConfig + +__all__ = ["Siglip2NavitConfig", "Ovis2_5_Config"] + + +class Siglip2NavitConfig(PretrainedConfig): + """This is the configuration class to store the configuration of an [`AIMv2Model`]. + + Instantiating a configuration with the defaults will yield a similar configuration + to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224). + + Args: + hidden_size: Dimension of the hidden representations. + intermediate_size: Dimension of the SwiGLU representations. + num_hidden_layers: Number of hidden layers in the Transformer. + num_attention_heads: Number of attention heads for each attention layer + in the Transformer. + num_channels: Number of input channels. + image_size: Image size. + patch_size: Patch size. + rms_norm_eps: Epsilon value used for the RMS normalization layer. + attention_dropout: Dropout ratio for attention probabilities. 
+ projection_dropout: Dropout ratio for the projection layer after the attention. + qkv_bias: Whether to add a bias to the queries, keys and values. + use_bias: Whether to add a bias in the feed-forward and projection layers. + kwargs: Keyword arguments for the [`PretrainedConfig`]. + """ + + model_type: str = "siglip2_navit" + + def __init__( + self, + hidden_size: int = 1024, + intermediate_size: int = 4096, + num_hidden_layers: int = 24, + num_attention_heads: int = 16, + num_channels: int = 3, + num_patches: int = -1, + image_size: int = 512, + patch_size: int = 16, + hidden_act: str="gelu_pytorch_tanh", + layer_norm_eps: float = 1e-6, + attention_dropout: float = 0.0, + hidden_stride: int = 2, + window_size: int = 112, + fullatt_block_indexes: Optional[list] = None, + temporal_patch_size: int = 1, + preserve_original_pe: bool = True, + use_rope: bool = True, + **kwargs: Any, + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.num_patches = num_patches + self.patch_size = patch_size + self.image_size = image_size + self.hidden_act = hidden_act + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_stride = hidden_stride + self.window_size = window_size + self.fullatt_block_indexes = fullatt_block_indexes + self.temporal_patch_size = temporal_patch_size + self.preserve_original_pe = preserve_original_pe + self.use_rope = use_rope + +class Ovis2_5_Config(PretrainedConfig): + model_type = "ovis2_5" + sub_configs = dict(llm_config=Qwen3Config, vit_config=Siglip2NavitConfig) + + def __init__(self, + llm_config: Optional[Union[Qwen3Config, dict]] = None, + vit_config: Optional[Union[Siglip2NavitConfig, dict]] = None, + visual_vocab_size=65536, + hidden_size=None, + **kwargs + ): + super().__init__(**kwargs) + if 
isinstance(llm_config, dict): + llm_config = Qwen3Config(**llm_config) + self.llm_config = llm_config + if isinstance(vit_config, dict): + vit_config = Siglip2NavitConfig(**vit_config) + self.vit_config = vit_config + self.visual_vocab_size = visual_vocab_size + self.hidden_size = hidden_size + if kwargs.get('attn_implementation'): + self.llm_config._attn_implementation = kwargs['attn_implementation'] + self.vit_config._attn_implementation = kwargs['attn_implementation'] diff --git a/Ovis2.5-2B/.ipynb_checkpoints/generation_config-checkpoint.json b/Ovis2.5-2B/.ipynb_checkpoints/generation_config-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..d44b45bfc98fbdf410b07eda1b654fc9bce96b47 --- /dev/null +++ b/Ovis2.5-2B/.ipynb_checkpoints/generation_config-checkpoint.json @@ -0,0 +1,15 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "multimodal_max_length": 8192, + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "4.51.3" +} diff --git a/Ovis2.5-2B/.ipynb_checkpoints/modeling_ovis2_5-checkpoint.py b/Ovis2.5-2B/.ipynb_checkpoints/modeling_ovis2_5-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..0f3e99f32f121cfd1a6e68500cc7d2aef6a916fc --- /dev/null +++ b/Ovis2.5-2B/.ipynb_checkpoints/modeling_ovis2_5-checkpoint.py @@ -0,0 +1,949 @@ +import math +from typing import Dict, List, Optional, Tuple, Union + +import PIL.Image +import numpy as np +import torch +from flash_attn import flash_attn_varlen_func +from flash_attn.layers.rotary import apply_rotary_emb +from torch import Tensor, nn +from torch.nn import functional as F +from transformers import ( + AutoConfig, + AutoImageProcessor, + AutoModel, + AutoModelForCausalLM, + AutoTokenizer, +) +from transformers.activations import ACT2FN +from transformers.generation.utils import GenerateOutput +from 
transformers.modeling_outputs import BaseModelOutputWithNoAttention +from transformers.modeling_utils import PreTrainedModel + +from .configuration_ovis2_5 import Siglip2NavitConfig, Ovis2_5_Config + +IMAGE_PLACEHOLDER = "" +IMAGE_PLACEHOLDER_ID = -200 +VIDEO_PLACEHOLDER = "