yongqiang committed on Jun 20, 2025

Commit

6fb90cb

1 Parent(s): 9426f83

init this repo

Browse files

Files changed (50) hide show

assets/bee.jpg +3 -0
infer_axmodel.py +316 -0
smolvlm2_axmodel/llama_p1024_l0_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l10_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l11_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l12_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l13_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l14_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l15_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l16_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l17_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l18_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l19_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l1_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l20_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l21_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l22_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l23_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l24_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l25_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l26_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l27_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l28_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l29_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l2_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l30_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l31_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l3_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l4_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l5_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l6_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l7_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l8_together.axmodel +3 -0
smolvlm2_axmodel/llama_p1024_l9_together.axmodel +3 -0
smolvlm2_axmodel/llama_post.axmodel +3 -0
smolvlm2_axmodel/model.embed_tokens.weight.npy +3 -0
smolvlm2_tokenizer/.gitattributes +35 -0
smolvlm2_tokenizer/README.md +270 -0
smolvlm2_tokenizer/added_tokens.json +130 -0
smolvlm2_tokenizer/chat_template.json +3 -0
smolvlm2_tokenizer/config.json +141 -0
smolvlm2_tokenizer/generation_config.json +7 -0
smolvlm2_tokenizer/merges.txt +0 -0
smolvlm2_tokenizer/preprocessor_config.json +35 -0
smolvlm2_tokenizer/processor_config.json +4 -0
smolvlm2_tokenizer/special_tokens_map.json +39 -0
smolvlm2_tokenizer/tokenizer.json +0 -0
smolvlm2_tokenizer/tokenizer_config.json +1192 -0
smolvlm2_tokenizer/vocab.json +0 -0
vit_mdoel/vision_model.onnx +3 -0

assets/bee.jpg ADDED Viewed

Git LFS Details

SHA256: 8b21ba78250f852ca5990063866b1ace6432521d0251bde7f8de783b22c99a6d
Pointer size: 132 Bytes
Size of remote file: 5.37 MB

infer_axmodel.py ADDED Viewed

	@@ -0,0 +1,316 @@

+from transformers import AutoProcessor, AutoModelForImageTextToText
+import torch
+import onnx
+import onnxruntime as ort
+import numpy as np
+import os
+from tqdm import tqdm
+from transformers import AutoConfig
+from typing import List, Tuple
+from axengine import InferenceSession
+from ml_dtypes import bfloat16
+# Run the torch-side pieces on the GPU when one is available, otherwise on the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# SECURITY: weights_only=False unpickles arbitrary Python objects, so this file
+# must come from a trusted source. The loaded object is called as the
+# patch-embedding stage inside run_vision_model().
+embeddings = torch.load("SmolVLMVisionEmbeddings.pkl", map_location=device, weights_only=False)
+# Token-embedding table of the language model; rows are looked up by token id.
+embeds = np.load(os.path.join("./SmolVLM2-500M-Video-Instruct_1024_AXMODEL", "model.embed_tokens.weight.npy"))
+# No separate connector is loaded: it is already fused into the ONNX vision model below.
+encoder = ort.InferenceSession('./export_onnx_model/vision_model.onnx', providers=["CPUExecutionProvider"])
+def run_vision_model(
+    pixel_values,
+    patch_attention_mask=None,
+):
+    """
+    Run the vision tower: torch patch embeddings followed by the ONNX encoder.
+    Args:
+        pixel_values (`torch.Tensor`):
+            Image batch. Dimension 0 is the batch; dimensions 2 and 3 are divided by the
+            patch size (16) to build the default mask.
+        patch_attention_mask (`torch.Tensor`, *optional*):
+            Boolean mask with one entry per patch. Defaults to "every patch is valid".
+    Returns:
+        `torch.Tensor`: first output of the ONNX encoder, moved to `device` and cast to the
+        dtype of the patch embeddings.
+    """
+    import warnings
+    batch_size = pixel_values.size(0)
+    if patch_attention_mask is None:
+        patch_size = 16
+        patch_attention_mask = torch.ones(
+            (
+                batch_size,
+                pixel_values.size(2) // patch_size,
+                pixel_values.size(3) // patch_size,
+            ),
+            dtype=torch.bool,
+            device=pixel_values.device,
+        )
+    hidden_states = embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
+    # The ONNX session below is fed a single tensor named "input", so the patch mask
+    # cannot be forwarded to the encoder. The previous code tried to build a 4D mask
+    # through `self` and `_prepare_4d_attention_mask`, neither of which exists in this
+    # script, and crashed with a NameError as soon as one patch was padded.
+    if not bool(patch_attention_mask.all()):
+        warnings.warn(
+            "patch_attention_mask contains padded patches, but the ONNX vision encoder "
+            "takes no attention mask; padded patches will be attended to.",
+            stacklevel=2,
+        )
+    encoder_input = hidden_states.detach().cpu().to(dtype=torch.float32).numpy()
+    encoder_outputs = encoder.run(None, {"input": encoder_input})[0]
+    return torch.from_numpy(encoder_outputs).to(device, dtype=hidden_states.dtype)
+def get_image_features(pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
+    """
+    Encodes images into continuous embeddings that can be forwarded to the language model.
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_images, num_channels, height, width)`):
+            The tensors corresponding to the input images. Images that are entirely zero are
+            treated as padding and dropped.
+        pixel_attention_mask (`torch.LongTensor`, *optional*):
+            The attention mask indicating padded regions in the image. Its first two dimensions
+            are flattened together like those of `pixel_values`. Defaults to all ones.
+    Returns:
+        The output of `run_vision_model` for the non-padding images.
+    Raises:
+        ValueError: if `pixel_values` does not have exactly five dimensions.
+    """
+    # Unpacking five names keeps the original rank check on `pixel_values`.
+    batch_size, num_images, _, _, _ = pixel_values.shape
+    pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])
+    # Remove padding images - padding images are full 0.
+    nb_values_per_image = pixel_values.shape[1:].numel()
+    real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
+    if not real_images_inds.any():
+        # no images, leave one empty image.
+        real_images_inds[0] = True
+    pixel_values = pixel_values[real_images_inds].contiguous()
+    # Handle the vision attention mask
+    if pixel_attention_mask is None:
+        pixel_attention_mask = torch.ones(
+            size=[pixel_values.shape[i] for i in (0, 2, 3)],
+            dtype=torch.bool,
+            device=pixel_values.device,
+        )
+    else:
+        # Remove padding images from the mask
+        pixel_attention_mask = pixel_attention_mask.view(batch_size * num_images, *pixel_attention_mask.shape[2:])
+        pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()
+    # Reduce the pixel-level mask to one flag per 16x16 patch: a patch is kept as soon as
+    # a single pixel inside it is unmasked.
+    patch_size = 16
+    patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
+    patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
+    patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
+    # Get sequence from the vision encoder
+    image_hidden_states = run_vision_model(pixel_values, patch_attention_mask)
+    # Modality projection & resampling: no separate connector call is needed here because
+    # the connector is already fused into the ONNX model.
+    return image_hidden_states
+def inputs_merger(
+        input_ids: torch.LongTensor, inputs_embeds: torch.Tensor, image_hidden_states: torch.Tensor,
+        image_token_id: int = 49190,
+    ):
+    """
+    Merge the token embeddings with the image hidden states into one single sequence of vectors that is fed to the transformer LM.
+    The merging happens as follows:
+    - The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
+    - Every `<image>` position is overwritten with the matching image hidden state; all other positions keep their text embedding.
+    - Image blocks are consumed in order: the k-th run of `image_seq_len` `<image>` tokens (counted across the batch, row by row) receives `image_hidden_states[k]`.
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, seq_len)`):
+            Token ids of the prompt.
+        inputs_embeds (`torch.Tensor` of shape `(batch_size, seq_len, hidden_dim)`):
+            Text embeddings for `input_ids`.
+        image_hidden_states (`torch.Tensor` of shape `(num_blocks, image_seq_len, hidden_dim)`):
+            Vision features already projected into the text embedding space.
+        image_token_id (`int`, *optional*, defaults to 49190):
+            Id of the `<image>` placeholder token. The default is the value this script used to hard-code.
+    Returns:
+        `torch.Tensor` with the shape of `inputs_embeds`.
+    Raises:
+        ValueError: if a sample's number of `<image>` tokens is not a multiple of `image_seq_len`.
+    """
+    _, tokens_per_block, _ = image_hidden_states.shape
+    image_mask = input_ids == image_token_id
+    num_image_tokens = image_mask.sum(dim=1)
+    if not torch.all(num_image_tokens % tokens_per_block == 0):
+        raise ValueError("At least one sample has <image> tokens not divisible by patch_size.")
+    # Index of the first image block that belongs to each sample of the batch.
+    blocks_per_sample = num_image_tokens // tokens_per_block
+    offsets = torch.nn.functional.pad(blocks_per_sample.cumsum(dim=0), (1, 0), value=0)
+    block_offset = offsets[:-1]
+    # For every <image> position: which block of the sample it falls in, and where inside that block.
+    # Values computed for non-image positions are meaningless and are filtered out by `image_mask`.
+    row_cum = image_mask.cumsum(dim=-1)
+    chunk_idx = (row_cum - 1) // tokens_per_block
+    local_idx = (row_cum - 1) % tokens_per_block
+    block_idx = block_offset.unsqueeze(1) + chunk_idx
+    image_embeds = torch.zeros_like(inputs_embeds)
+    image_embeds[image_mask] = image_hidden_states[block_idx[image_mask], local_idx[image_mask], :]
+    return torch.where(image_mask.unsqueeze(-1), image_embeds, inputs_embeds)
+def post_process(data, topk=1, topp=0.9, temperature=0.6):
+    """
+    Sample the next token id from a vector of logits using top-k, temperature and top-p.
+    Args:
+        data: logits for one position; any shape, it is flattened.
+        topk: number of highest logits kept as candidates. Values larger than the number of
+            logits, or values <= 0, keep every logit.
+        topp: cumulative-probability threshold of the nucleus filter. The most likely
+            candidate is always kept.
+        temperature: divisor applied to the candidate logits before the softmax; must be > 0.
+    Returns:
+        `(next_token, candidate_index, candidate_soft)`: the sampled token id, the candidate
+        token ids, and the probability of each candidate (same order, sums to 1).
+    Raises:
+        ValueError: if `temperature` is not positive or `data` is empty.
+    """
+    def top_p(probs: np.ndarray, p: float) -> np.ndarray:
+        # Walk the candidates from most to least likely and drop every candidate that
+        # starts after the kept mass has already reached `p`.
+        order = np.argsort(probs)[::-1]
+        ranked = probs[order]
+        mass_before = np.concatenate(([0.0], np.cumsum(ranked)[:-1]))
+        keep = mass_before < p
+        keep[0] = True  # never filter everything out (avoids 0/0 when p <= 0)
+        kept = np.zeros_like(probs)
+        kept[order[keep]] = ranked[keep]
+        return kept / kept.sum()
+    def softmax(logits: np.ndarray) -> np.ndarray:
+        # Subtract the maximum first so that exp() cannot overflow.
+        exps = np.exp(logits - logits.max())
+        return (exps / np.sum(exps)).astype(np.float64)
+    if temperature <= 0:
+        raise ValueError("temperature must be positive")
+    logits = np.asarray(data).astype(np.float32).flatten()
+    if logits.size == 0:
+        raise ValueError("data must contain at least one logit")
+    k = logits.size if topk <= 0 else min(int(topk), logits.size)
+    candidate_index = np.argpartition(logits, -k)[-k:]
+    candidate_value = logits[candidate_index] / temperature
+    candidate_soft = top_p(softmax(candidate_value), topp)
+    pos = np.random.multinomial(1, candidate_soft).argmax()
+    next_token = candidate_index[pos]
+    return next_token, candidate_index, candidate_soft
+if __name__ == "__main__":
+    # End-to-end demo: describe one image with the vision ONNX model and the per-layer
+    # axmodel decoder (one prefill pass over the prompt, then token-by-token decoding).
+    hf_model_path = "./SmolVLM2-500M-Video-Instruct/"
+    axmodel_path = "./SmolVLM2-500M-Video-Instruct_1024_AXMODEL"
+    prompt = 'Can you describe this image?'
+    # Number of prompt positions handled by one prefill call (the decoder files are named llama_p1024_*).
+    prefill_len = 1024
+    # Number of KV-cache slots, i.e. the maximum prompt + answer length.
+    lastN = 2048
+    processor = AutoProcessor.from_pretrained(hf_model_path)
+    config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)
+    tokenizer = processor.tokenizer
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "url": "./bee.jpg"},
+                {"type": "text", "text": prompt},
+            ]
+        },
+    ]
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    ).to(device, dtype=torch.bfloat16)
+    pixel_values = inputs["pixel_values"]
+    pixel_attention_mask = inputs["pixel_attention_mask"]
+    input_ids = inputs["input_ids"]
+    token_ids = input_ids[0].cpu().numpy().tolist()
+    token_len = len(token_ids)
+    if token_len > prefill_len:
+        # Without this check the prompt would fail later with an opaque shape error when
+        # it is copied into the fixed-size prefill buffer.
+        raise ValueError(
+            f"prompt has {token_len} tokens but the prefill graph only accepts {prefill_len}"
+        )
+    inputs_embeds = np.take(embeds, token_ids, axis=0)[None, ...]
+    inputs_embeds = torch.from_numpy(inputs_embeds).to(device, dtype=torch.bfloat16)
+    # Follows get_image_features() in transformers/models/smolvlm/modeling_smolvlm.py.
+    image_hidden_states = get_image_features(pixel_values, pixel_attention_mask)
+    prefill_data = inputs_merger(
+        input_ids=input_ids,
+        inputs_embeds=inputs_embeds,
+        image_hidden_states=image_hidden_states,
+    ).to(dtype=torch.float32).cpu().numpy().astype(bfloat16)
+    cfg = config.text_config
+    kv_dim = cfg.hidden_size // cfg.num_attention_heads * cfg.num_key_value_heads
+    k_caches = [
+        np.zeros((1, lastN, kv_dim), dtype=bfloat16)
+        for _ in range(cfg.num_hidden_layers)
+    ]
+    v_caches = [
+        np.zeros((1, lastN, kv_dim), dtype=bfloat16)
+        for _ in range(cfg.num_hidden_layers)
+    ]
+    # One session per decoder layer; each session serves both prefill (shape_group=1)
+    # and single-token decode (shape_group=0).
+    decoder_sessions = []
+    for i in tqdm(range(cfg.num_hidden_layers), desc="Init InferenceSession"):
+        decoder_sessions.append(
+            InferenceSession(f"{axmodel_path}/llama_p1024_l{i}_together.axmodel")
+        )
+    post_process_session = InferenceSession(
+        f"{axmodel_path}/llama_post.axmodel"
+    )
+    print("model load done!")
+    # ---------------------------------------------------------------- prefill
+    indices = np.arange(prefill_len, dtype=np.uint32).reshape((1, prefill_len))
+    indices[:, token_len:] = 0
+    # Causal mask over the real prompt tokens: 0 where row i may look at column j <= i,
+    # -65536 everywhere else (including all padding rows and columns).
+    mask = np.zeros((1, prefill_len, prefill_len)) - 65536
+    for row in range(token_len):
+        mask[:, row, : row + 1] = 0
+    mask = mask.astype(bfloat16)
+    data = np.zeros((1, prefill_len, cfg.hidden_size)).astype(bfloat16)
+    data[:, 0:token_len] = prefill_data
+    for i in range(cfg.num_hidden_layers):
+        input_feed = {
+            # NOTE(review): zero-filled cache inputs of width hidden_size (not kv_dim) are
+            # kept exactly as before; presumably placeholders for the prefill graph - confirm.
+            "K_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
+            "V_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
+            "indices": indices,
+            "input": data,
+            "mask": mask,
+        }
+        outputs = decoder_sessions[i].run(None, input_feed, shape_group=1)
+        k_caches[i][:, :token_len, :] = outputs[0][:, :token_len, :]
+        v_caches[i][:, :token_len, :] = outputs[1][:, :token_len, :]
+        # The layer output becomes the input of the next layer.
+        data[:, :token_len] = outputs[2][:, :token_len, :]
+    # Only the hidden state of the last prompt token is needed to predict the first answer token.
+    post_out = post_process_session.run(None, {"input": data[:, token_len - 1, :][None, ...]})[0]
+    next_token, _, _ = post_process(post_out, topk=1)
+    token_ids.append(next_token)
+    # print("prefill done!")
+    print(f"input prompt: {prompt}\n")
+    print("answer >>", tokenizer.decode(token_ids[token_len], skip_special_tokens=True), end='', flush=True)
+    # ----------------------------------------------------------------- decode
+    # lastN cache columns plus one trailing column that stays 0; presumably it stands for
+    # the token being decoded - confirm against the exported graph.
+    mask = np.zeros((1, 1, lastN + 1), dtype=np.float32).astype(bfloat16)
+    mask[:, :, :lastN] -= 65536
+    mask[:, :, :token_len] = 0
+    # Stop at lastN - 1: the caches only have lastN slots, so position lastN cannot be stored.
+    for start_indice in range(token_len, lastN):
+        next_token = token_ids[start_indice]
+        if next_token == tokenizer.eos_token_id:
+            # Covers both an EOS sampled right after prefill and one sampled in the previous step.
+            break
+        indices = np.array([start_indice], np.uint32).reshape((1, 1))
+        data = embeds[next_token, :].reshape((1, 1, cfg.hidden_size)).astype(bfloat16)
+        for i in range(cfg.num_hidden_layers):
+            input_feed = {
+                "K_cache": k_caches[i],
+                "V_cache": v_caches[i],
+                "indices": indices,
+                "input": data,
+                "mask": mask,
+            }
+            outputs = decoder_sessions[i].run(None, input_feed, shape_group=0)
+            k_caches[i][:, start_indice, :] = outputs[0][:, :, :]
+            v_caches[i][:, start_indice, :] = outputs[1][:, :, :]
+            data = outputs[2]
+        # The position just written to the cache becomes visible to the following steps.
+        mask[..., start_indice] = 0
+        post_out = post_process_session.run(None, {"input": data})[0]
+        sampled_token, _, _ = post_process(post_out)
+        token_ids.append(sampled_token)
+        print(tokenizer.decode(sampled_token, skip_special_tokens=True), end='', flush=True)
+    print("\n")

smolvlm2_axmodel/llama_p1024_l0_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:125ac7e80a94dbd3920fb0e0077ccad612abe8fabc2040dda09b19813ce96f68
+size 12002005

smolvlm2_axmodel/llama_p1024_l10_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12f5aa82a4dcc3a66aaad951b1ea87c50e618c93adade3a2d1a7b5614169f5a1
+size 12002005

smolvlm2_axmodel/llama_p1024_l11_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ba247ba036a831b6201b53a03bf9847e16be239b386846cf22980da6695cc0d6
+size 12002005

smolvlm2_axmodel/llama_p1024_l12_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:011aea9b7e4fcadec5d1b2c386ff4a12e2f3f0e0e31eca634afc8acc9f0d343b
+size 12002005

smolvlm2_axmodel/llama_p1024_l13_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9420f15bb5b591f258212242bc5fa5566ba45f4d697d0599999114961152d1fd
+size 12002005

smolvlm2_axmodel/llama_p1024_l14_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:397511107011f700388029e604c2f5ec6d092f9cb6e09ab890a198932173193c
+size 12002005

smolvlm2_axmodel/llama_p1024_l15_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:689d9286ad7cf81345352f85bfbb8387934fe7ccb76d3f56563ded5f1d7cdb7b
+size 12002005

smolvlm2_axmodel/llama_p1024_l16_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b91fecc232c92c9faa5fca4ca1bff0802abc8351457f9b34ef55327ccdcbc85a
+size 12002005

smolvlm2_axmodel/llama_p1024_l17_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9404c81f4a02fe332ae1f4ed5361d2f68eea66a9550233cc4c1d4455afc95797
+size 12002005

smolvlm2_axmodel/llama_p1024_l18_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ffa8d959498bd479d2bbb2c42e883a21bb173fbcb73f5d1bbdebe6c8365e8e21
+size 12002005

smolvlm2_axmodel/llama_p1024_l19_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66265cbf7cd8571f949c23ca6a5918f8c95fb3413e4349cb9c9f3ac18231ca21
+size 12002005

smolvlm2_axmodel/llama_p1024_l1_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9addcae5bad93adaf9f8df49d4cbfa82024be2d2e0b2e815537121a7417ecb88
+size 12002005

smolvlm2_axmodel/llama_p1024_l20_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:69430a836a9eb0d46242419a999e761d61a0c4cc4d17eafbe373641551ac0a8b
+size 12002005

smolvlm2_axmodel/llama_p1024_l21_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a19009fd1a1d28c9414cb9421af4c66473088a0b3caea9157bde6aac071e1ce
+size 12002005

smolvlm2_axmodel/llama_p1024_l22_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ec30ac9fd2a52f281b76a037d0aa146b8144277aed3408a6c281e5a7df8ba62a
+size 12002005

smolvlm2_axmodel/llama_p1024_l23_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1093d36fa84d6248b1a4728d8ae2aadb1143894eaf3d960e12fd3753d3ab4da2
+size 12002005

smolvlm2_axmodel/llama_p1024_l24_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff63d4efb6dd75433205ce87e4d69d7850dad86555b2919864f04c5df3a8a844
+size 12002005

smolvlm2_axmodel/llama_p1024_l25_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83d8b772f3aef6356234912a371baebcb6c0897faf3d524091b7ea2fc56f77bc
+size 12002005

smolvlm2_axmodel/llama_p1024_l26_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:033f9deb6fe2288347d1af507d7a31deb0633614dfb0efe9a3a9c962afbe44eb
+size 12002005

smolvlm2_axmodel/llama_p1024_l27_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c0c8c035eb371dd31d53844534c4d321efc933e1097ad3e9d87afd52dba74214
+size 12002005

smolvlm2_axmodel/llama_p1024_l28_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8d33cae03279cab06a856cfacc3e84414c615082a4a358bd09c4a5996c17c575
+size 12002005

smolvlm2_axmodel/llama_p1024_l29_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:84583f5ef60b629b34d47c7deeb3200c096d6d6bf3de3f6bec4da6ae005b5a1e
+size 12002005

smolvlm2_axmodel/llama_p1024_l2_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4514475633a7317118fe4486200bbed73929bd4210c6da4041591797ad93fb3a
+size 12002005

smolvlm2_axmodel/llama_p1024_l30_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:39e1612aac9b1604146b61b4fc37eaada2299f62078260689bf03812c256c75b
+size 12002005

smolvlm2_axmodel/llama_p1024_l31_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f2f54bcb7d01ea69a3177b72d49e3bdab2d0e0403e86085903389cc6839b5fd
+size 12002005

smolvlm2_axmodel/llama_p1024_l3_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a991d67e4c1dc4bf58689ce4a58362f6bcc73a87257bcb2982774a0b056ca720
+size 12002005

smolvlm2_axmodel/llama_p1024_l4_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a43e6886989c31dfffeae70177fc9464322bded5bb69515e31aaade31b431b5
+size 12002005

smolvlm2_axmodel/llama_p1024_l5_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ed59bef655c1eae8eb7af4566ef21fd874cfac72b67bbfd1a7279e1a1cffd2c8
+size 12002005

smolvlm2_axmodel/llama_p1024_l6_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:848640700c17925475ef9f9edeaa0fccf235e90a5ad159430682ac389910d86b
+size 12002005

smolvlm2_axmodel/llama_p1024_l7_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:46e4bce8f94d80d12e3b1a5ceae7ba62cbaa06f0ddf11f13999b1936a98bc0a1
+size 12002005

smolvlm2_axmodel/llama_p1024_l8_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b3ba57d8f2cd4d932445600d161a04b0a1160f452425c5abd08f94bece56f23f
+size 12002005

smolvlm2_axmodel/llama_p1024_l9_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0464cccfdfb0566069bad977d98f70b9e15e8e0b642a6e01ca2b16b5f7eb170a
+size 12002005

smolvlm2_axmodel/llama_post.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89e16c32d05a23b3449b298d8df16bc80edba5c719812c2567e074bdccafbd50
+size 51580706

smolvlm2_axmodel/model.embed_tokens.weight.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:226adbf93820671559d70330a69e69f02641a41b8284dd26b51576545ab3eb10
+size 189235328

smolvlm2_tokenizer/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

smolvlm2_tokenizer/README.md ADDED Viewed

	@@ -0,0 +1,270 @@

+---
+library_name: transformers
+license: apache-2.0
+datasets:
+- HuggingFaceM4/the_cauldron
+- HuggingFaceM4/Docmatix
+- lmms-lab/LLaVA-OneVision-Data
+- lmms-lab/M4-Instruct-Data
+- HuggingFaceFV/finevideo
+- MAmmoTH-VL/MAmmoTH-VL-Instruct-12M
+- lmms-lab/LLaVA-Video-178K
+- orrzohar/Video-STaR
+- Mutonix/Vript
+- TIGER-Lab/VISTA-400K
+- Enxin/MovieChat-1K_train
+- ShareGPT4Video/ShareGPT4Video
+pipeline_tag: image-text-to-text
+language:
+- en
+base_model:
+- HuggingFaceTB/SmolVLM-500M-Instruct
+---
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/SmolVLM2_banner.png" width="800" height="auto" alt="Image description">
+# SmolVLM2-500M-Video
+SmolVLM2-500M-Video is a lightweight multimodal model designed to analyze video content. The model processes videos, images, and text inputs to generate text outputs - whether answering questions about media files, comparing visual content, or transcribing text from images. Despite its compact size, requiring only 1.8GB of GPU RAM for video inference, it delivers robust performance on complex multimodal tasks. This efficiency makes it particularly well-suited for on-device applications where computational resources may be limited.
+## Model Summary
+- **Developed by:** Hugging Face 🤗
+- **Model type:** Multi-modal model (image/multi-image/video/text)
+- **Language(s) (NLP):** English
+- **License:** Apache 2.0
+- **Architecture:** Based on [Idefics3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (see technical summary)
+## Resources
+- **Demo:** [Video Highlight Generator](https://huggingface.co/spaces/HuggingFaceTB/SmolVLM2-HighlightGenerator)
+- **Blog:** [Blog post](https://huggingface.co/blog/smolvlm2)
+## Uses
+SmolVLM2 can be used for inference on multimodal (video / image / text) tasks where the input consists of text queries along with video or one or more images. Text and media files can be interleaved arbitrarily, enabling tasks like captioning, visual question answering, and storytelling based on visual content. The model does not support image or video generation.
+To fine-tune SmolVLM2 on a specific task, you can follow [the fine-tuning tutorial](https://github.com/huggingface/smollm/blob/main/vision/finetuning/Smol_VLM_FT.ipynb).
+## Evaluation
+We evaluated the performance of the SmolVLM2 family on the following scientific benchmarks:
+| Size    | Video-MME | MLVU | MVBench |
+|----------|-----------------|----------|---------------|
+| 2.2B   | 52.1            | 55.2     | 46.27        |
+| 500M | 42.2            | 47.3     | 39.73        |
+| 256M | 33.7            | 40.6     | 32.7          |
+### How to get started
+You can use transformers to load, infer and fine-tune SmolVLM. Make sure you have num2words, flash-attn and the latest version of transformers installed.
+You can load the model as follows.
+```python
+from transformers import AutoProcessor, AutoModelForImageTextToText
+import torch
+model_path = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
+processor = AutoProcessor.from_pretrained(model_path)
+model = AutoModelForImageTextToText.from_pretrained(
+    model_path,
+    torch_dtype=torch.bfloat16,
+    _attn_implementation="flash_attention_2"
+).to("cuda")
+```
+#### Simple Inference
+You can preprocess your inputs with the chat template and pass the result directly to the model.
+```python
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
+            {"type": "text", "text": "Can you describe this image?"},
+        ]
+    },
+]
+inputs = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+).to(model.device, dtype=torch.bfloat16)
+generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)
+generated_texts = processor.batch_decode(
+    generated_ids,
+    skip_special_tokens=True,
+)
+print(generated_texts[0])
+```
+#### Video Inference
+To use SmolVLM2 for video inference, make sure you have decord installed.
+```python
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "video", "path": "path_to_video.mp4"},
+            {"type": "text", "text": "Describe this video in detail"}
+        ]
+    },
+]
+inputs = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+).to(model.device, dtype=torch.bfloat16)
+generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)
+generated_texts = processor.batch_decode(
+    generated_ids,
+    skip_special_tokens=True,
+)
+print(generated_texts[0])
+```
+#### Multi-image Interleaved Inference
+You can interleave multiple media with text using chat templates.
+```python
+import torch
+messages = [
+    {
+        "role": "user",
+        "content": [
+          {"type": "text", "text": "What is the similarity between these two images?"},
+          {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
+          {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"},
+        ]
+    },
+]
+inputs = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+).to(model.device, dtype=torch.bfloat16)
+generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)
+generated_texts = processor.batch_decode(
+    generated_ids,
+    skip_special_tokens=True,
+)
+print(generated_texts[0])
+```
+### Model optimizations
+## Misuse and Out-of-scope Use
+SmolVLM is not intended for high-stakes scenarios or critical decision-making processes that affect an individual's well-being or livelihood. The model may produce content that appears factual but may not be accurate. Misuse includes, but is not limited to:
+- Prohibited Uses:
+  - Evaluating or scoring individuals (e.g., in employment, education, credit)
+  - Critical automated decision-making
+  - Generating unreliable factual content
+- Malicious Activities:
+  - Spam generation
+  - Disinformation campaigns
+  - Harassment or abuse
+  - Unauthorized surveillance
+### License
+SmolVLM2 is built upon [SigLIP](https://huggingface.co/google/siglip-base-patch16-512) as the image encoder and [SmolLM2](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct) as the text decoder.
+We release the SmolVLM2 checkpoints under the Apache 2.0 license.
+## Citation information
+You can cite us in the following way:
+```bibtex
+@article{marafioti2025smolvlm,
+  title={SmolVLM: Redefining small and efficient multimodal models},
+  author={Andrés Marafioti and Orr Zohar and Miquel Farré and Merve Noyan and Elie Bakouch and Pedro Cuenca and Cyril Zakka and Loubna Ben Allal and Anton Lozhkov and Nouamane Tazi and Vaibhav Srivastav and Joshua Lochner and Hugo Larcher and Mathieu Morlon and Lewis Tunstall and Leandro von Werra and Thomas Wolf},
+  journal={arXiv preprint arXiv:2504.05299},
+  year={2025}
+}
+```
+## Training Data
+SmolVLM2 was trained on 3.3M samples originally drawn from ten different datasets: [LLaVA-OneVision](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data), [M4-Instruct](https://huggingface.co/datasets/lmms-lab/M4-Instruct-Data), [MAmmoTH-VL](https://huggingface.co/datasets/MAmmoTH-VL/MAmmoTH-VL-Instruct-12M), [LLaVA-Video-178K](https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K), [FineVideo](https://huggingface.co/datasets/HuggingFaceFV/finevideo), [Video-STaR](https://huggingface.co/datasets/orrzohar/Video-STaR), [Vript](https://huggingface.co/datasets/Mutonix/Vript), [VISTA-400K](https://huggingface.co/datasets/TIGER-Lab/VISTA-400K), [MovieChat](https://huggingface.co/datasets/Enxin/MovieChat-1K_train) and [ShareGPT4Video](https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video).
+The tables below give a general overview of the samples across modalities and the source of those samples.
+<!--
+<center><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolvlm2_data_split.png" width="auto" height="auto" alt="Image description">
+</center>
+### Details
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolvlm2_datadetails.png" width="auto" height="auto" alt="Image description"> -->
+## Data Split per modality
+| Data Type    | Percentage |
+|--------------|------------|
+| Image        | 34.4%      |
+| Text         | 20.2%      |
+| Video        | 33.0%      |
+| Multi-image  | 12.3%      |
+## Granular dataset slices per modality
+### Text Datasets
+| Dataset                                    | Percentage |
+|--------------------------------------------|------------|
+| llava-onevision/magpie_pro_ft3_80b_mt      | 6.8%       |
+| llava-onevision/magpie_pro_ft3_80b_tt      | 6.8%       |
+| llava-onevision/magpie_pro_qwen2_72b_tt    | 5.8%       |
+| llava-onevision/mathqa                     | 0.9%       |
+### Multi-image Datasets
+| Dataset                                    | Percentage |
+|--------------------------------------------|------------|
+| m4-instruct-data/m4_instruct_multiimage    | 10.4%      |
+| mammoth/multiimage-cap6                    | 1.9%       |
+### Image Datasets
+| Dataset                                    | Percentage |
+|--------------------------------------------|------------|
+| llava-onevision/other                      | 17.4%      |
+| llava-onevision/vision_flan                | 3.9%       |
+| llava-onevision/mavis_math_metagen         | 2.6%       |
+| llava-onevision/mavis_math_rule_geo        | 2.5%       |
+| llava-onevision/sharegpt4o                 | 1.7%       |
+| llava-onevision/sharegpt4v_coco            | 1.5%       |
+| llava-onevision/image_textualization       | 1.3%       |
+| llava-onevision/sharegpt4v_llava           | 0.9%       |
+| llava-onevision/mapqa                      | 0.9%       |
+| llava-onevision/qa                         | 0.8%       |
+| llava-onevision/textocr                    | 0.8%       |
+### Video Datasets
+| Dataset                                    | Percentage |
+|--------------------------------------------|------------|
+| llava-video-178k/1-2m                      | 7.3%       |
+| llava-video-178k/2-3m                      | 7.0%       |
+| other-video/combined                       | 5.7%       |
+| llava-video-178k/hound                     | 4.4%       |
+| llava-video-178k/0-30s                     | 2.4%       |
+| video-star/starb                           | 2.2%       |
+| vista-400k/combined                        | 2.2%       |
+| vript/long                                 | 1.0%       |
+| ShareGPT4Video/all                         | 0.8%       |

smolvlm2_tokenizer/added_tokens.json ADDED Viewed

	@@ -0,0 +1,130 @@

+{
+  "<end_of_utterance>": 49279,
+  "<fake_token_around_image>": 49189,
+  "<global-img>": 49152,
+  "<image>": 49190,
+  "<row_1_col_1>": 49153,
+  "<row_1_col_2>": 49154,
+  "<row_1_col_3>": 49155,
+  "<row_1_col_4>": 49156,
+  "<row_1_col_5>": 49157,
+  "<row_1_col_6>": 49158,
+  "<row_2_col_1>": 49159,
+  "<row_2_col_2>": 49160,
+  "<row_2_col_3>": 49161,
+  "<row_2_col_4>": 49162,
+  "<row_2_col_5>": 49163,
+  "<row_2_col_6>": 49164,
+  "<row_3_col_1>": 49165,
+  "<row_3_col_2>": 49166,
+  "<row_3_col_3>": 49167,
+  "<row_3_col_4>": 49168,
+  "<row_3_col_5>": 49169,
+  "<row_3_col_6>": 49170,
+  "<row_4_col_1>": 49171,
+  "<row_4_col_2>": 49172,
+  "<row_4_col_3>": 49173,
+  "<row_4_col_4>": 49174,
+  "<row_4_col_5>": 49175,
+  "<row_4_col_6>": 49176,
+  "<row_5_col_1>": 49177,
+  "<row_5_col_2>": 49178,
+  "<row_5_col_3>": 49179,
+  "<row_5_col_4>": 49180,
+  "<row_5_col_5>": 49181,
+  "<row_5_col_6>": 49182,
+  "<row_6_col_1>": 49183,
+  "<row_6_col_2>": 49184,
+  "<row_6_col_3>": 49185,
+  "<row_6_col_4>": 49186,
+  "<row_6_col_5>": 49187,
+  "<row_6_col_6>": 49188,
+  "<|reserved_special_token_0|>": 49191,
+  "<|reserved_special_token_10|>": 49201,
+  "<|reserved_special_token_11|>": 49202,
+  "<|reserved_special_token_12|>": 49203,
+  "<|reserved_special_token_13|>": 49204,
+  "<|reserved_special_token_14|>": 49205,
+  "<|reserved_special_token_15|>": 49206,
+  "<|reserved_special_token_16|>": 49207,
+  "<|reserved_special_token_17|>": 49208,
+  "<|reserved_special_token_18|>": 49209,
+  "<|reserved_special_token_19|>": 49210,
+  "<|reserved_special_token_1|>": 49192,
+  "<|reserved_special_token_20|>": 49211,
+  "<|reserved_special_token_21|>": 49212,
+  "<|reserved_special_token_22|>": 49213,
+  "<|reserved_special_token_23|>": 49214,
+  "<|reserved_special_token_24|>": 49215,
+  "<|reserved_special_token_25|>": 49216,
+  "<|reserved_special_token_26|>": 49217,
+  "<|reserved_special_token_27|>": 49218,
+  "<|reserved_special_token_28|>": 49219,
+  "<|reserved_special_token_29|>": 49220,
+  "<|reserved_special_token_2|>": 49193,
+  "<|reserved_special_token_30|>": 49221,
+  "<|reserved_special_token_31|>": 49222,
+  "<|reserved_special_token_32|>": 49223,
+  "<|reserved_special_token_33|>": 49224,
+  "<|reserved_special_token_34|>": 49225,
+  "<|reserved_special_token_35|>": 49226,
+  "<|reserved_special_token_36|>": 49227,
+  "<|reserved_special_token_37|>": 49228,
+  "<|reserved_special_token_38|>": 49229,
+  "<|reserved_special_token_39|>": 49230,
+  "<|reserved_special_token_3|>": 49194,
+  "<|reserved_special_token_40|>": 49231,
+  "<|reserved_special_token_41|>": 49232,
+  "<|reserved_special_token_42|>": 49233,
+  "<|reserved_special_token_43|>": 49234,
+  "<|reserved_special_token_44|>": 49235,
+  "<|reserved_special_token_45|>": 49236,
+  "<|reserved_special_token_46|>": 49237,
+  "<|reserved_special_token_47|>": 49238,
+  "<|reserved_special_token_48|>": 49239,
+  "<|reserved_special_token_49|>": 49240,
+  "<|reserved_special_token_4|>": 49195,
+  "<|reserved_special_token_50|>": 49241,
+  "<|reserved_special_token_51|>": 49242,
+  "<|reserved_special_token_52|>": 49243,
+  "<|reserved_special_token_53|>": 49244,
+  "<|reserved_special_token_54|>": 49245,
+  "<|reserved_special_token_55|>": 49246,
+  "<|reserved_special_token_56|>": 49247,
+  "<|reserved_special_token_57|>": 49248,
+  "<|reserved_special_token_58|>": 49249,
+  "<|reserved_special_token_59|>": 49250,
+  "<|reserved_special_token_5|>": 49196,
+  "<|reserved_special_token_60|>": 49251,
+  "<|reserved_special_token_61|>": 49252,
+  "<|reserved_special_token_62|>": 49253,
+  "<|reserved_special_token_63|>": 49254,
+  "<|reserved_special_token_64|>": 49255,
+  "<|reserved_special_token_65|>": 49256,
+  "<|reserved_special_token_66|>": 49257,
+  "<|reserved_special_token_67|>": 49258,
+  "<|reserved_special_token_68|>": 49259,
+  "<|reserved_special_token_69|>": 49260,
+  "<|reserved_special_token_6|>": 49197,
+  "<|reserved_special_token_70|>": 49261,
+  "<|reserved_special_token_71|>": 49262,
+  "<|reserved_special_token_72|>": 49263,
+  "<|reserved_special_token_73|>": 49264,
+  "<|reserved_special_token_74|>": 49265,
+  "<|reserved_special_token_75|>": 49266,
+  "<|reserved_special_token_76|>": 49267,
+  "<|reserved_special_token_77|>": 49268,
+  "<|reserved_special_token_78|>": 49269,
+  "<|reserved_special_token_79|>": 49270,
+  "<|reserved_special_token_7|>": 49198,
+  "<|reserved_special_token_80|>": 49271,
+  "<|reserved_special_token_81|>": 49272,
+  "<|reserved_special_token_82|>": 49273,
+  "<|reserved_special_token_83|>": 49274,
+  "<|reserved_special_token_84|>": 49275,
+  "<|reserved_special_token_85|>": 49276,
+  "<|reserved_special_token_86|>": 49277,
+  "<|reserved_special_token_87|>": 49278,
+  "<|reserved_special_token_8|>": 49199,
+  "<|reserved_special_token_9|>": 49200
+}

smolvlm2_tokenizer/chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
+}

smolvlm2_tokenizer/config.json ADDED Viewed

	@@ -0,0 +1,141 @@

+{
+  "architectures": [
+    "SmolVLMForConditionalGeneration"
+  ],
+  "image_token_id": 49190,
+  "model_type": "smolvlm",
+  "pad_token_id": 128002,
+  "scale_factor": 4,
+  "text_config": {
+    "_flash_attn_2_enabled": true,
+    "_name_or_path": "None",
+    "architectures": [
+      "VLlama3ForCausalLM"
+    ],
+    "head_dim": 64,
+    "hidden_size": 960,
+    "intermediate_size": 2560,
+    "is_llama_config": true,
+    "max_position_embeddings": 8192,
+    "model_type": "llama",
+    "neftune_noise_alpha": 0.0,
+    "num_attention_heads": 15,
+    "num_hidden_layers": 32,
+    "num_key_value_heads": 5,
+    "pad_token_id": 2,
+    "perceiver_config": {
+      "_attn_implementation_autoset": false,
+      "_name_or_path": "",
+      "add_cross_attention": false,
+      "architectures": null,
+      "attention_dropout": 0.0,
+      "bad_words_ids": null,
+      "begin_suppress_tokens": null,
+      "bos_token_id": null,
+      "chunk_size_feed_forward": 0,
+      "cross_attention_hidden_size": null,
+      "decoder_start_token_id": null,
+      "diversity_penalty": 0.0,
+      "do_sample": false,
+      "early_stopping": false,
+      "encoder_no_repeat_ngram_size": 0,
+      "eos_token_id": null,
+      "exponential_decay_length_penalty": null,
+      "finetuning_task": null,
+      "forced_bos_token_id": null,
+      "forced_eos_token_id": null,
+      "hidden_act": "silu",
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+      },
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+      },
+      "length_penalty": 1.0,
+      "max_length": 20,
+      "min_length": 0,
+      "model_type": "vllama3",
+      "no_repeat_ngram_size": 0,
+      "num_beam_groups": 1,
+      "num_beams": 1,
+      "num_key_value_heads": 1,
+      "num_return_sequences": 1,
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "output_scores": false,
+      "pad_token_id": null,
+      "prefix": null,
+      "problem_type": null,
+      "pruned_heads": {},
+      "qk_layer_norms_perceiver": false,
+      "remove_invalid_values": false,
+      "repetition_penalty": 1.0,
+      "resampler_depth": 6,
+      "resampler_head_dim": 96,
+      "resampler_n_heads": 16,
+      "resampler_n_latents": 64,
+      "return_dict": true,
+      "return_dict_in_generate": false,
+      "sep_token_id": null,
+      "suppress_tokens": null,
+      "task_specific_params": null,
+      "temperature": 1.0,
+      "tf_legacy_loss": false,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": true,
+      "tokenizer_class": null,
+      "top_k": 50,
+      "top_p": 1.0,
+      "torch_dtype": null,
+      "torchscript": false,
+      "transformers_version": "4.46.0",
+      "typical_p": 1.0,
+      "use_bfloat16": false
+    },
+    "pixel_shuffle_factor": 4,
+    "qk_layer_norms": false,
+    "rms_norm_eps": 1e-05,
+    "rope_interleaved": false,
+    "rope_theta": 100000,
+    "torch_dtype": "bfloat16",
+    "transformers.js_config": {
+      "kv_cache_dtype": {
+        "fp16": "float16",
+        "q4f16": "float16"
+      }
+    },
+    "use_resampler": false,
+    "vocab_size": 49280
+  },
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers.js_config": {
+    "kv_cache_dtype": {
+      "fp16": "float16",
+      "q4f16": "float16"
+    }
+  },
+  "transformers_version": "4.47.1",
+  "use_cache": false,
+  "use_reentrant_checkpointing": false,
+  "vision_config": {
+    "hidden_size": 768,
+    "image_size": 512,
+    "max_image_size": {
+      "longest_edge": 512
+    },
+    "model_type": "smolvlm_vision",
+    "num_attention_heads": 12,
+    "patch_size": 16,
+    "size": {
+      "longest_edge": 2048
+    },
+    "tie_word_embeddings": false,
+    "use_base_siglip": false
+  },
+  "vocab_size": 49280
+}

smolvlm2_tokenizer/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 49279,
+  "pad_token_id": 2,
+  "transformers_version": "4.47.1"
+}

smolvlm2_tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

smolvlm2_tokenizer/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "do_convert_rgb": true,
+  "do_image_splitting": true,
+  "do_normalize": true,
+  "do_pad": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SmolVLMImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "max_image_size": {
+    "longest_edge": 512
+  },
+  "processor_class": "SmolVLMProcessor",
+  "resample": 1,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "longest_edge": 2048
+  },
+  "video_sampling": {
+    "fps": 1,
+    "max_frames": 64,
+    "video_size": {
+      "longest_edge": 512
+    }
+  }
+}

smolvlm2_tokenizer/processor_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "image_seq_len": 64,
+  "processor_class": "SmolVLMProcessor"
+}

smolvlm2_tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "additional_special_tokens": [
+    "<fake_token_around_image>",
+    "<image>",
+    "<end_of_utterance>"
+  ],
+  "bos_token": {
+    "content": "<|im_start|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "end_of_utterance_token": "<end_of_utterance>",
+  "eos_token": {
+    "content": "<end_of_utterance>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "fake_image_token": "<fake_token_around_image>",
+  "global_image_token": "<global-img>",
+  "image_token": "<image>",
+  "pad_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

smolvlm2_tokenizer/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

smolvlm2_tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,1192 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<repo_name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<file_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<jupyter_script>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49152": {
+      "content": "<global-img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49153": {
+      "content": "<row_1_col_1>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49154": {
+      "content": "<row_1_col_2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49155": {
+      "content": "<row_1_col_3>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49156": {
+      "content": "<row_1_col_4>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49157": {
+      "content": "<row_1_col_5>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49158": {
+      "content": "<row_1_col_6>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49159": {
+      "content": "<row_2_col_1>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49160": {
+      "content": "<row_2_col_2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49161": {
+      "content": "<row_2_col_3>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49162": {
+      "content": "<row_2_col_4>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49163": {
+      "content": "<row_2_col_5>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49164": {
+      "content": "<row_2_col_6>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49165": {
+      "content": "<row_3_col_1>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49166": {
+      "content": "<row_3_col_2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49167": {
+      "content": "<row_3_col_3>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49168": {
+      "content": "<row_3_col_4>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49169": {
+      "content": "<row_3_col_5>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49170": {
+      "content": "<row_3_col_6>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49171": {
+      "content": "<row_4_col_1>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49172": {
+      "content": "<row_4_col_2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49173": {
+      "content": "<row_4_col_3>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49174": {
+      "content": "<row_4_col_4>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49175": {
+      "content": "<row_4_col_5>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49176": {
+      "content": "<row_4_col_6>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49177": {
+      "content": "<row_5_col_1>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49178": {
+      "content": "<row_5_col_2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49179": {
+      "content": "<row_5_col_3>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49180": {
+      "content": "<row_5_col_4>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49181": {
+      "content": "<row_5_col_5>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49182": {
+      "content": "<row_5_col_6>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49183": {
+      "content": "<row_6_col_1>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49184": {
+      "content": "<row_6_col_2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49185": {
+      "content": "<row_6_col_3>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49186": {
+      "content": "<row_6_col_4>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49187": {
+      "content": "<row_6_col_5>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49188": {
+      "content": "<row_6_col_6>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49189": {
+      "content": "<fake_token_around_image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49190": {
+      "content": "<image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49191": {
+      "content": "<|reserved_special_token_0|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49192": {
+      "content": "<|reserved_special_token_1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49193": {
+      "content": "<|reserved_special_token_2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49194": {
+      "content": "<|reserved_special_token_3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49195": {
+      "content": "<|reserved_special_token_4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49196": {
+      "content": "<|reserved_special_token_5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49197": {
+      "content": "<|reserved_special_token_6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49198": {
+      "content": "<|reserved_special_token_7|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49199": {
+      "content": "<|reserved_special_token_8|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49200": {
+      "content": "<|reserved_special_token_9|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49201": {
+      "content": "<|reserved_special_token_10|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49202": {
+      "content": "<|reserved_special_token_11|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49203": {
+      "content": "<|reserved_special_token_12|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49204": {
+      "content": "<|reserved_special_token_13|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49205": {
+      "content": "<|reserved_special_token_14|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49206": {
+      "content": "<|reserved_special_token_15|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49207": {
+      "content": "<|reserved_special_token_16|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49208": {
+      "content": "<|reserved_special_token_17|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49209": {
+      "content": "<|reserved_special_token_18|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49210": {
+      "content": "<|reserved_special_token_19|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49211": {
+      "content": "<|reserved_special_token_20|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49212": {
+      "content": "<|reserved_special_token_21|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49213": {
+      "content": "<|reserved_special_token_22|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49214": {
+      "content": "<|reserved_special_token_23|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49215": {
+      "content": "<|reserved_special_token_24|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49216": {
+      "content": "<|reserved_special_token_25|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49217": {
+      "content": "<|reserved_special_token_26|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49218": {
+      "content": "<|reserved_special_token_27|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49219": {
+      "content": "<|reserved_special_token_28|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49220": {
+      "content": "<|reserved_special_token_29|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49221": {
+      "content": "<|reserved_special_token_30|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49222": {
+      "content": "<|reserved_special_token_31|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49223": {
+      "content": "<|reserved_special_token_32|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49224": {
+      "content": "<|reserved_special_token_33|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49225": {
+      "content": "<|reserved_special_token_34|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49226": {
+      "content": "<|reserved_special_token_35|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49227": {
+      "content": "<|reserved_special_token_36|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49228": {
+      "content": "<|reserved_special_token_37|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49229": {
+      "content": "<|reserved_special_token_38|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49230": {
+      "content": "<|reserved_special_token_39|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49231": {
+      "content": "<|reserved_special_token_40|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49232": {
+      "content": "<|reserved_special_token_41|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49233": {
+      "content": "<|reserved_special_token_42|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49234": {
+      "content": "<|reserved_special_token_43|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49235": {
+      "content": "<|reserved_special_token_44|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49236": {
+      "content": "<|reserved_special_token_45|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49237": {
+      "content": "<|reserved_special_token_46|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49238": {
+      "content": "<|reserved_special_token_47|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49239": {
+      "content": "<|reserved_special_token_48|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49240": {
+      "content": "<|reserved_special_token_49|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49241": {
+      "content": "<|reserved_special_token_50|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49242": {
+      "content": "<|reserved_special_token_51|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49243": {
+      "content": "<|reserved_special_token_52|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49244": {
+      "content": "<|reserved_special_token_53|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49245": {
+      "content": "<|reserved_special_token_54|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49246": {
+      "content": "<|reserved_special_token_55|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49247": {
+      "content": "<|reserved_special_token_56|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49248": {
+      "content": "<|reserved_special_token_57|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49249": {
+      "content": "<|reserved_special_token_58|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49250": {
+      "content": "<|reserved_special_token_59|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49251": {
+      "content": "<|reserved_special_token_60|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49252": {
+      "content": "<|reserved_special_token_61|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49253": {
+      "content": "<|reserved_special_token_62|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49254": {
+      "content": "<|reserved_special_token_63|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49255": {
+      "content": "<|reserved_special_token_64|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49256": {
+      "content": "<|reserved_special_token_65|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49257": {
+      "content": "<|reserved_special_token_66|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49258": {
+      "content": "<|reserved_special_token_67|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49259": {
+      "content": "<|reserved_special_token_68|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49260": {
+      "content": "<|reserved_special_token_69|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49261": {
+      "content": "<|reserved_special_token_70|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49262": {
+      "content": "<|reserved_special_token_71|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49263": {
+      "content": "<|reserved_special_token_72|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49264": {
+      "content": "<|reserved_special_token_73|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49265": {
+      "content": "<|reserved_special_token_74|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49266": {
+      "content": "<|reserved_special_token_75|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49267": {
+      "content": "<|reserved_special_token_76|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49268": {
+      "content": "<|reserved_special_token_77|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49269": {
+      "content": "<|reserved_special_token_78|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49270": {
+      "content": "<|reserved_special_token_79|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49271": {
+      "content": "<|reserved_special_token_80|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49272": {
+      "content": "<|reserved_special_token_81|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49273": {
+      "content": "<|reserved_special_token_82|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49274": {
+      "content": "<|reserved_special_token_83|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49275": {
+      "content": "<|reserved_special_token_84|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49276": {
+      "content": "<|reserved_special_token_85|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49277": {
+      "content": "<|reserved_special_token_86|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49278": {
+      "content": "<|reserved_special_token_87|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49279": {
+      "content": "<end_of_utterance>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<fake_token_around_image>",
+    "<image>",
+    "<end_of_utterance>"
+  ],
+  "bos_token": "<|im_start|>",
+  "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "end_of_utterance_token": "<end_of_utterance>",
+  "eos_token": "<end_of_utterance>",
+  "extra_special_tokens": {
+    "end_of_utterance_token": "<end_of_utterance>",
+    "fake_image_token": "<fake_token_around_image>",
+    "global_image_token": "<global-img>",
+    "image_token": "<image>"
+  },
+  "fake_image_token": "<fake_token_around_image>",
+  "global_image_token": "<global-img>",
+  "image_token": "<image>",
+  "legacy": false,
+  "model_max_length": 8192,
+  "pad_token": "<|im_end|>",
+  "processor_class": "SmolVLMProcessor",
+  "tokenizer_class": "GPT2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

smolvlm2_tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

vit_mdoel/vision_model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5b317aa656fc27e49745a23253ee9adcd14ca90e3a9145bdd4568a5a18b2f41
+size 387531753