tarekziade committed
Commit c5e7bcd · verified · 1 Parent(s): 1f906d3

Upload folder using huggingface_hub
.DS_Store ADDED
Binary file (6.15 kB)
.gitattributes CHANGED
@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  model.onnx.data filter=lfs diff=lfs merge=lfs -text
  projection/model.onnx.data filter=lfs diff=lfs merge=lfs -text
+ multi_modal_projector/model.onnx.data filter=lfs diff=lfs merge=lfs -text
+ vision_encoder/model.onnx.data filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,25 @@
- ---
- license: apache-2.0
- ---
+ # DistilViT2 for Transformers.js
+
+ This model is compatible with the Transformers.js image-to-text pipeline.
+
+ ## Usage
+
+ ```javascript
+ import { pipeline } from '@huggingface/transformers';
+
+ const captioner = await pipeline('image-to-text', 'tarekziade/distilvit2');
+ const result = await captioner('path/to/image.jpg');
+ console.log(result);
+ ```
+
+ ## Architecture
+
+ - Vision: SigLIP-base-patch16-224 (frozen during training)
+ - Projector: Trained linear/MLP projection (768 → 576)
+ - Text: SmolLM-135M with merged LoRA adapters
+
+ ## Training
+
+ - Datasets: Flickr30k, COCO
+ - Task: Image captioning
+ - Trainable parameters: 2.2M (1% of total)
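Note: the README's Architecture bullets map directly onto the files added in this commit — the vision encoder and projector are exported as separate ONNX graphs (vision_encoder/model.onnx, multi_modal_projector/model.onnx) next to the text model weights. Below is a minimal sketch of chaining the two ONNX stages by hand. It assumes onnxruntime-web and guesses the tensor names ("pixel_values", "image_features", "projected"), none of which are documented in this commit; inspect the exported graphs before relying on it.

```javascript
// Hedged sketch only: run the exported vision encoder and projector directly.
// Assumptions (not confirmed by this commit): onnxruntime-web resolves the
// external *.onnx.data files next to each graph, and the I/O tensor names
// used below match the exported models.
import * as ort from 'onnxruntime-web';

const vision = await ort.InferenceSession.create('vision_encoder/model.onnx');
const projector = await ort.InferenceSession.create('multi_modal_projector/model.onnx');

// SigLIP-base-patch16-224 expects a 1 x 3 x 224 x 224 float32 image tensor.
const pixelValues = new ort.Tensor(
  'float32',
  new Float32Array(1 * 3 * 224 * 224),
  [1, 3, 224, 224],
);

// Stage 1: image -> patch features (768-dim per patch for SigLIP-base).
const visionOut = await vision.run({ pixel_values: pixelValues });
const imageFeatures = visionOut.image_features;

// Stage 2: project the 768-dim vision features into the 576-dim SmolLM embedding space.
const projOut = await projector.run({ image_features: imageFeatures });
console.log(projOut.projected.dims); // expected: [1, num_patches, 576]
```

For normal use, the pipeline call in the Usage section handles preprocessing, the vision and projector stages, and text generation in one step; the manual path is mainly useful for debugging the projector in isolation.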
config.json CHANGED
@@ -1,174 +1,23 @@
  {
+ "model_type": "llava",
  "architectures": [
    "PrefixConditioningVLM"
  ],
- "freeze_vision_encoder": true,
- "lora_alpha": 16,
- "lora_dropout": 0.1,
- "lora_r": 16,
- "lora_target_modules": null,
- "max_length": 30,
- "mlp_hidden_dim": 2048,
- "model_type": "prefix_conditioning_vlm",
- "projection_dim": 512,
- "projection_type": "linear",
- "text_config": {
-   "_name_or_path": "HuggingFaceTB/SmolLM-135M",
-   "add_cross_attention": false,
-   "architectures": [
-     "LlamaForCausalLM"
-   ],
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "bad_words_ids": null,
-   "begin_suppress_tokens": null,
-   "bos_token_id": 0,
-   "chunk_size_feed_forward": 0,
-   "cross_attention_hidden_size": null,
-   "decoder_start_token_id": null,
-   "diversity_penalty": 0.0,
-   "do_sample": false,
-   "early_stopping": false,
-   "encoder_no_repeat_ngram_size": 0,
-   "eos_token_id": 0,
-   "exponential_decay_length_penalty": null,
-   "finetuning_task": null,
-   "forced_bos_token_id": null,
-   "forced_eos_token_id": null,
-   "head_dim": 64,
-   "hidden_act": "silu",
-   "hidden_size": 576,
-   "id2label": {
-     "0": "LABEL_0",
-     "1": "LABEL_1"
-   },
-   "initializer_range": 0.02,
-   "intermediate_size": 1536,
-   "is_decoder": false,
-   "is_encoder_decoder": false,
-   "label2id": {
-     "LABEL_0": 0,
-     "LABEL_1": 1
-   },
-   "length_penalty": 1.0,
-   "max_length": 20,
-   "max_position_embeddings": 2048,
-   "min_length": 0,
-   "mlp_bias": false,
-   "model_type": "llama",
-   "no_repeat_ngram_size": 0,
-   "num_attention_heads": 9,
-   "num_beam_groups": 1,
-   "num_beams": 1,
-   "num_hidden_layers": 30,
-   "num_key_value_heads": 3,
-   "num_return_sequences": 1,
-   "output_attentions": false,
-   "output_hidden_states": false,
-   "output_scores": false,
-   "pad_token_id": null,
-   "prefix": null,
-   "pretraining_tp": 1,
-   "problem_type": null,
-   "pruned_heads": {},
-   "remove_invalid_values": false,
-   "repetition_penalty": 1.0,
-   "return_dict": true,
-   "return_dict_in_generate": false,
-   "rms_norm_eps": 1e-05,
-   "rope_scaling": null,
-   "rope_theta": 10000.0,
-   "sep_token_id": null,
-   "suppress_tokens": null,
-   "task_specific_params": null,
-   "temperature": 1.0,
-   "tf_legacy_loss": false,
-   "tie_encoder_decoder": false,
-   "tie_word_embeddings": true,
-   "tokenizer_class": null,
-   "top_k": 50,
-   "top_p": 1.0,
-   "torch_dtype": "float32",
-   "torchscript": false,
-   "typical_p": 1.0,
-   "use_bfloat16": false,
-   "use_cache": true,
-   "vocab_size": 49152
- },
- "torch_dtype": "float32",
- "transformers_version": "4.53.3",
  "vision_config": {
-   "_name_or_path": "google/siglip-base-patch16-224",
-   "add_cross_attention": false,
-   "architectures": null,
-   "attention_dropout": 0.0,
-   "bad_words_ids": null,
-   "begin_suppress_tokens": null,
-   "bos_token_id": null,
-   "chunk_size_feed_forward": 0,
-   "cross_attention_hidden_size": null,
-   "decoder_start_token_id": null,
-   "diversity_penalty": 0.0,
-   "do_sample": false,
-   "early_stopping": false,
-   "encoder_no_repeat_ngram_size": 0,
-   "eos_token_id": null,
-   "exponential_decay_length_penalty": null,
-   "finetuning_task": null,
-   "forced_bos_token_id": null,
-   "forced_eos_token_id": null,
-   "hidden_act": "gelu_pytorch_tanh",
+   "model_type": "siglip_vision_model",
    "hidden_size": 768,
-   "id2label": {
-     "0": "LABEL_0",
-     "1": "LABEL_1"
-   },
    "image_size": 224,
-   "intermediate_size": 3072,
-   "is_decoder": false,
-   "is_encoder_decoder": false,
-   "label2id": {
-     "LABEL_0": 0,
-     "LABEL_1": 1
-   },
-   "layer_norm_eps": 1e-06,
-   "length_penalty": 1.0,
-   "max_length": 20,
-   "min_length": 0,
-   "model_type": "siglip_vision_model",
-   "no_repeat_ngram_size": 0,
-   "num_attention_heads": 12,
-   "num_beam_groups": 1,
-   "num_beams": 1,
-   "num_channels": 3,
-   "num_hidden_layers": 12,
-   "num_return_sequences": 1,
-   "output_attentions": false,
-   "output_hidden_states": false,
-   "output_scores": false,
-   "pad_token_id": null,
-   "patch_size": 16,
-   "prefix": null,
-   "problem_type": null,
-   "pruned_heads": {},
-   "remove_invalid_values": false,
-   "repetition_penalty": 1.0,
-   "return_dict": true,
-   "return_dict_in_generate": false,
-   "sep_token_id": null,
-   "suppress_tokens": null,
-   "task_specific_params": null,
-   "temperature": 1.0,
-   "tf_legacy_loss": false,
-   "tie_encoder_decoder": false,
-   "tie_word_embeddings": true,
-   "tokenizer_class": null,
-   "top_k": 50,
-   "top_p": 1.0,
-   "torch_dtype": "float32",
-   "torchscript": false,
-   "typical_p": 1.0,
-   "use_bfloat16": false
+   "patch_size": 16
+ },
+ "text_config": {
+   "model_type": "gpt2",
+   "vocab_size": 49152,
+   "n_positions": 2048,
+   "n_embd": 576,
+   "n_layer": 30,
+   "n_head": 9
  },
- "vision_unfreeze_layers": 0
- }
+ "projector_hidden_size": 512,
+ "vision_feature_select_strategy": "full",
+ "vision_feature_layer": -1
+ }
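The rewritten config.json keeps only runtime fields in a llava-style layout (nested vision_config and text_config), dropping the training-time LoRA and freezing options. A small sketch of the geometry these fields imply, using only values visible in the diff above (the fetch path is illustrative, not part of the commit):

```javascript
// Hedged sketch: derive a few values from the new llava-style config.json.
// The URL below is illustrative; the numbers match the diff above.
const config = await (await fetch('./config.json')).json();

const { image_size, patch_size, hidden_size } = config.vision_config;
const numPatches = (image_size / patch_size) ** 2; // (224 / 16)^2 = 196 patch tokens
console.log(`${numPatches} patches of ${hidden_size}-dim SigLIP features`);

// "full" keeps every patch token; layer -1 selects the last vision hidden state.
console.log(config.vision_feature_select_strategy, config.vision_feature_layer);

// The projector bridges the 768-dim vision features to the 576-dim text
// embedding width (text_config.n_embd); projector_hidden_size: 512 suggests an
// intermediate width, matching the README's "linear/MLP projection (768 → 576)".
console.log(hidden_size, config.projector_hidden_size, config.text_config.n_embd);
```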
multi_modal_projector/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:52cbc298ed3ff4a9fd20e63080f6e722a57136240508d0ac294005ee583ecfe4
+ size 580
multi_modal_projector/model.onnx.data ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:58cbfbfc1d88bca785733cdec608c08da434ad60f4c4bd2e2982f6c10dd54d8f
+ size 1835008
special_tokens_map.json CHANGED
@@ -32,13 +32,7 @@
    "rstrip": false,
    "single_word": false
  },
- "pad_token": {
-   "content": "<|endoftext|>",
-   "lstrip": false,
-   "normalized": false,
-   "rstrip": false,
-   "single_word": false
- },
+ "pad_token": "<|endoftext|>",
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
text_model/config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "eos_token_id": 0,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 576,
+   "initializer_range": 0.02,
+   "intermediate_size": 1536,
+   "max_position_embeddings": 2048,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 9,
+   "num_hidden_layers": 30,
+   "num_key_value_heads": 3,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.53.3",
+   "use_cache": true,
+   "vocab_size": 49152
+ }
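Two quick arithmetic checks a reader can make against text_model/config.json (values copied into the sketch; nothing is read from disk):

```javascript
// Values copied from text_model/config.json above.
const textCfg = {
  hidden_size: 576,
  head_dim: 64,
  num_attention_heads: 9,
  num_key_value_heads: 3,
};

// 9 attention heads x 64 dims per head = the 576-dim hidden size.
console.log(textCfg.num_attention_heads * textCfg.head_dim === textCfg.hidden_size); // true

// 9 query heads over 3 KV heads: grouped-query attention, 3 queries per KV head.
console.log(textCfg.num_attention_heads / textCfg.num_key_value_heads); // 3
```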
text_model/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 0,
+   "eos_token_id": 0,
+   "transformers_version": "4.53.3"
+ }
text_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8ce16e82eb17e4d75ac020086b09dbd796140317c83d639606feb3b9b4dbc307
+ size 545494928
vision_encoder/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4dea7354fafd5a7dd96e08b1a3fd632e09185ff100a5d914974be30eb0877133
+ size 111186
vision_encoder/model.onnx.data ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4fbafa23edb2db76ee79def6879ea1717481380341b526890c8188363a298f84
+ size 371589120