mediaai1 committed on
Commit
76c4d64
·
verified ·
1 Parent(s): 9177488

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ evaluation/level1_top_confusions.png filter=lfs diff=lfs merge=lfs -text
38
+ evaluation/level2_top_confusions.png filter=lfs diff=lfs merge=lfs -text
39
+ evaluation/confusion_flows.png filter=lfs diff=lfs merge=lfs -text
40
+ evaluation/precision_recall_scatter.png filter=lfs diff=lfs merge=lfs -text
41
+ evaluation/precision_recall_zoom.png filter=lfs diff=lfs merge=lfs -text
42
+ evaluation/relative_performance_ranking.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - vi
4
+ tags:
5
+ - intent-classification
6
+ - vietnamese
7
+ ---
8
+
9
+ # gemma-3-4b-intent-1128-v3-lr1e05-bs2
10
+
11
+ Gemma 3 4B fine-tuned for two-level (Level 1 / Level 2) Vietnamese intent classification.
12
+
13
+ ## Evaluation Results
14
+
15
+ - **Dataset**: `allganize/viettel-intent-augmented-total-1128` (validation)
16
+ - **Samples**: 2,467
17
+ - **Evaluated**: 2025-11-28 08:34:43
18
+
19
+ ### 📊 Overall Metrics
20
+
21
+ | Metric | Score |
22
+ |--------|-------|
23
+ | **Format Accuracy** | 100.00% |
24
+ | **Level 1 Accuracy** | 99.27% |
25
+ | **Level 1 F1 (macro)** | 0.9884 |
26
+ | **Level 2 Accuracy** | 97.97% |
27
+ | **Level 2 F1 (macro)** | 0.9751 |
28
+ | **Combined Accuracy** | 97.93% |
29
+ | **Combined F1 (macro)** | 0.9742 |
30
+
31
+ ![Overall Metrics](evaluation/overall_metrics.png)
32
+
33
+ ### 🎯 Hierarchical Breakdown
34
+
35
+ | Category | Rate | Count |
36
+ |----------|------|-------|
37
+ | Both Correct | 97.93% | 2416 |
38
+ | Level 1 Only | 1.34% | 33 |
+ | Level 2 Only | 0.04% | 1 |
39
+ | Both Wrong | 0.69% | 17 |
40
+
41
+ ### 🔍 Detailed Analysis
42
+
43
+ **Confusion Patterns:**
44
+ ![Confusion Flows](evaluation/confusion_flows.png)
45
+
46
+ **Precision-Recall Analysis:**
47
+ ![Precision-Recall Scatter](evaluation/precision_recall_scatter.png)
48
+
49
+ **Intent Performance Distribution:**
50
+ ![Intent Issues](evaluation/intent_issue_distribution.png)
51
+
52
+ **Top Confused Intents:**
53
+ - Level 1: [View Heatmap](evaluation/level1_top_confusions.png)
54
+ - Level 2: [View Heatmap](evaluation/level2_top_confusions.png)
55
+
56
+
57
+ ### 🎯 High-Performance Analysis
58
+
59
+ **Zoomed Precision-Recall (0.85-1.0):**
60
+ ![Precision-Recall Zoom](evaluation/precision_recall_zoom.png)
61
+
62
+ **Relative Performance Ranking (Bottom 20):**
63
+ ![Relative Ranking](evaluation/relative_performance_ranking.png)
64
+
65
+
66
+
67
+ ### 🎯 Intents for Improvement
68
+
69
+ **Level 1:**
70
+
71
+ - `Chat` (P: 97.44%, R: 92.68%, F1: 95.00%)
72
+ - `FindMyPhone` (P: 100.00%, R: 92.86%, F1: 96.30%)
73
+ - `News` (P: 97.62%, R: 97.62%, F1: 97.62%)
74
+ - `UserQuery` (P: 95.92%, R: 100.00%, F1: 97.92%)
75
+
76
+ **Level 2:**
77
+
78
+ - `DeactivateSetting` (P: 93.33%, R: 82.35%, F1: 87.50%)
79
+ - `NavigateMove` (P: 92.86%, R: 86.67%, F1: 89.66%)
80
+ - `SuggestContent` (P: 81.82%, R: 100.00%, F1: 90.00%)
81
+ - `ChannelDown` (P: 93.33%, R: 87.50%, F1: 90.32%)
82
+ - `StopRinging` (P: 83.33%, R: 100.00%, F1: 90.91%)
83
+
84
+
85
+ ---
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
chat_template.jinja ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ {{ bos_token }}
3
+
4
+ {%- if messages[0]['role'] == 'system' -%}
5
+ {%- if messages[0]['content'] is string -%}
6
+ {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}
9
+ {%- endif -%}
10
+ {%- set loop_messages = messages[1:] -%}
11
+ {%- else -%}
12
+ {%- set first_user_prefix = "" -%}
13
+ {%- set loop_messages = messages -%}
14
+ {%- endif -%}
15
+
16
+ {%- for message in loop_messages -%}
17
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
18
+ {{- raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") -}}
19
+ {%- endif -%}
20
+
21
+ {%- set role = "model" if message['role'] == 'assistant' else message['role'] -%}
22
+
23
+ {{- '<start_of_turn>' ~ role ~ '\n' ~ (first_user_prefix if loop.first else "") -}}
24
+
25
+ {%- if message['role'] == 'assistant' -%}
26
+ {% generation %}
27
+ {%- if message['content'] is string -%}
28
+ {{- message['content'] -}}
29
+ {%- elif message['content'] is iterable -%}
30
+ {%- for item in message['content'] -%}
31
+ {%- if item['type'] == 'image' -%}
32
+ {{- '<start_of_image>' -}}
33
+ {%- elif item['type'] == 'text' -%}
34
+ {{- item['text'] -}}
35
+ {%- endif -%}
36
+ {%- endfor -%}
37
+ {%- else -%}
38
+ {{- raise_exception("Invalid content type") -}}
39
+ {%- endif -%}
40
+ {{- '<end_of_turn>\n' -}}
41
+ {% endgeneration %}
42
+ {%- else -%}
43
+ {%- if message['content'] is string -%}
44
+ {{- message['content'] -}}
45
+ {%- elif message['content'] is iterable -%}
46
+ {%- for item in message['content'] -%}
47
+ {%- if item['type'] == 'image' -%}
48
+ {{- '<start_of_image>' -}}
49
+ {%- elif item['type'] == 'text' -%}
50
+ {{- item['text'] -}}
51
+ {%- endif -%}
52
+ {%- endfor -%}
53
+ {%- else -%}
54
+ {{- raise_exception("Invalid content type") -}}
55
+ {%- endif -%}
56
+ {{- '<end_of_turn>\n' -}}
57
+ {%- endif -%}
58
+
59
+ {%- endfor -%}
60
+
61
+ {%- if add_generation_prompt -%}
62
+ {{- '<start_of_turn>model\n' -}}
63
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Gemma3ForConditionalGeneration"
4
+ ],
5
+ "boi_token_index": 255999,
6
+ "bos_token_id": 2,
7
+ "dtype": "bfloat16",
8
+ "eoi_token_index": 256000,
9
+ "eos_token_id": 106,
10
+ "image_token_index": 262144,
11
+ "initializer_range": 0.02,
12
+ "mm_tokens_per_image": 256,
13
+ "model_type": "gemma3",
14
+ "pad_token_id": 106,
15
+ "text_config": {
16
+ "_sliding_window_pattern": 6,
17
+ "attention_bias": false,
18
+ "attention_dropout": 0.0,
19
+ "attn_logit_softcapping": null,
20
+ "dtype": "bfloat16",
21
+ "final_logit_softcapping": null,
22
+ "head_dim": 256,
23
+ "hidden_activation": "gelu_pytorch_tanh",
24
+ "hidden_size": 2560,
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 10240,
27
+ "layer_types": [
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "full_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "sliding_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "full_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "sliding_attention",
45
+ "full_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "sliding_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "full_attention",
52
+ "sliding_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "sliding_attention",
57
+ "full_attention",
58
+ "sliding_attention",
59
+ "sliding_attention",
60
+ "sliding_attention",
61
+ "sliding_attention"
62
+ ],
63
+ "max_position_embeddings": 131072,
64
+ "model_type": "gemma3_text",
65
+ "num_attention_heads": 8,
66
+ "num_hidden_layers": 34,
67
+ "num_key_value_heads": 4,
68
+ "query_pre_attn_scalar": 256,
69
+ "rms_norm_eps": 1e-06,
70
+ "rope_local_base_freq": 10000.0,
71
+ "rope_scaling": {
72
+ "factor": 8.0,
73
+ "rope_type": "linear"
74
+ },
75
+ "rope_theta": 1000000.0,
76
+ "sliding_window": 1024,
77
+ "use_bidirectional_attention": false,
78
+ "use_cache": true,
79
+ "vocab_size": 262208
80
+ },
81
+ "transformers_version": "4.57.3",
82
+ "vision_config": {
83
+ "attention_dropout": 0.0,
84
+ "dtype": "bfloat16",
85
+ "hidden_act": "gelu_pytorch_tanh",
86
+ "hidden_size": 1152,
87
+ "image_size": 896,
88
+ "intermediate_size": 4304,
89
+ "layer_norm_eps": 1e-06,
90
+ "model_type": "siglip_vision_model",
91
+ "num_attention_heads": 16,
92
+ "num_channels": 3,
93
+ "num_hidden_layers": 27,
94
+ "patch_size": 14,
95
+ "vision_use_head": false
96
+ }
97
+ }
evaluation/confusion_flows.png ADDED

Git LFS Details

  • SHA256: b31b5a4df02d1d80e1f2ba21a9948c6b4821e8ad6abee259d0b715e2efcbe8a9
  • Pointer size: 131 Bytes
  • Size of remote file: 146 kB
evaluation/intent_issue_distribution.png ADDED
evaluation/level1_top_confusions.png ADDED

Git LFS Details

  • SHA256: ff21e5582412d2541819f765fb273f3f5961a5683ac50e083f40169c2f83e7f2
  • Pointer size: 131 Bytes
  • Size of remote file: 221 kB
evaluation/level2_top_confusions.png ADDED

Git LFS Details

  • SHA256: 7f98589b9dc8f33e4ec2e23ec11a153b298c288196f535f0624a6d8386f9c82c
  • Pointer size: 131 Bytes
  • Size of remote file: 284 kB
evaluation/overall_metrics.png ADDED
evaluation/precision_recall_scatter.png ADDED

Git LFS Details

  • SHA256: 3801760d114597b4b0985b6660829abb884371e82367569b5c154fafcafe1b5a
  • Pointer size: 131 Bytes
  • Size of remote file: 100 kB
evaluation/precision_recall_zoom.png ADDED

Git LFS Details

  • SHA256: 6959f52862b5c618eed5cc472b79841d2f2a66c2af60bcaf9ac3c464cb172175
  • Pointer size: 131 Bytes
  • Size of remote file: 176 kB
evaluation/relative_performance_ranking.png ADDED

Git LFS Details

  • SHA256: 88c8d89df9538b6ee5074c275f6fd463f0a6f16149173d4ef06a14ff70b1642c
  • Pointer size: 131 Bytes
  • Size of remote file: 236 kB
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "cache_implementation": "hybrid",
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 106,
7
+ 1,
8
+ 106
9
+ ],
10
+ "pad_token_id": 106,
11
+ "top_k": 64,
12
+ "top_p": 0.95,
13
+ "transformers_version": "4.57.3"
14
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fe795cccd05897f231ba58b0e61cdd2b1d192c5d9f957ba94431e1fa4447128
3
+ size 4961251752
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e88463423f1ca34bc30560344d6b10a1e0cfb0a4867bdf6c4cd30edeba48f0fe
3
+ size 3639026128
model.safetensors.index.json ADDED
@@ -0,0 +1,891 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 4300079472,
4
+ "total_size": 8600158944
5
+ },
6
+ "weight_map": {
7
+ "language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
+ "language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
10
+ "language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
11
+ "language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
12
+ "language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
13
+ "language_model.model.layers.0.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
14
+ "language_model.model.layers.0.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
15
+ "language_model.model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
16
+ "language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
17
+ "language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
18
+ "language_model.model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
19
+ "language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
20
+ "language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
21
+ "language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
22
+ "language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
23
+ "language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
24
+ "language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
25
+ "language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
26
+ "language_model.model.layers.1.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
27
+ "language_model.model.layers.1.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
28
+ "language_model.model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
29
+ "language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
30
+ "language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
31
+ "language_model.model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
32
+ "language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
33
+ "language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
34
+ "language_model.model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
35
+ "language_model.model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
36
+ "language_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
37
+ "language_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
38
+ "language_model.model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
39
+ "language_model.model.layers.10.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
40
+ "language_model.model.layers.10.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
41
+ "language_model.model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
42
+ "language_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
43
+ "language_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
44
+ "language_model.model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
45
+ "language_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
46
+ "language_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
47
+ "language_model.model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
48
+ "language_model.model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
49
+ "language_model.model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
50
+ "language_model.model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
51
+ "language_model.model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
52
+ "language_model.model.layers.11.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
53
+ "language_model.model.layers.11.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
54
+ "language_model.model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
55
+ "language_model.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
56
+ "language_model.model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
57
+ "language_model.model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
58
+ "language_model.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
59
+ "language_model.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
60
+ "language_model.model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
61
+ "language_model.model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
62
+ "language_model.model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
63
+ "language_model.model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
64
+ "language_model.model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
65
+ "language_model.model.layers.12.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
66
+ "language_model.model.layers.12.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
67
+ "language_model.model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
68
+ "language_model.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
69
+ "language_model.model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
70
+ "language_model.model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
71
+ "language_model.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
72
+ "language_model.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
73
+ "language_model.model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
74
+ "language_model.model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
75
+ "language_model.model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
76
+ "language_model.model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
77
+ "language_model.model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
78
+ "language_model.model.layers.13.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
79
+ "language_model.model.layers.13.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
80
+ "language_model.model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
81
+ "language_model.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
82
+ "language_model.model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
83
+ "language_model.model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
84
+ "language_model.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
85
+ "language_model.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
86
+ "language_model.model.layers.14.input_layernorm.weight": "model-00002-of-00002.safetensors",
87
+ "language_model.model.layers.14.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
88
+ "language_model.model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
89
+ "language_model.model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
90
+ "language_model.model.layers.14.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
91
+ "language_model.model.layers.14.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
92
+ "language_model.model.layers.14.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
93
+ "language_model.model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
94
+ "language_model.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
95
+ "language_model.model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
96
+ "language_model.model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
97
+ "language_model.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
98
+ "language_model.model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
99
+ "language_model.model.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors",
100
+ "language_model.model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
101
+ "language_model.model.layers.15.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
102
+ "language_model.model.layers.15.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
103
+ "language_model.model.layers.15.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
104
+ "language_model.model.layers.15.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
105
+ "language_model.model.layers.15.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
106
+ "language_model.model.layers.15.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
107
+ "language_model.model.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
108
+ "language_model.model.layers.15.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
109
+ "language_model.model.layers.15.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
110
+ "language_model.model.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
111
+ "language_model.model.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
112
+ "language_model.model.layers.16.input_layernorm.weight": "model-00002-of-00002.safetensors",
113
+ "language_model.model.layers.16.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
114
+ "language_model.model.layers.16.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
115
+ "language_model.model.layers.16.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
116
+ "language_model.model.layers.16.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
117
+ "language_model.model.layers.16.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
118
+ "language_model.model.layers.16.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
119
+ "language_model.model.layers.16.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
120
+ "language_model.model.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
121
+ "language_model.model.layers.16.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
122
+ "language_model.model.layers.16.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
123
+ "language_model.model.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
124
+ "language_model.model.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
125
+ "language_model.model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
126
+ "language_model.model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
127
+ "language_model.model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
128
+ "language_model.model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
129
+ "language_model.model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
130
+ "language_model.model.layers.17.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
131
+ "language_model.model.layers.17.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
132
+ "language_model.model.layers.17.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
133
+ "language_model.model.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
134
+ "language_model.model.layers.17.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
135
+ "language_model.model.layers.17.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
136
+ "language_model.model.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
137
+ "language_model.model.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
138
+ "language_model.model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
139
+ "language_model.model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
140
+ "language_model.model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
141
+ "language_model.model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
142
+ "language_model.model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
143
+ "language_model.model.layers.18.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
144
+ "language_model.model.layers.18.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
145
+ "language_model.model.layers.18.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
146
+ "language_model.model.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
147
+ "language_model.model.layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
148
+ "language_model.model.layers.18.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
149
+ "language_model.model.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
150
+ "language_model.model.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
151
+ "language_model.model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
152
+ "language_model.model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
153
+ "language_model.model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
154
+ "language_model.model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
155
+ "language_model.model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
156
+ "language_model.model.layers.19.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
157
+ "language_model.model.layers.19.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
158
+ "language_model.model.layers.19.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
159
+ "language_model.model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
160
+ "language_model.model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
161
+ "language_model.model.layers.19.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
162
+ "language_model.model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
163
+ "language_model.model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
164
+ "language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
165
+ "language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
166
+ "language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
167
+ "language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
168
+ "language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
169
+ "language_model.model.layers.2.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
170
+ "language_model.model.layers.2.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
171
+ "language_model.model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
172
+ "language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
173
+ "language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
174
+ "language_model.model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
175
+ "language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
176
+ "language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
177
+ "language_model.model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
178
+ "language_model.model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
179
+ "language_model.model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
180
+ "language_model.model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
181
+ "language_model.model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
182
+ "language_model.model.layers.20.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
183
+ "language_model.model.layers.20.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
184
+ "language_model.model.layers.20.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
185
+ "language_model.model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
186
+ "language_model.model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
187
+ "language_model.model.layers.20.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
188
+ "language_model.model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
189
+ "language_model.model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
190
+ "language_model.model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
191
+ "language_model.model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
192
+ "language_model.model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
193
+ "language_model.model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
194
+ "language_model.model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
195
+ "language_model.model.layers.21.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
196
+ "language_model.model.layers.21.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
197
+ "language_model.model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
198
+ "language_model.model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
199
+ "language_model.model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
200
+ "language_model.model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
201
+ "language_model.model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
202
+ "language_model.model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
203
+ "language_model.model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
204
+ "language_model.model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
205
+ "language_model.model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
206
+ "language_model.model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
207
+ "language_model.model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
208
+ "language_model.model.layers.22.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
209
+ "language_model.model.layers.22.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
210
+ "language_model.model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
211
+ "language_model.model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
212
+ "language_model.model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
213
+ "language_model.model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
214
+ "language_model.model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
215
+ "language_model.model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
216
+ "language_model.model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
217
+ "language_model.model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
218
+ "language_model.model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
219
+ "language_model.model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
220
+ "language_model.model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
221
+ "language_model.model.layers.23.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
222
+ "language_model.model.layers.23.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
223
+ "language_model.model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
224
+ "language_model.model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
225
+ "language_model.model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
226
+ "language_model.model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
227
+ "language_model.model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
228
+ "language_model.model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
229
+ "language_model.model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
230
+ "language_model.model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
231
+ "language_model.model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
232
+ "language_model.model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
233
+ "language_model.model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
234
+ "language_model.model.layers.24.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
235
+ "language_model.model.layers.24.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
236
+ "language_model.model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
237
+ "language_model.model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
238
+ "language_model.model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
239
+ "language_model.model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
240
+ "language_model.model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
241
+ "language_model.model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
242
+ "language_model.model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
243
+ "language_model.model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
244
+ "language_model.model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
245
+ "language_model.model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
246
+ "language_model.model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
247
+ "language_model.model.layers.25.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
248
+ "language_model.model.layers.25.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
249
+ "language_model.model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
250
+ "language_model.model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
251
+ "language_model.model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
252
+ "language_model.model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
253
+ "language_model.model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
254
+ "language_model.model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
255
+ "language_model.model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
256
+ "language_model.model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
257
+ "language_model.model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
258
+ "language_model.model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
259
+ "language_model.model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
260
+ "language_model.model.layers.26.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
261
+ "language_model.model.layers.26.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
262
+ "language_model.model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
263
+ "language_model.model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
264
+ "language_model.model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
265
+ "language_model.model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
266
+ "language_model.model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
267
+ "language_model.model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
268
+ "language_model.model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
269
+ "language_model.model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
270
+ "language_model.model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
271
+ "language_model.model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
272
+ "language_model.model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
273
+ "language_model.model.layers.27.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
274
+ "language_model.model.layers.27.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
275
+ "language_model.model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
276
+ "language_model.model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
277
+ "language_model.model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
278
+ "language_model.model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
279
+ "language_model.model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
280
+ "language_model.model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
281
+ "language_model.model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
282
+ "language_model.model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
283
+ "language_model.model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
284
+ "language_model.model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
285
+ "language_model.model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
286
+ "language_model.model.layers.28.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
287
+ "language_model.model.layers.28.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
288
+ "language_model.model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
289
+ "language_model.model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
290
+ "language_model.model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
291
+ "language_model.model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
292
+ "language_model.model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
293
+ "language_model.model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
294
+ "language_model.model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
295
+ "language_model.model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
296
+ "language_model.model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
297
+ "language_model.model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
298
+ "language_model.model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
299
+ "language_model.model.layers.29.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
300
+ "language_model.model.layers.29.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
301
+ "language_model.model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
302
+ "language_model.model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
303
+ "language_model.model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
304
+ "language_model.model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
305
+ "language_model.model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
306
+ "language_model.model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
307
+ "language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
308
+ "language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
309
+ "language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
310
+ "language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
311
+ "language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
312
+ "language_model.model.layers.3.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
313
+ "language_model.model.layers.3.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
314
+ "language_model.model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
315
+ "language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
316
+ "language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
317
+ "language_model.model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
318
+ "language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
319
+ "language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
320
+ "language_model.model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
321
+ "language_model.model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
322
+ "language_model.model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
323
+ "language_model.model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
324
+ "language_model.model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
325
+ "language_model.model.layers.30.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
326
+ "language_model.model.layers.30.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
327
+ "language_model.model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
328
+ "language_model.model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
329
+ "language_model.model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
330
+ "language_model.model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
331
+ "language_model.model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
332
+ "language_model.model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
333
+ "language_model.model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
334
+ "language_model.model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
335
+ "language_model.model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
336
+ "language_model.model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
337
+ "language_model.model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
338
+ "language_model.model.layers.31.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
339
+ "language_model.model.layers.31.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
340
+ "language_model.model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
341
+ "language_model.model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
342
+ "language_model.model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
343
+ "language_model.model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
344
+ "language_model.model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
345
+ "language_model.model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
346
+ "language_model.model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
347
+ "language_model.model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
348
+ "language_model.model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
349
+ "language_model.model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
350
+ "language_model.model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
351
+ "language_model.model.layers.32.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
352
+ "language_model.model.layers.32.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
353
+ "language_model.model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
354
+ "language_model.model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
355
+ "language_model.model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
356
+ "language_model.model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
357
+ "language_model.model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
358
+ "language_model.model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
359
+ "language_model.model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
360
+ "language_model.model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
361
+ "language_model.model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
362
+ "language_model.model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
363
+ "language_model.model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
364
+ "language_model.model.layers.33.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
365
+ "language_model.model.layers.33.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
366
+ "language_model.model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
367
+ "language_model.model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
368
+ "language_model.model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
369
+ "language_model.model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
370
+ "language_model.model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
371
+ "language_model.model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
372
+ "language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
373
+ "language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
374
+ "language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
375
+ "language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
376
+ "language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
377
+ "language_model.model.layers.4.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
378
+ "language_model.model.layers.4.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
379
+ "language_model.model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
380
+ "language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
381
+ "language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
382
+ "language_model.model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
383
+ "language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
384
+ "language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
385
+ "language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
386
+ "language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
387
+ "language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
388
+ "language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
389
+ "language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
390
+ "language_model.model.layers.5.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
391
+ "language_model.model.layers.5.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
392
+ "language_model.model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
393
+ "language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
394
+ "language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
395
+ "language_model.model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
396
+ "language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
397
+ "language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
398
+ "language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
399
+ "language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
400
+ "language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
401
+ "language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
402
+ "language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
403
+ "language_model.model.layers.6.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
404
+ "language_model.model.layers.6.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
405
+ "language_model.model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
406
+ "language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
407
+ "language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
408
+ "language_model.model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
409
+ "language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
410
+ "language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
411
+ "language_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
412
+ "language_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
413
+ "language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
414
+ "language_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
415
+ "language_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
416
+ "language_model.model.layers.7.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
417
+ "language_model.model.layers.7.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
418
+ "language_model.model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
419
+ "language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
420
+ "language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
421
+ "language_model.model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
422
+ "language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
423
+ "language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
424
+ "language_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
425
+ "language_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
426
+ "language_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
427
+ "language_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
428
+ "language_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
429
+ "language_model.model.layers.8.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
430
+ "language_model.model.layers.8.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
431
+ "language_model.model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
432
+ "language_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
433
+ "language_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
434
+ "language_model.model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
435
+ "language_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
436
+ "language_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
437
+ "language_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
438
+ "language_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
439
+ "language_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
440
+ "language_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
441
+ "language_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
442
+ "language_model.model.layers.9.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
443
+ "language_model.model.layers.9.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
444
+ "language_model.model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
445
+ "language_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
446
+ "language_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
447
+ "language_model.model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
448
+ "language_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
449
+ "language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
450
+ "language_model.model.norm.weight": "model-00002-of-00002.safetensors",
451
+ "multi_modal_projector.mm_input_projection_weight": "model-00001-of-00002.safetensors",
452
+ "multi_modal_projector.mm_soft_emb_norm.weight": "model-00001-of-00002.safetensors",
453
+ "vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors",
454
+ "vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors",
455
+ "vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors",
456
+ "vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors",
457
+ "vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors",
458
+ "vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors",
459
+ "vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors",
460
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors",
461
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
462
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
463
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
464
+ "vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
465
+ "vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
466
+ "vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
467
+ "vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
468
+ "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
469
+ "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
470
+ "vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
471
+ "vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
472
+ "vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors",
473
+ "vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors",
474
+ "vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors",
475
+ "vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors",
476
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors",
477
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
478
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
479
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
480
+ "vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
481
+ "vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
482
+ "vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
483
+ "vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
484
+ "vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
485
+ "vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
486
+ "vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
487
+ "vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
488
+ "vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors",
489
+ "vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors",
490
+ "vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors",
491
+ "vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors",
492
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors",
493
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
494
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
495
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
496
+ "vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
497
+ "vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
498
+ "vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
499
+ "vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
500
+ "vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
501
+ "vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
502
+ "vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
503
+ "vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
504
+ "vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors",
505
+ "vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors",
506
+ "vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors",
507
+ "vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors",
508
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors",
509
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
510
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
511
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
512
+ "vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
513
+ "vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
514
+ "vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
515
+ "vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
516
+ "vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
517
+ "vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
518
+ "vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
519
+ "vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
520
+ "vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors",
521
+ "vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors",
522
+ "vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors",
523
+ "vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors",
524
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors",
525
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
526
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
527
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
528
+ "vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
529
+ "vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
530
+ "vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
531
+ "vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
532
+ "vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
533
+ "vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
534
+ "vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
535
+ "vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
536
+ "vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors",
537
+ "vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors",
538
+ "vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors",
539
+ "vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors",
540
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors",
541
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
542
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
543
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
544
+ "vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
545
+ "vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
546
+ "vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
547
+ "vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
548
+ "vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
549
+ "vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
550
+ "vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
551
+ "vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
552
+ "vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors",
553
+ "vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors",
554
+ "vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors",
555
+ "vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors",
556
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors",
557
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
558
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
559
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
560
+ "vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
561
+ "vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
562
+ "vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
563
+ "vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
564
+ "vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
565
+ "vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
566
+ "vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
567
+ "vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
568
+ "vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors",
569
+ "vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors",
570
+ "vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors",
571
+ "vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors",
572
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors",
573
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
574
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
575
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
576
+ "vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
577
+ "vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
578
+ "vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
579
+ "vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
580
+ "vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
581
+ "vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
582
+ "vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
583
+ "vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
584
+ "vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors",
585
+ "vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors",
586
+ "vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors",
587
+ "vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors",
588
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors",
589
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
590
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
591
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
592
+ "vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
593
+ "vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
594
+ "vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
595
+ "vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
596
+ "vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
597
+ "vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
598
+ "vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
599
+ "vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
600
+ "vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors",
601
+ "vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors",
602
+ "vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors",
603
+ "vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors",
604
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors",
605
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
606
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
607
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
608
+ "vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
609
+ "vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
610
+ "vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
611
+ "vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
612
+ "vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
613
+ "vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
614
+ "vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
615
+ "vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
616
+ "vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors",
617
+ "vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors",
618
+ "vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors",
619
+ "vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors",
620
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors",
621
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
622
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
623
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
624
+ "vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
625
+ "vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
626
+ "vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
627
+ "vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
628
+ "vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
629
+ "vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
630
+ "vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
631
+ "vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
632
+ "vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors",
633
+ "vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors",
634
+ "vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors",
635
+ "vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors",
636
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors",
637
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
638
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
639
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
640
+ "vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
641
+ "vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
642
+ "vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
643
+ "vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
644
+ "vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
645
+ "vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
646
+ "vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
647
+ "vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
648
+ "vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors",
649
+ "vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors",
650
+ "vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors",
651
+ "vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors",
652
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors",
653
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
654
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
655
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
656
+ "vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
657
+ "vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
658
+ "vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
659
+ "vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
660
+ "vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
661
+ "vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
662
+ "vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
663
+ "vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
664
+ "vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors",
665
+ "vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors",
666
+ "vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors",
667
+ "vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors",
668
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors",
669
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
670
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
671
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
672
+ "vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
673
+ "vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
674
+ "vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
675
+ "vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
676
+ "vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
677
+ "vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
678
+ "vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
679
+ "vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
680
+ "vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors",
681
+ "vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors",
682
+ "vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors",
683
+ "vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors",
684
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors",
685
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
686
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
687
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
688
+ "vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
689
+ "vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
690
+ "vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
691
+ "vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
692
+ "vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
693
+ "vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
694
+ "vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
695
+ "vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
696
+ "vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors",
697
+ "vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors",
698
+ "vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors",
699
+ "vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors",
700
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors",
701
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
702
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors",
703
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
704
+ "vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
705
+ "vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
706
+ "vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
707
+ "vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
708
+ "vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
709
+ "vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
710
+ "vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
711
+ "vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
712
+ "vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors",
713
+ "vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors",
714
+ "vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors",
715
+ "vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors",
716
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors",
717
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
718
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors",
719
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
720
+ "vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
721
+ "vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
722
+ "vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
723
+ "vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
724
+ "vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
725
+ "vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
726
+ "vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
727
+ "vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
728
+ "vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00002.safetensors",
729
+ "vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors",
730
+ "vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors",
731
+ "vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors",
732
+ "vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors",
733
+ "vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors",
734
+ "vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors",
735
+ "vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors",
736
+ "vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
737
+ "vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
738
+ "vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
739
+ "vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
740
+ "vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
741
+ "vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
742
+ "vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
743
+ "vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
744
+ "vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors",
745
+ "vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors",
746
+ "vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors",
747
+ "vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors",
748
+ "vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors",
749
+ "vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors",
750
+ "vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors",
751
+ "vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors",
752
+ "vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
753
+ "vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
754
+ "vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
755
+ "vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
756
+ "vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
757
+ "vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
758
+ "vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
759
+ "vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
760
+ "vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors",
761
+ "vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors",
762
+ "vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors",
763
+ "vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00002.safetensors",
764
+ "vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors",
765
+ "vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors",
766
+ "vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors",
767
+ "vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors",
768
+ "vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
769
+ "vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
770
+ "vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
771
+ "vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
772
+ "vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
773
+ "vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
774
+ "vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
775
+ "vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
776
+ "vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors",
777
+ "vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors",
778
+ "vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors",
779
+ "vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors",
780
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors",
781
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
782
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
783
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
784
+ "vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
785
+ "vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
786
+ "vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
787
+ "vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
788
+ "vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
789
+ "vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
790
+ "vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
791
+ "vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
792
+ "vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors",
793
+ "vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors",
794
+ "vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors",
795
+ "vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors",
796
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors",
797
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
798
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
799
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
800
+ "vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
801
+ "vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
802
+ "vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
803
+ "vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
804
+ "vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
805
+ "vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
806
+ "vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
807
+ "vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
808
+ "vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors",
809
+ "vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors",
810
+ "vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors",
811
+ "vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors",
812
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors",
813
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
814
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
815
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
816
+ "vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
817
+ "vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
818
+ "vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
819
+ "vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
820
+ "vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
821
+ "vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
822
+ "vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
823
+ "vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
824
+ "vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors",
825
+ "vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors",
826
+ "vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors",
827
+ "vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors",
828
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors",
829
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
830
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
831
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
832
+ "vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
833
+ "vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
834
+ "vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
835
+ "vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
836
+ "vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
837
+ "vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
838
+ "vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
839
+ "vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
840
+ "vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors",
841
+ "vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors",
842
+ "vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors",
843
+ "vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors",
844
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors",
845
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
846
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
847
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
848
+ "vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
849
+ "vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
850
+ "vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
851
+ "vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
852
+ "vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
853
+ "vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
854
+ "vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
855
+ "vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
856
+ "vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors",
857
+ "vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors",
858
+ "vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors",
859
+ "vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors",
860
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors",
861
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
862
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
863
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
864
+ "vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
865
+ "vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
866
+ "vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
867
+ "vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
868
+ "vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
869
+ "vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
870
+ "vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
871
+ "vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
872
+ "vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors",
873
+ "vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors",
874
+ "vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors",
875
+ "vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors",
876
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors",
877
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
878
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors",
879
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
880
+ "vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
881
+ "vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
882
+ "vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
883
+ "vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
884
+ "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
885
+ "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
886
+ "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
887
+ "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
888
+ "vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors",
889
+ "vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors"
890
+ }
891
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": "<end_of_turn>",
12
+ "image_token": "<image_soft_token>",
13
+ "pad_token": "<end_of_turn>",
14
+ "unk_token": {
15
+ "content": "<unk>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ }
21
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
@@ -0,0 +1,2695 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 662,
3
+ "best_metric": 0.005529946647584438,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 662,
7
+ "global_step": 1323,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 0.3124125599861145,
14
+ "epoch": 0.0007558578987150416,
15
+ "grad_norm": 420.0,
16
+ "learning_rate": 0.0,
17
+ "loss": 1.7321,
18
+ "mean_token_accuracy": 0.9224806427955627,
19
+ "num_tokens": 27164.0,
20
+ "step": 1
21
+ },
22
+ {
23
+ "entropy": 0.31456032395362854,
24
+ "epoch": 0.003779289493575208,
25
+ "grad_norm": 322.0,
26
+ "learning_rate": 3.007518796992481e-07,
27
+ "loss": 1.7027,
28
+ "mean_token_accuracy": 0.9347696453332901,
29
+ "num_tokens": 137780.0,
30
+ "step": 5
31
+ },
32
+ {
33
+ "entropy": 0.31488672494888303,
34
+ "epoch": 0.007558578987150416,
35
+ "grad_norm": 384.0,
36
+ "learning_rate": 6.766917293233083e-07,
37
+ "loss": 1.5286,
38
+ "mean_token_accuracy": 0.9335291385650635,
39
+ "num_tokens": 274385.0,
40
+ "step": 10
41
+ },
42
+ {
43
+ "entropy": 0.3156801164150238,
44
+ "epoch": 0.011337868480725623,
45
+ "grad_norm": 330.0,
46
+ "learning_rate": 1.0526315789473685e-06,
47
+ "loss": 0.9665,
48
+ "mean_token_accuracy": 0.9378681778907776,
49
+ "num_tokens": 408513.0,
50
+ "step": 15
51
+ },
52
+ {
53
+ "entropy": 0.3204445898532867,
54
+ "epoch": 0.015117157974300832,
55
+ "grad_norm": 55.25,
56
+ "learning_rate": 1.4285714285714286e-06,
57
+ "loss": 0.243,
58
+ "mean_token_accuracy": 0.9738323450088501,
59
+ "num_tokens": 546284.0,
60
+ "step": 20
61
+ },
62
+ {
63
+ "entropy": 0.32781667709350587,
64
+ "epoch": 0.01889644746787604,
65
+ "grad_norm": 21.5,
66
+ "learning_rate": 1.8045112781954887e-06,
67
+ "loss": 0.2025,
68
+ "mean_token_accuracy": 0.9826048254966736,
69
+ "num_tokens": 681832.0,
70
+ "step": 25
71
+ },
72
+ {
73
+ "entropy": 0.33683774471282957,
74
+ "epoch": 0.022675736961451247,
75
+ "grad_norm": 15.0625,
76
+ "learning_rate": 2.180451127819549e-06,
77
+ "loss": 0.0655,
78
+ "mean_token_accuracy": 0.99116530418396,
79
+ "num_tokens": 818921.0,
80
+ "step": 30
81
+ },
82
+ {
83
+ "entropy": 0.3569560945034027,
84
+ "epoch": 0.026455026455026454,
85
+ "grad_norm": 71.0,
86
+ "learning_rate": 2.556390977443609e-06,
87
+ "loss": 0.1554,
88
+ "mean_token_accuracy": 0.9790175199508667,
89
+ "num_tokens": 956260.0,
90
+ "step": 35
91
+ },
92
+ {
93
+ "entropy": 0.38554110527038576,
94
+ "epoch": 0.030234315948601664,
95
+ "grad_norm": 24.375,
96
+ "learning_rate": 2.9323308270676694e-06,
97
+ "loss": 0.0749,
98
+ "mean_token_accuracy": 0.985888135433197,
99
+ "num_tokens": 1093085.0,
100
+ "step": 40
101
+ },
102
+ {
103
+ "entropy": 0.42414658665657046,
104
+ "epoch": 0.034013605442176874,
105
+ "grad_norm": 30.375,
106
+ "learning_rate": 3.3082706766917295e-06,
107
+ "loss": 0.0599,
108
+ "mean_token_accuracy": 0.9864688992500306,
109
+ "num_tokens": 1230268.0,
110
+ "step": 45
111
+ },
112
+ {
113
+ "entropy": 0.4906178951263428,
114
+ "epoch": 0.03779289493575208,
115
+ "grad_norm": 15.25,
116
+ "learning_rate": 3.6842105263157896e-06,
117
+ "loss": 0.0667,
118
+ "mean_token_accuracy": 0.9870380163192749,
119
+ "num_tokens": 1365940.0,
120
+ "step": 50
121
+ },
122
+ {
123
+ "entropy": 0.5692261815071106,
124
+ "epoch": 0.04157218442932729,
125
+ "grad_norm": 9.3125,
126
+ "learning_rate": 4.06015037593985e-06,
127
+ "loss": 0.0422,
128
+ "mean_token_accuracy": 0.9911088466644287,
129
+ "num_tokens": 1503442.0,
130
+ "step": 55
131
+ },
132
+ {
133
+ "entropy": 0.6098211169242859,
134
+ "epoch": 0.045351473922902494,
135
+ "grad_norm": 4.5,
136
+ "learning_rate": 4.43609022556391e-06,
137
+ "loss": 0.0374,
138
+ "mean_token_accuracy": 0.9921163320541382,
139
+ "num_tokens": 1642345.0,
140
+ "step": 60
141
+ },
142
+ {
143
+ "entropy": 0.6400599360466004,
144
+ "epoch": 0.0491307634164777,
145
+ "grad_norm": 10.6875,
146
+ "learning_rate": 4.81203007518797e-06,
147
+ "loss": 0.0392,
148
+ "mean_token_accuracy": 0.9900148630142211,
149
+ "num_tokens": 1778117.0,
150
+ "step": 65
151
+ },
152
+ {
153
+ "entropy": 0.6656645774841309,
154
+ "epoch": 0.05291005291005291,
155
+ "grad_norm": 8.25,
156
+ "learning_rate": 5.187969924812031e-06,
157
+ "loss": 0.0335,
158
+ "mean_token_accuracy": 0.9904999375343323,
159
+ "num_tokens": 1913704.0,
160
+ "step": 70
161
+ },
162
+ {
163
+ "entropy": 0.6820027470588684,
164
+ "epoch": 0.05668934240362812,
165
+ "grad_norm": 11.625,
166
+ "learning_rate": 5.56390977443609e-06,
167
+ "loss": 0.0386,
168
+ "mean_token_accuracy": 0.9905120015144349,
169
+ "num_tokens": 2051532.0,
170
+ "step": 75
171
+ },
172
+ {
173
+ "entropy": 0.6896930813789368,
174
+ "epoch": 0.06046863189720333,
175
+ "grad_norm": 10.375,
176
+ "learning_rate": 5.939849624060151e-06,
177
+ "loss": 0.0407,
178
+ "mean_token_accuracy": 0.987898075580597,
179
+ "num_tokens": 2186524.0,
180
+ "step": 80
181
+ },
182
+ {
183
+ "entropy": 0.646740996837616,
184
+ "epoch": 0.06424792139077853,
185
+ "grad_norm": 6.84375,
186
+ "learning_rate": 6.31578947368421e-06,
187
+ "loss": 0.0223,
188
+ "mean_token_accuracy": 0.991657304763794,
189
+ "num_tokens": 2322781.0,
190
+ "step": 85
191
+ },
192
+ {
193
+ "entropy": 0.5873156905174255,
194
+ "epoch": 0.06802721088435375,
195
+ "grad_norm": 11.875,
196
+ "learning_rate": 6.691729323308271e-06,
197
+ "loss": 0.0287,
198
+ "mean_token_accuracy": 0.9932127594947815,
199
+ "num_tokens": 2459839.0,
200
+ "step": 90
201
+ },
202
+ {
203
+ "entropy": 0.5771235466003418,
204
+ "epoch": 0.07180650037792895,
205
+ "grad_norm": 6.78125,
206
+ "learning_rate": 7.067669172932331e-06,
207
+ "loss": 0.0504,
208
+ "mean_token_accuracy": 0.987032687664032,
209
+ "num_tokens": 2597176.0,
210
+ "step": 95
211
+ },
212
+ {
213
+ "entropy": 0.6098615527153015,
214
+ "epoch": 0.07558578987150416,
215
+ "grad_norm": 10.0,
216
+ "learning_rate": 7.4436090225563915e-06,
217
+ "loss": 0.0328,
218
+ "mean_token_accuracy": 0.9890416979789733,
219
+ "num_tokens": 2734219.0,
220
+ "step": 100
221
+ },
222
+ {
223
+ "entropy": 0.6399690747261048,
224
+ "epoch": 0.07936507936507936,
225
+ "grad_norm": 10.75,
226
+ "learning_rate": 7.81954887218045e-06,
227
+ "loss": 0.0357,
228
+ "mean_token_accuracy": 0.9906492829322815,
229
+ "num_tokens": 2869035.0,
230
+ "step": 105
231
+ },
232
+ {
233
+ "entropy": 0.6518049478530884,
234
+ "epoch": 0.08314436885865457,
235
+ "grad_norm": 8.625,
236
+ "learning_rate": 8.195488721804512e-06,
237
+ "loss": 0.0107,
238
+ "mean_token_accuracy": 0.9964231610298157,
239
+ "num_tokens": 3005281.0,
240
+ "step": 110
241
+ },
242
+ {
243
+ "entropy": 0.6230420708656311,
244
+ "epoch": 0.08692365835222977,
245
+ "grad_norm": 9.875,
246
+ "learning_rate": 8.571428571428571e-06,
247
+ "loss": 0.0228,
248
+ "mean_token_accuracy": 0.9922025680541993,
249
+ "num_tokens": 3141872.0,
250
+ "step": 115
251
+ },
252
+ {
253
+ "entropy": 0.6244900345802307,
254
+ "epoch": 0.09070294784580499,
255
+ "grad_norm": 0.62109375,
256
+ "learning_rate": 8.947368421052632e-06,
257
+ "loss": 0.0156,
258
+ "mean_token_accuracy": 0.9958398461341857,
259
+ "num_tokens": 3279542.0,
260
+ "step": 120
261
+ },
262
+ {
263
+ "entropy": 0.6464813113212585,
264
+ "epoch": 0.0944822373393802,
265
+ "grad_norm": 3.359375,
266
+ "learning_rate": 9.323308270676693e-06,
267
+ "loss": 0.0251,
268
+ "mean_token_accuracy": 0.9926198482513428,
269
+ "num_tokens": 3415299.0,
270
+ "step": 125
271
+ },
272
+ {
273
+ "entropy": 0.6796218752861023,
274
+ "epoch": 0.0982615268329554,
275
+ "grad_norm": 11.6875,
276
+ "learning_rate": 9.699248120300752e-06,
277
+ "loss": 0.038,
278
+ "mean_token_accuracy": 0.9911927700042724,
279
+ "num_tokens": 3551106.0,
280
+ "step": 130
281
+ },
282
+ {
283
+ "entropy": 0.6995040655136109,
284
+ "epoch": 0.10204081632653061,
285
+ "grad_norm": 15.4375,
286
+ "learning_rate": 9.99998257609161e-06,
287
+ "loss": 0.0277,
288
+ "mean_token_accuracy": 0.994271981716156,
289
+ "num_tokens": 3688660.0,
290
+ "step": 135
291
+ },
292
+ {
293
+ "entropy": 0.7067576289176941,
294
+ "epoch": 0.10582010582010581,
295
+ "grad_norm": 2.484375,
296
+ "learning_rate": 9.999372752048729e-06,
297
+ "loss": 0.0115,
298
+ "mean_token_accuracy": 0.9963347554206848,
299
+ "num_tokens": 3825289.0,
300
+ "step": 140
301
+ },
302
+ {
303
+ "entropy": 0.6457632541656494,
304
+ "epoch": 0.10959939531368103,
305
+ "grad_norm": 10.8125,
306
+ "learning_rate": 9.997891854019538e-06,
307
+ "loss": 0.0214,
308
+ "mean_token_accuracy": 0.9937294125556946,
309
+ "num_tokens": 3962011.0,
310
+ "step": 145
311
+ },
312
+ {
313
+ "entropy": 0.5923184990882874,
314
+ "epoch": 0.11337868480725624,
315
+ "grad_norm": 5.9375,
316
+ "learning_rate": 9.995540140030759e-06,
317
+ "loss": 0.0438,
318
+ "mean_token_accuracy": 0.9906680107116699,
319
+ "num_tokens": 4097931.0,
320
+ "step": 150
321
+ },
322
+ {
323
+ "entropy": 0.6753766417503357,
324
+ "epoch": 0.11715797430083144,
325
+ "grad_norm": 4.75,
326
+ "learning_rate": 9.992318019837171e-06,
327
+ "loss": 0.0341,
328
+ "mean_token_accuracy": 0.993306303024292,
329
+ "num_tokens": 4235447.0,
330
+ "step": 155
331
+ },
332
+ {
333
+ "entropy": 0.7623552441596985,
334
+ "epoch": 0.12093726379440665,
335
+ "grad_norm": 6.21875,
336
+ "learning_rate": 9.988226054850218e-06,
337
+ "loss": 0.0207,
338
+ "mean_token_accuracy": 0.9943189144134521,
339
+ "num_tokens": 4370167.0,
340
+ "step": 160
341
+ },
342
+ {
343
+ "entropy": 0.7839436531066895,
344
+ "epoch": 0.12471655328798185,
345
+ "grad_norm": 5.71875,
346
+ "learning_rate": 9.983264958040194e-06,
347
+ "loss": 0.0175,
348
+ "mean_token_accuracy": 0.9942814111709595,
349
+ "num_tokens": 4506649.0,
350
+ "step": 165
351
+ },
352
+ {
353
+ "entropy": 0.806551742553711,
354
+ "epoch": 0.12849584278155707,
355
+ "grad_norm": 6.1875,
356
+ "learning_rate": 9.977435593812013e-06,
357
+ "loss": 0.0295,
358
+ "mean_token_accuracy": 0.9921065330505371,
359
+ "num_tokens": 4643769.0,
360
+ "step": 170
361
+ },
362
+ {
363
+ "entropy": 0.7947233080863952,
364
+ "epoch": 0.13227513227513227,
365
+ "grad_norm": 10.1875,
366
+ "learning_rate": 9.970738977854597e-06,
367
+ "loss": 0.0202,
368
+ "mean_token_accuracy": 0.9931598544120789,
369
+ "num_tokens": 4781667.0,
370
+ "step": 175
371
+ },
372
+ {
373
+ "entropy": 0.778699517250061,
374
+ "epoch": 0.1360544217687075,
375
+ "grad_norm": 9.4375,
376
+ "learning_rate": 9.963176276963916e-06,
377
+ "loss": 0.0211,
378
+ "mean_token_accuracy": 0.99481680393219,
379
+ "num_tokens": 4919296.0,
380
+ "step": 180
381
+ },
382
+ {
383
+ "entropy": 0.7494423031806946,
384
+ "epoch": 0.1398337112622827,
385
+ "grad_norm": 0.447265625,
386
+ "learning_rate": 9.954748808839675e-06,
387
+ "loss": 0.0135,
388
+ "mean_token_accuracy": 0.9973663449287414,
389
+ "num_tokens": 5054459.0,
390
+ "step": 185
391
+ },
392
+ {
393
+ "entropy": 0.7103082180023194,
394
+ "epoch": 0.1436130007558579,
395
+ "grad_norm": 0.50390625,
396
+ "learning_rate": 9.945458041855732e-06,
397
+ "loss": 0.0242,
398
+ "mean_token_accuracy": 0.9963336825370789,
399
+ "num_tokens": 5189912.0,
400
+ "step": 190
401
+ },
402
+ {
403
+ "entropy": 0.6840783715248108,
404
+ "epoch": 0.1473922902494331,
405
+ "grad_norm": 3.328125,
406
+ "learning_rate": 9.935305594804247e-06,
407
+ "loss": 0.0136,
408
+ "mean_token_accuracy": 0.996890377998352,
409
+ "num_tokens": 5326831.0,
410
+ "step": 195
411
+ },
412
+ {
413
+ "entropy": 0.7042184591293335,
414
+ "epoch": 0.15117157974300832,
415
+ "grad_norm": 0.875,
416
+ "learning_rate": 9.924293236613643e-06,
417
+ "loss": 0.0187,
418
+ "mean_token_accuracy": 0.9957641124725342,
419
+ "num_tokens": 5463018.0,
420
+ "step": 200
421
+ },
422
+ {
423
+ "entropy": 0.7395557761192322,
424
+ "epoch": 0.15495086923658352,
425
+ "grad_norm": 2.453125,
426
+ "learning_rate": 9.91242288604037e-06,
427
+ "loss": 0.0175,
428
+ "mean_token_accuracy": 0.9968855857849122,
429
+ "num_tokens": 5600264.0,
430
+ "step": 205
431
+ },
432
+ {
433
+ "entropy": 0.7494800806045532,
434
+ "epoch": 0.15873015873015872,
435
+ "grad_norm": 1.0390625,
436
+ "learning_rate": 9.899696611334612e-06,
437
+ "loss": 0.0301,
438
+ "mean_token_accuracy": 0.994823443889618,
439
+ "num_tokens": 5736124.0,
440
+ "step": 210
441
+ },
442
+ {
443
+ "entropy": 0.6967356204986572,
444
+ "epoch": 0.16250944822373395,
445
+ "grad_norm": 3.3125,
446
+ "learning_rate": 9.886116629879906e-06,
447
+ "loss": 0.0097,
448
+ "mean_token_accuracy": 0.9979151964187623,
449
+ "num_tokens": 5870255.0,
450
+ "step": 215
451
+ },
452
+ {
453
+ "entropy": 0.6044986009597778,
454
+ "epoch": 0.16628873771730915,
455
+ "grad_norm": 5.53125,
456
+ "learning_rate": 9.871685307806796e-06,
457
+ "loss": 0.0122,
458
+ "mean_token_accuracy": 0.9984508991241455,
459
+ "num_tokens": 6005956.0,
460
+ "step": 220
461
+ },
462
+ {
463
+ "entropy": 0.5804512858390808,
464
+ "epoch": 0.17006802721088435,
465
+ "grad_norm": 7.34375,
466
+ "learning_rate": 9.85640515958057e-06,
467
+ "loss": 0.0215,
468
+ "mean_token_accuracy": 0.995808458328247,
469
+ "num_tokens": 6143559.0,
470
+ "step": 225
471
+ },
472
+ {
473
+ "entropy": 0.5928897976875305,
474
+ "epoch": 0.17384731670445955,
475
+ "grad_norm": 9.6875,
476
+ "learning_rate": 9.840278847563147e-06,
477
+ "loss": 0.0279,
478
+ "mean_token_accuracy": 0.9937070727348327,
479
+ "num_tokens": 6279410.0,
480
+ "step": 230
481
+ },
482
+ {
483
+ "entropy": 0.6215349555015564,
484
+ "epoch": 0.17762660619803478,
485
+ "grad_norm": 3.5,
486
+ "learning_rate": 9.823309181549194e-06,
487
+ "loss": 0.0253,
488
+ "mean_token_accuracy": 0.9947557568550109,
489
+ "num_tokens": 6415542.0,
490
+ "step": 235
491
+ },
492
+ {
493
+ "entropy": 0.6546105861663818,
494
+ "epoch": 0.18140589569160998,
495
+ "grad_norm": 1.3125,
496
+ "learning_rate": 9.805499118276555e-06,
497
+ "loss": 0.02,
498
+ "mean_token_accuracy": 0.9947941064834595,
499
+ "num_tokens": 6550334.0,
500
+ "step": 240
501
+ },
502
+ {
503
+ "entropy": 0.6813999891281128,
504
+ "epoch": 0.18518518518518517,
505
+ "grad_norm": 4.5625,
506
+ "learning_rate": 9.786851760911084e-06,
507
+ "loss": 0.0123,
508
+ "mean_token_accuracy": 0.996846055984497,
509
+ "num_tokens": 6686963.0,
510
+ "step": 245
511
+ },
512
+ {
513
+ "entropy": 0.6820224285125732,
514
+ "epoch": 0.1889644746787604,
515
+ "grad_norm": 4.3125,
516
+ "learning_rate": 9.767370358505958e-06,
517
+ "loss": 0.0074,
518
+ "mean_token_accuracy": 0.9979071021080017,
519
+ "num_tokens": 6823233.0,
520
+ "step": 250
521
+ },
522
+ {
523
+ "entropy": 0.6676241278648376,
524
+ "epoch": 0.1927437641723356,
525
+ "grad_norm": 1.109375,
526
+ "learning_rate": 9.747058305435566e-06,
527
+ "loss": 0.0061,
528
+ "mean_token_accuracy": 0.9984600782394409,
529
+ "num_tokens": 6959250.0,
530
+ "step": 255
531
+ },
532
+ {
533
+ "entropy": 0.6509259819984436,
534
+ "epoch": 0.1965230536659108,
535
+ "grad_norm": 12.0,
536
+ "learning_rate": 9.7259191408041e-06,
537
+ "loss": 0.0169,
538
+ "mean_token_accuracy": 0.9963755011558533,
539
+ "num_tokens": 7098179.0,
540
+ "step": 260
541
+ },
542
+ {
543
+ "entropy": 0.6398890137672424,
544
+ "epoch": 0.20030234315948603,
545
+ "grad_norm": 0.11181640625,
546
+ "learning_rate": 9.703956547828893e-06,
547
+ "loss": 0.0201,
548
+ "mean_token_accuracy": 0.9968681931495667,
549
+ "num_tokens": 7234547.0,
550
+ "step": 265
551
+ },
552
+ {
553
+ "entropy": 0.6418223738670349,
554
+ "epoch": 0.20408163265306123,
555
+ "grad_norm": 0.294921875,
556
+ "learning_rate": 9.681174353198687e-06,
557
+ "loss": 0.0042,
558
+ "mean_token_accuracy": 0.9989552617073059,
559
+ "num_tokens": 7370194.0,
560
+ "step": 270
561
+ },
562
+ {
563
+ "entropy": 0.6441201329231262,
564
+ "epoch": 0.20786092214663643,
565
+ "grad_norm": 11.5,
566
+ "learning_rate": 9.657576526406872e-06,
567
+ "loss": 0.0184,
568
+ "mean_token_accuracy": 0.9953326106071472,
569
+ "num_tokens": 7507836.0,
570
+ "step": 275
571
+ },
572
+ {
573
+ "entropy": 0.6464055299758911,
574
+ "epoch": 0.21164021164021163,
575
+ "grad_norm": 3.796875,
576
+ "learning_rate": 9.633167179059859e-06,
577
+ "loss": 0.0127,
578
+ "mean_token_accuracy": 0.9968541741371155,
579
+ "num_tokens": 7644279.0,
580
+ "step": 280
581
+ },
582
+ {
583
+ "entropy": 0.6436957597732544,
584
+ "epoch": 0.21541950113378686,
585
+ "grad_norm": 3.75,
586
+ "learning_rate": 9.607950564160682e-06,
587
+ "loss": 0.0113,
588
+ "mean_token_accuracy": 0.9973994374275208,
589
+ "num_tokens": 7780875.0,
590
+ "step": 285
591
+ },
592
+ {
593
+ "entropy": 0.6544555306434632,
594
+ "epoch": 0.21919879062736206,
595
+ "grad_norm": 4.78125,
596
+ "learning_rate": 9.581931075367979e-06,
597
+ "loss": 0.0199,
598
+ "mean_token_accuracy": 0.9953313350677491,
599
+ "num_tokens": 7917606.0,
600
+ "step": 290
601
+ },
602
+ {
603
+ "entropy": 0.6784600496292115,
604
+ "epoch": 0.22297808012093726,
605
+ "grad_norm": 1.8515625,
606
+ "learning_rate": 9.555113246230443e-06,
607
+ "loss": 0.0097,
608
+ "mean_token_accuracy": 0.9973721981048584,
609
+ "num_tokens": 8054140.0,
610
+ "step": 295
611
+ },
612
+ {
613
+ "entropy": 0.7043081760406494,
614
+ "epoch": 0.22675736961451248,
615
+ "grad_norm": 1.421875,
616
+ "learning_rate": 9.527501749396924e-06,
617
+ "loss": 0.0164,
618
+ "mean_token_accuracy": 0.9963916540145874,
619
+ "num_tokens": 8189168.0,
620
+ "step": 300
621
+ },
622
+ {
623
+ "entropy": 0.6775729060173035,
624
+ "epoch": 0.23053665910808768,
625
+ "grad_norm": 0.77734375,
626
+ "learning_rate": 9.499101395802277e-06,
627
+ "loss": 0.0017,
628
+ "mean_token_accuracy": 0.9994764447212219,
629
+ "num_tokens": 8325521.0,
630
+ "step": 305
631
+ },
632
+ {
633
+ "entropy": 0.620274543762207,
634
+ "epoch": 0.23431594860166288,
635
+ "grad_norm": 6.59375,
636
+ "learning_rate": 9.469917133829114e-06,
637
+ "loss": 0.0039,
638
+ "mean_token_accuracy": 0.9984441757202148,
639
+ "num_tokens": 8460969.0,
640
+ "step": 310
641
+ },
642
+ {
643
+ "entropy": 0.5944037318229676,
644
+ "epoch": 0.23809523809523808,
645
+ "grad_norm": 2.734375,
646
+ "learning_rate": 9.439954048445628e-06,
647
+ "loss": 0.0158,
648
+ "mean_token_accuracy": 0.9974146604537963,
649
+ "num_tokens": 8598326.0,
650
+ "step": 315
651
+ },
652
+ {
653
+ "entropy": 0.5941187143325806,
654
+ "epoch": 0.2418745275888133,
655
+ "grad_norm": 5.0625,
656
+ "learning_rate": 9.409217360319594e-06,
657
+ "loss": 0.0145,
658
+ "mean_token_accuracy": 0.9974010944366455,
659
+ "num_tokens": 8734691.0,
660
+ "step": 320
661
+ },
662
+ {
663
+ "entropy": 0.6153214812278748,
664
+ "epoch": 0.2456538170823885,
665
+ "grad_norm": 3.796875,
666
+ "learning_rate": 9.377712424908743e-06,
667
+ "loss": 0.0205,
668
+ "mean_token_accuracy": 0.9963401436805726,
669
+ "num_tokens": 8871429.0,
670
+ "step": 325
671
+ },
672
+ {
673
+ "entropy": 0.6333611607551575,
674
+ "epoch": 0.2494331065759637,
675
+ "grad_norm": 2.59375,
676
+ "learning_rate": 9.345444731527642e-06,
677
+ "loss": 0.0192,
678
+ "mean_token_accuracy": 0.9963601469993592,
679
+ "num_tokens": 9006202.0,
680
+ "step": 330
681
+ },
682
+ {
683
+ "entropy": 0.6577699303627014,
684
+ "epoch": 0.25321239606953894,
685
+ "grad_norm": 6.625,
686
+ "learning_rate": 9.312419902391256e-06,
687
+ "loss": 0.0231,
688
+ "mean_token_accuracy": 0.9957833290100098,
689
+ "num_tokens": 9140608.0,
690
+ "step": 335
691
+ },
692
+ {
693
+ "entropy": 0.6712118268013001,
694
+ "epoch": 0.25699168556311414,
695
+ "grad_norm": 5.59375,
696
+ "learning_rate": 9.278643691635352e-06,
697
+ "loss": 0.0213,
698
+ "mean_token_accuracy": 0.9947887420654297,
699
+ "num_tokens": 9276772.0,
700
+ "step": 340
701
+ },
702
+ {
703
+ "entropy": 0.6831606507301331,
704
+ "epoch": 0.26077097505668934,
705
+ "grad_norm": 0.119140625,
706
+ "learning_rate": 9.244121984313916e-06,
707
+ "loss": 0.0082,
708
+ "mean_token_accuracy": 0.9984793663024902,
709
+ "num_tokens": 9414540.0,
710
+ "step": 345
711
+ },
712
+ {
713
+ "entropy": 0.6921569228172302,
714
+ "epoch": 0.26455026455026454,
715
+ "grad_norm": 0.23046875,
716
+ "learning_rate": 9.208860795373765e-06,
717
+ "loss": 0.0119,
718
+ "mean_token_accuracy": 0.9963441491127014,
719
+ "num_tokens": 9550306.0,
720
+ "step": 350
721
+ },
722
+ {
723
+ "entropy": 0.6849629878997803,
724
+ "epoch": 0.26832955404383974,
725
+ "grad_norm": 2.34375,
726
+ "learning_rate": 9.172866268606514e-06,
727
+ "loss": 0.0103,
728
+ "mean_token_accuracy": 0.9968430995941162,
729
+ "num_tokens": 9686568.0,
730
+ "step": 355
731
+ },
732
+ {
733
+ "entropy": 0.6642722845077514,
734
+ "epoch": 0.272108843537415,
735
+ "grad_norm": 7.34375,
736
+ "learning_rate": 9.136144675578114e-06,
737
+ "loss": 0.0193,
738
+ "mean_token_accuracy": 0.996327793598175,
739
+ "num_tokens": 9822879.0,
740
+ "step": 360
741
+ },
742
+ {
743
+ "entropy": 0.6514992356300354,
744
+ "epoch": 0.2758881330309902,
745
+ "grad_norm": 5.78125,
746
+ "learning_rate": 9.098702414536107e-06,
747
+ "loss": 0.0089,
748
+ "mean_token_accuracy": 0.9973956227302552,
749
+ "num_tokens": 9959308.0,
750
+ "step": 365
751
+ },
752
+ {
753
+ "entropy": 0.6542440056800842,
754
+ "epoch": 0.2796674225245654,
755
+ "grad_norm": 1.9375,
756
+ "learning_rate": 9.060546009294818e-06,
757
+ "loss": 0.0127,
758
+ "mean_token_accuracy": 0.9979299783706665,
759
+ "num_tokens": 10096575.0,
760
+ "step": 370
761
+ },
762
+ {
763
+ "entropy": 0.6637985825538635,
764
+ "epoch": 0.2834467120181406,
765
+ "grad_norm": 6.90625,
766
+ "learning_rate": 9.021682108098671e-06,
767
+ "loss": 0.0106,
768
+ "mean_token_accuracy": 0.9973872303962708,
769
+ "num_tokens": 10232621.0,
770
+ "step": 375
771
+ },
772
+ {
773
+ "entropy": 0.6581239342689514,
774
+ "epoch": 0.2872260015117158,
775
+ "grad_norm": 2.578125,
776
+ "learning_rate": 8.982117482463817e-06,
777
+ "loss": 0.0216,
778
+ "mean_token_accuracy": 0.9948861002922058,
779
+ "num_tokens": 10368276.0,
780
+ "step": 380
781
+ },
782
+ {
783
+ "entropy": 0.6503197908401489,
784
+ "epoch": 0.291005291005291,
785
+ "grad_norm": 4.0625,
786
+ "learning_rate": 8.94185902599829e-06,
787
+ "loss": 0.0108,
788
+ "mean_token_accuracy": 0.9968842387199401,
789
+ "num_tokens": 10506811.0,
790
+ "step": 385
791
+ },
792
+ {
793
+ "entropy": 0.6437017798423768,
794
+ "epoch": 0.2947845804988662,
795
+ "grad_norm": 0.3125,
796
+ "learning_rate": 8.900913753200887e-06,
797
+ "loss": 0.0044,
798
+ "mean_token_accuracy": 0.9989569306373596,
799
+ "num_tokens": 10642953.0,
800
+ "step": 390
801
+ },
802
+ {
803
+ "entropy": 0.6417622208595276,
804
+ "epoch": 0.29856386999244144,
805
+ "grad_norm": 0.014892578125,
806
+ "learning_rate": 8.859288798238988e-06,
807
+ "loss": 0.0098,
808
+ "mean_token_accuracy": 0.9979151129722595,
809
+ "num_tokens": 10780398.0,
810
+ "step": 395
811
+ },
812
+ {
813
+ "entropy": 0.6507818102836609,
814
+ "epoch": 0.30234315948601664,
815
+ "grad_norm": 2.921875,
816
+ "learning_rate": 8.816991413705515e-06,
817
+ "loss": 0.0225,
818
+ "mean_token_accuracy": 0.9968856692314148,
819
+ "num_tokens": 10916779.0,
820
+ "step": 400
821
+ },
822
+ {
823
+ "entropy": 0.6669408321380615,
824
+ "epoch": 0.30612244897959184,
825
+ "grad_norm": 4.8125,
826
+ "learning_rate": 8.774028969355273e-06,
827
+ "loss": 0.0171,
828
+ "mean_token_accuracy": 0.997943377494812,
829
+ "num_tokens": 11054380.0,
830
+ "step": 405
831
+ },
832
+ {
833
+ "entropy": 0.6745324969291687,
834
+ "epoch": 0.30990173847316704,
835
+ "grad_norm": 4.5,
836
+ "learning_rate": 8.730408950820864e-06,
837
+ "loss": 0.0142,
838
+ "mean_token_accuracy": 0.9974199891090393,
839
+ "num_tokens": 11191785.0,
840
+ "step": 410
841
+ },
842
+ {
843
+ "entropy": 0.6699419260025025,
844
+ "epoch": 0.31368102796674224,
845
+ "grad_norm": 8.3125,
846
+ "learning_rate": 8.686138958308415e-06,
847
+ "loss": 0.0095,
848
+ "mean_token_accuracy": 0.9984320163726806,
849
+ "num_tokens": 11329447.0,
850
+ "step": 415
851
+ },
852
+ {
853
+ "entropy": 0.6696412801742554,
854
+ "epoch": 0.31746031746031744,
855
+ "grad_norm": 3.59375,
856
+ "learning_rate": 8.641226705273344e-06,
857
+ "loss": 0.0114,
858
+ "mean_token_accuracy": 0.9963794231414795,
859
+ "num_tokens": 11464507.0,
860
+ "step": 420
861
+ },
862
+ {
863
+ "entropy": 0.6751126527786255,
864
+ "epoch": 0.32123960695389264,
865
+ "grad_norm": 2.671875,
866
+ "learning_rate": 8.595680017076403e-06,
867
+ "loss": 0.0126,
868
+ "mean_token_accuracy": 0.9973998427391052,
869
+ "num_tokens": 11599832.0,
870
+ "step": 425
871
+ },
872
+ {
873
+ "entropy": 0.6786884784698486,
874
+ "epoch": 0.3250188964474679,
875
+ "grad_norm": 0.34765625,
876
+ "learning_rate": 8.549506829620193e-06,
877
+ "loss": 0.001,
878
+ "mean_token_accuracy": 1.0,
879
+ "num_tokens": 11734763.0,
880
+ "step": 430
881
+ },
882
+ {
883
+ "entropy": 0.6726262927055359,
884
+ "epoch": 0.3287981859410431,
885
+ "grad_norm": 1.6015625,
886
+ "learning_rate": 8.502715187966455e-06,
887
+ "loss": 0.0082,
888
+ "mean_token_accuracy": 0.9984387040138245,
889
+ "num_tokens": 11873050.0,
890
+ "step": 435
891
+ },
892
+ {
893
+ "entropy": 0.6710472464561462,
894
+ "epoch": 0.3325774754346183,
895
+ "grad_norm": 2.203125,
896
+ "learning_rate": 8.455313244934324e-06,
897
+ "loss": 0.0118,
898
+ "mean_token_accuracy": 0.9974292159080506,
899
+ "num_tokens": 12008849.0,
900
+ "step": 440
901
+ },
902
+ {
903
+ "entropy": 0.6699627637863159,
904
+ "epoch": 0.3363567649281935,
905
+ "grad_norm": 4.625,
906
+ "learning_rate": 8.407309259679801e-06,
907
+ "loss": 0.0032,
908
+ "mean_token_accuracy": 0.9984238147735596,
909
+ "num_tokens": 12144651.0,
910
+ "step": 445
911
+ },
912
+ {
913
+ "entropy": 0.6549316048622131,
914
+ "epoch": 0.3401360544217687,
915
+ "grad_norm": 2.78125,
916
+ "learning_rate": 8.358711596256712e-06,
917
+ "loss": 0.0162,
918
+ "mean_token_accuracy": 0.9953205347061157,
919
+ "num_tokens": 12281801.0,
920
+ "step": 450
921
+ },
922
+ {
923
+ "entropy": 0.6543108463287354,
924
+ "epoch": 0.3439153439153439,
925
+ "grad_norm": 0.078125,
926
+ "learning_rate": 8.309528722159383e-06,
927
+ "loss": 0.0068,
928
+ "mean_token_accuracy": 0.9994680881500244,
929
+ "num_tokens": 12418753.0,
930
+ "step": 455
931
+ },
932
+ {
933
+ "entropy": 0.6507919788360595,
934
+ "epoch": 0.3476946334089191,
935
+ "grad_norm": 2.984375,
936
+ "learning_rate": 8.259769206847286e-06,
937
+ "loss": 0.0074,
938
+ "mean_token_accuracy": 0.9963487148284912,
939
+ "num_tokens": 12555884.0,
940
+ "step": 460
941
+ },
942
+ {
943
+ "entropy": 0.6534615635871888,
944
+ "epoch": 0.35147392290249435,
945
+ "grad_norm": 2.703125,
946
+ "learning_rate": 8.209441720251934e-06,
947
+ "loss": 0.0078,
948
+ "mean_token_accuracy": 0.9984305024147033,
949
+ "num_tokens": 12692226.0,
950
+ "step": 465
951
+ },
952
+ {
953
+ "entropy": 0.6645334720611572,
954
+ "epoch": 0.35525321239606955,
955
+ "grad_norm": 6.21875,
956
+ "learning_rate": 8.158555031266255e-06,
957
+ "loss": 0.021,
958
+ "mean_token_accuracy": 0.9947924494743348,
959
+ "num_tokens": 12829578.0,
960
+ "step": 470
961
+ },
962
+ {
963
+ "entropy": 0.6773025989532471,
964
+ "epoch": 0.35903250188964475,
965
+ "grad_norm": 20.875,
966
+ "learning_rate": 8.107118006216732e-06,
967
+ "loss": 0.0171,
968
+ "mean_token_accuracy": 0.9963470816612243,
969
+ "num_tokens": 12967378.0,
970
+ "step": 475
971
+ },
972
+ {
973
+ "entropy": 0.6804514646530151,
974
+ "epoch": 0.36281179138321995,
975
+ "grad_norm": 2.453125,
976
+ "learning_rate": 8.055139607318558e-06,
977
+ "loss": 0.0099,
978
+ "mean_token_accuracy": 0.9979258418083191,
979
+ "num_tokens": 13106566.0,
980
+ "step": 480
981
+ },
982
+ {
983
+ "entropy": 0.673608911037445,
984
+ "epoch": 0.36659108087679515,
985
+ "grad_norm": 0.16796875,
986
+ "learning_rate": 8.002628891114104e-06,
987
+ "loss": 0.0021,
988
+ "mean_token_accuracy": 0.9994818687438964,
989
+ "num_tokens": 13241319.0,
990
+ "step": 485
991
+ },
992
+ {
993
+ "entropy": 0.6570681810379029,
994
+ "epoch": 0.37037037037037035,
995
+ "grad_norm": 2.765625,
996
+ "learning_rate": 7.949595006894917e-06,
997
+ "loss": 0.0105,
998
+ "mean_token_accuracy": 0.9979082107543945,
999
+ "num_tokens": 13378007.0,
1000
+ "step": 490
1001
+ },
1002
+ {
1003
+ "entropy": 0.6474269986152649,
1004
+ "epoch": 0.3741496598639456,
1005
+ "grad_norm": 1.71875,
1006
+ "learning_rate": 7.896047195107599e-06,
1007
+ "loss": 0.018,
1008
+ "mean_token_accuracy": 0.9958997368812561,
1009
+ "num_tokens": 13514007.0,
1010
+ "step": 495
1011
+ },
1012
+ {
1013
+ "entropy": 0.6409961104393005,
1014
+ "epoch": 0.3779289493575208,
1015
+ "grad_norm": 2.828125,
1016
+ "learning_rate": 7.841994785743765e-06,
1017
+ "loss": 0.0116,
1018
+ "mean_token_accuracy": 0.9973924398422241,
1019
+ "num_tokens": 13650732.0,
1020
+ "step": 500
1021
+ },
1022
+ {
1023
+ "entropy": 0.6309547066688538,
1024
+ "epoch": 0.381708238851096,
1025
+ "grad_norm": 0.32421875,
1026
+ "learning_rate": 7.787447196714428e-06,
1027
+ "loss": 0.0063,
1028
+ "mean_token_accuracy": 0.9984575271606445,
1029
+ "num_tokens": 13788024.0,
1030
+ "step": 505
1031
+ },
1032
+ {
1033
+ "entropy": 0.6259261012077332,
1034
+ "epoch": 0.3854875283446712,
1035
+ "grad_norm": 0.03759765625,
1036
+ "learning_rate": 7.73241393220905e-06,
1037
+ "loss": 0.0061,
1038
+ "mean_token_accuracy": 0.9979136824607849,
1039
+ "num_tokens": 13924676.0,
1040
+ "step": 510
1041
+ },
1042
+ {
1043
+ "entropy": 0.6200960636138916,
1044
+ "epoch": 0.3892668178382464,
1045
+ "grad_norm": 0.015869140625,
1046
+ "learning_rate": 7.676904581039559e-06,
1047
+ "loss": 0.0068,
1048
+ "mean_token_accuracy": 0.9984575867652893,
1049
+ "num_tokens": 14061789.0,
1050
+ "step": 515
1051
+ },
1052
+ {
1053
+ "entropy": 0.6154240965843201,
1054
+ "epoch": 0.3930461073318216,
1055
+ "grad_norm": 0.0255126953125,
1056
+ "learning_rate": 7.620928814969636e-06,
1057
+ "loss": 0.0131,
1058
+ "mean_token_accuracy": 0.9969032168388366,
1059
+ "num_tokens": 14198995.0,
1060
+ "step": 520
1061
+ },
1062
+ {
1063
+ "entropy": 0.6134608268737793,
1064
+ "epoch": 0.3968253968253968,
1065
+ "grad_norm": 0.01055908203125,
1066
+ "learning_rate": 7.564496387029532e-06,
1067
+ "loss": 0.0082,
1068
+ "mean_token_accuracy": 0.998963737487793,
1069
+ "num_tokens": 14335595.0,
1070
+ "step": 525
1071
+ },
1072
+ {
1073
+ "entropy": 0.6157876014709472,
1074
+ "epoch": 0.40060468631897206,
1075
+ "grad_norm": 0.65234375,
1076
+ "learning_rate": 7.507617129816733e-06,
1077
+ "loss": 0.012,
1078
+ "mean_token_accuracy": 0.9963508725166321,
1079
+ "num_tokens": 14471802.0,
1080
+ "step": 530
1081
+ },
1082
+ {
1083
+ "entropy": 0.6165508031845093,
1084
+ "epoch": 0.40438397581254726,
1085
+ "grad_norm": 0.01116943359375,
1086
+ "learning_rate": 7.450300953782768e-06,
1087
+ "loss": 0.0006,
1088
+ "mean_token_accuracy": 1.0,
1089
+ "num_tokens": 14607953.0,
1090
+ "step": 535
1091
+ },
1092
+ {
1093
+ "entropy": 0.6160001993179322,
1094
+ "epoch": 0.40816326530612246,
1095
+ "grad_norm": 0.041015625,
1096
+ "learning_rate": 7.392557845506433e-06,
1097
+ "loss": 0.0027,
1098
+ "mean_token_accuracy": 0.9994897961616516,
1099
+ "num_tokens": 14745750.0,
1100
+ "step": 540
1101
+ },
1102
+ {
1103
+ "entropy": 0.6151047348976135,
1104
+ "epoch": 0.41194255479969766,
1105
+ "grad_norm": 9.4375,
1106
+ "learning_rate": 7.3343978659537775e-06,
1107
+ "loss": 0.0128,
1108
+ "mean_token_accuracy": 0.9963156104087829,
1109
+ "num_tokens": 14882036.0,
1110
+ "step": 545
1111
+ },
1112
+ {
1113
+ "entropy": 0.6159613847732544,
1114
+ "epoch": 0.41572184429327286,
1115
+ "grad_norm": 0.09814453125,
1116
+ "learning_rate": 7.275831148725101e-06,
1117
+ "loss": 0.0033,
1118
+ "mean_token_accuracy": 0.9989569306373596,
1119
+ "num_tokens": 15018906.0,
1120
+ "step": 550
1121
+ },
1122
+ {
1123
+ "entropy": 0.6163604021072387,
1124
+ "epoch": 0.41950113378684806,
1125
+ "grad_norm": 0.042724609375,
1126
+ "learning_rate": 7.216867898289319e-06,
1127
+ "loss": 0.0014,
1128
+ "mean_token_accuracy": 0.998973035812378,
1129
+ "num_tokens": 15156568.0,
1130
+ "step": 555
1131
+ },
1132
+ {
1133
+ "entropy": 0.6148820281028747,
1134
+ "epoch": 0.42328042328042326,
1135
+ "grad_norm": 5.65625,
1136
+ "learning_rate": 7.157518388205969e-06,
1137
+ "loss": 0.0116,
1138
+ "mean_token_accuracy": 0.9968932032585144,
1139
+ "num_tokens": 15291333.0,
1140
+ "step": 560
1141
+ },
1142
+ {
1143
+ "entropy": 0.6163430094718934,
1144
+ "epoch": 0.4270597127739985,
1145
+ "grad_norm": 0.052978515625,
1146
+ "learning_rate": 7.097792959335169e-06,
1147
+ "loss": 0.0082,
1148
+ "mean_token_accuracy": 0.9979112029075623,
1149
+ "num_tokens": 15430789.0,
1150
+ "step": 565
1151
+ },
1152
+ {
1153
+ "entropy": 0.6200400829315186,
1154
+ "epoch": 0.4308390022675737,
1155
+ "grad_norm": 0.349609375,
1156
+ "learning_rate": 7.037702018035878e-06,
1157
+ "loss": 0.0027,
1158
+ "mean_token_accuracy": 0.9989610314369202,
1159
+ "num_tokens": 15565867.0,
1160
+ "step": 570
1161
+ },
1162
+ {
1163
+ "entropy": 0.620907187461853,
1164
+ "epoch": 0.4346182917611489,
1165
+ "grad_norm": 1.7421875,
1166
+ "learning_rate": 6.977256034352713e-06,
1167
+ "loss": 0.0038,
1168
+ "mean_token_accuracy": 0.9994736790657044,
1169
+ "num_tokens": 15700858.0,
1170
+ "step": 575
1171
+ },
1172
+ {
1173
+ "entropy": 0.6191686034202576,
1174
+ "epoch": 0.4383975812547241,
1175
+ "grad_norm": 0.1298828125,
1176
+ "learning_rate": 6.916465540191692e-06,
1177
+ "loss": 0.0069,
1178
+ "mean_token_accuracy": 0.9984344959259033,
1179
+ "num_tokens": 15835800.0,
1180
+ "step": 580
1181
+ },
1182
+ {
1183
+ "entropy": 0.6172895193099975,
1184
+ "epoch": 0.4421768707482993,
1185
+ "grad_norm": 0.019287109375,
1186
+ "learning_rate": 6.855341127485183e-06,
1187
+ "loss": 0.0115,
1188
+ "mean_token_accuracy": 0.9989583373069764,
1189
+ "num_tokens": 15971026.0,
1190
+ "step": 585
1191
+ },
1192
+ {
1193
+ "entropy": 0.6186184287071228,
1194
+ "epoch": 0.4459561602418745,
1195
+ "grad_norm": 4.09375,
1196
+ "learning_rate": 6.793893446346405e-06,
1197
+ "loss": 0.0122,
1198
+ "mean_token_accuracy": 0.9963900685310364,
1199
+ "num_tokens": 16107078.0,
1200
+ "step": 590
1201
+ },
1202
+ {
1203
+ "entropy": 0.623417603969574,
1204
+ "epoch": 0.4497354497354497,
1205
+ "grad_norm": 0.0634765625,
1206
+ "learning_rate": 6.7321332032137875e-06,
1207
+ "loss": 0.0008,
1208
+ "mean_token_accuracy": 0.9994750618934631,
1209
+ "num_tokens": 16242831.0,
1210
+ "step": 595
1211
+ },
1212
+ {
1213
+ "entropy": 0.6259370088577271,
1214
+ "epoch": 0.45351473922902497,
1215
+ "grad_norm": 0.134765625,
1216
+ "learning_rate": 6.670071158985521e-06,
1217
+ "loss": 0.0023,
1218
+ "mean_token_accuracy": 0.998976981639862,
1219
+ "num_tokens": 16382471.0,
1220
+ "step": 600
1221
+ },
1222
+ {
1223
+ "entropy": 0.6233020544052124,
1224
+ "epoch": 0.45729402872260017,
1225
+ "grad_norm": 1.46875,
1226
+ "learning_rate": 6.607718127144601e-06,
1227
+ "loss": 0.0039,
1228
+ "mean_token_accuracy": 0.9984204411506653,
1229
+ "num_tokens": 16519391.0,
1230
+ "step": 605
1231
+ },
1232
+ {
1233
+ "entropy": 0.6223879218101501,
1234
+ "epoch": 0.46107331821617537,
1235
+ "grad_norm": 0.6328125,
1236
+ "learning_rate": 6.545084971874738e-06,
1237
+ "loss": 0.0068,
1238
+ "mean_token_accuracy": 0.9979070544242858,
1239
+ "num_tokens": 16654490.0,
1240
+ "step": 610
1241
+ },
1242
+ {
1243
+ "entropy": 0.6202489614486695,
1244
+ "epoch": 0.46485260770975056,
1245
+ "grad_norm": 3.453125,
1246
+ "learning_rate": 6.482182606167409e-06,
1247
+ "loss": 0.0062,
1248
+ "mean_token_accuracy": 0.997891879081726,
1249
+ "num_tokens": 16792155.0,
1250
+ "step": 615
1251
+ },
1252
+ {
1253
+ "entropy": 0.619624662399292,
1254
+ "epoch": 0.46863189720332576,
1255
+ "grad_norm": 2.59375,
1256
+ "learning_rate": 6.419021989920416e-06,
1257
+ "loss": 0.0025,
1258
+ "mean_token_accuracy": 0.9989583373069764,
1259
+ "num_tokens": 16929384.0,
1260
+ "step": 620
1261
+ },
1262
+ {
1263
+ "entropy": 0.6166529059410095,
1264
+ "epoch": 0.47241118669690096,
1265
+ "grad_norm": 1.9453125,
1266
+ "learning_rate": 6.355614128028277e-06,
1267
+ "loss": 0.0104,
1268
+ "mean_token_accuracy": 0.99738609790802,
1269
+ "num_tokens": 17065893.0,
1270
+ "step": 625
1271
+ },
1272
+ {
1273
+ "entropy": 0.6209856390953064,
1274
+ "epoch": 0.47619047619047616,
1275
+ "grad_norm": 1.8203125,
1276
+ "learning_rate": 6.291970068464755e-06,
1277
+ "loss": 0.0086,
1278
+ "mean_token_accuracy": 0.998442780971527,
1279
+ "num_tokens": 17201908.0,
1280
+ "step": 630
1281
+ },
1282
+ {
1283
+ "entropy": 0.6346417427062988,
1284
+ "epoch": 0.4799697656840514,
1285
+ "grad_norm": 0.1435546875,
1286
+ "learning_rate": 6.228100900357914e-06,
1287
+ "loss": 0.0098,
1288
+ "mean_token_accuracy": 0.9994791626930237,
1289
+ "num_tokens": 17338449.0,
1290
+ "step": 635
1291
+ },
1292
+ {
1293
+ "entropy": 0.6447597503662109,
1294
+ "epoch": 0.4837490551776266,
1295
+ "grad_norm": 0.59375,
1296
+ "learning_rate": 6.164017752057972e-06,
1297
+ "loss": 0.0023,
1298
+ "mean_token_accuracy": 0.9994764447212219,
1299
+ "num_tokens": 17474392.0,
1300
+ "step": 640
1301
+ },
1302
+ {
1303
+ "entropy": 0.6463032722473144,
1304
+ "epoch": 0.4875283446712018,
1305
+ "grad_norm": 2.375,
1306
+ "learning_rate": 6.099731789198344e-06,
1307
+ "loss": 0.0069,
1308
+ "mean_token_accuracy": 0.9979228138923645,
1309
+ "num_tokens": 17610576.0,
1310
+ "step": 645
1311
+ },
1312
+ {
1313
+ "entropy": 0.6476381182670593,
1314
+ "epoch": 0.491307634164777,
1315
+ "grad_norm": 3.203125,
1316
+ "learning_rate": 6.035254212750172e-06,
1317
+ "loss": 0.0031,
1318
+ "mean_token_accuracy": 0.9994845390319824,
1319
+ "num_tokens": 17748320.0,
1320
+ "step": 650
1321
+ },
1322
+ {
1323
+ "entropy": 0.6472541451454162,
1324
+ "epoch": 0.4950869236583522,
1325
+ "grad_norm": 4.0,
1326
+ "learning_rate": 5.970596257070711e-06,
1327
+ "loss": 0.0076,
1328
+ "mean_token_accuracy": 0.9984358310699463,
1329
+ "num_tokens": 17884176.0,
1330
+ "step": 655
1331
+ },
1332
+ {
1333
+ "entropy": 0.6473153591156006,
1334
+ "epoch": 0.4988662131519274,
1335
+ "grad_norm": 0.318359375,
1336
+ "learning_rate": 5.905769187945889e-06,
1337
+ "loss": 0.0036,
1338
+ "mean_token_accuracy": 0.9989583134651184,
1339
+ "num_tokens": 18022027.0,
1340
+ "step": 660
1341
+ },
1342
+ {
1343
+ "epoch": 0.5003779289493575,
1344
+ "eval_entropy": 0.6450142338871956,
1345
+ "eval_loss": 0.005529946647584438,
1346
+ "eval_mean_token_accuracy": 0.9985507105787595,
1347
+ "eval_num_tokens": 18076506.0,
1348
+ "eval_runtime": 16.3105,
1349
+ "eval_samples_per_second": 34.885,
1350
+ "eval_steps_per_second": 4.414,
1351
+ "step": 662
1352
+ },
1353
+ {
1354
+ "entropy": 0.6454153895378113,
1355
+ "epoch": 0.5026455026455027,
1356
+ "grad_norm": 0.1201171875,
1357
+ "learning_rate": 5.840784300627396e-06,
1358
+ "loss": 0.0043,
1359
+ "mean_token_accuracy": 0.9989359736442566,
1360
+ "num_tokens": 18156884.0,
1361
+ "step": 665
1362
+ },
1363
+ {
1364
+ "entropy": 0.6470770359039306,
1365
+ "epoch": 0.5064247921390779,
1366
+ "grad_norm": 8.0,
1367
+ "learning_rate": 5.775652917864633e-06,
1368
+ "loss": 0.0146,
1369
+ "mean_token_accuracy": 0.9974155783653259,
1370
+ "num_tokens": 18292386.0,
1371
+ "step": 670
1372
+ },
1373
+ {
1374
+ "entropy": 0.6470168590545654,
1375
+ "epoch": 0.5102040816326531,
1376
+ "grad_norm": 0.0888671875,
1377
+ "learning_rate": 5.710386387931886e-06,
1378
+ "loss": 0.0022,
1379
+ "mean_token_accuracy": 0.9994736790657044,
1380
+ "num_tokens": 18428457.0,
1381
+ "step": 675
1382
+ },
1383
+ {
1384
+ "entropy": 0.640817391872406,
1385
+ "epoch": 0.5139833711262283,
1386
+ "grad_norm": 0.072265625,
1387
+ "learning_rate": 5.644996082651018e-06,
1388
+ "loss": 0.0006,
1389
+ "mean_token_accuracy": 1.0,
1390
+ "num_tokens": 18567506.0,
1391
+ "step": 680
1392
+ },
1393
+ {
1394
+ "entropy": 0.6382765650749207,
1395
+ "epoch": 0.5177626606198035,
1396
+ "grad_norm": 0.0576171875,
1397
+ "learning_rate": 5.579493395410105e-06,
1398
+ "loss": 0.0053,
1399
+ "mean_token_accuracy": 0.9989583253860473,
1400
+ "num_tokens": 18704091.0,
1401
+ "step": 685
1402
+ },
1403
+ {
1404
+ "entropy": 0.6377130270004272,
1405
+ "epoch": 0.5215419501133787,
1406
+ "grad_norm": 6.21875,
1407
+ "learning_rate": 5.51388973917828e-06,
1408
+ "loss": 0.0115,
1409
+ "mean_token_accuracy": 0.9979159593582153,
1410
+ "num_tokens": 18839539.0,
1411
+ "step": 690
1412
+ },
1413
+ {
1414
+ "entropy": 0.6364631175994873,
1415
+ "epoch": 0.5253212396069539,
1416
+ "grad_norm": 0.2041015625,
1417
+ "learning_rate": 5.448196544517168e-06,
1418
+ "loss": 0.0005,
1419
+ "mean_token_accuracy": 1.0,
1420
+ "num_tokens": 18978610.0,
1421
+ "step": 695
1422
+ },
1423
+ {
1424
+ "entropy": 0.6324193835258484,
1425
+ "epoch": 0.5291005291005291,
1426
+ "grad_norm": 2.09375,
1427
+ "learning_rate": 5.382425257589277e-06,
1428
+ "loss": 0.0069,
1429
+ "mean_token_accuracy": 0.9984482288360595,
1430
+ "num_tokens": 19117618.0,
1431
+ "step": 700
1432
+ },
1433
+ {
1434
+ "entropy": 0.6341057658195496,
1435
+ "epoch": 0.5328798185941043,
1436
+ "grad_norm": 1.75,
1437
+ "learning_rate": 5.316587338163649e-06,
1438
+ "loss": 0.0088,
1439
+ "mean_token_accuracy": 0.997915244102478,
1440
+ "num_tokens": 19255561.0,
1441
+ "step": 705
1442
+ },
1443
+ {
1444
+ "entropy": 0.6376821398735046,
1445
+ "epoch": 0.5366591080876795,
1446
+ "grad_norm": 0.055419921875,
1447
+ "learning_rate": 5.2506942576191466e-06,
1448
+ "loss": 0.0036,
1449
+ "mean_token_accuracy": 0.9984469294548035,
1450
+ "num_tokens": 19391688.0,
1451
+ "step": 710
1452
+ },
1453
+ {
1454
+ "entropy": 0.6377916216850281,
1455
+ "epoch": 0.5404383975812547,
1456
+ "grad_norm": 1.9375,
1457
+ "learning_rate": 5.184757496945726e-06,
1458
+ "loss": 0.0067,
1459
+ "mean_token_accuracy": 0.9989583253860473,
1460
+ "num_tokens": 19528671.0,
1461
+ "step": 715
1462
+ },
1463
+ {
1464
+ "entropy": 0.641215717792511,
1465
+ "epoch": 0.54421768707483,
1466
+ "grad_norm": 0.0322265625,
1467
+ "learning_rate": 5.118788544744016e-06,
1468
+ "loss": 0.0021,
1469
+ "mean_token_accuracy": 0.9989583253860473,
1470
+ "num_tokens": 19665410.0,
1471
+ "step": 720
1472
+ },
1473
+ {
1474
+ "entropy": 0.6453086376190186,
1475
+ "epoch": 0.5479969765684052,
1476
+ "grad_norm": 5.71875,
1477
+ "learning_rate": 5.052798895223597e-06,
1478
+ "loss": 0.0059,
1479
+ "mean_token_accuracy": 0.9984346270561218,
1480
+ "num_tokens": 19803509.0,
1481
+ "step": 725
1482
+ },
1483
+ {
1484
+ "entropy": 0.6470367074012756,
1485
+ "epoch": 0.5517762660619804,
1486
+ "grad_norm": 2.71875,
1487
+ "learning_rate": 4.986800046200278e-06,
1488
+ "loss": 0.0038,
1489
+ "mean_token_accuracy": 0.9989582180976868,
1490
+ "num_tokens": 19941001.0,
1491
+ "step": 730
1492
+ },
1493
+ {
1494
+ "entropy": 0.6464150190353394,
1495
+ "epoch": 0.5555555555555556,
1496
+ "grad_norm": 0.044189453125,
1497
+ "learning_rate": 4.920803497092757e-06,
1498
+ "loss": 0.0017,
1499
+ "mean_token_accuracy": 0.9994805216789245,
1500
+ "num_tokens": 20076889.0,
1501
+ "step": 735
1502
+ },
1503
+ {
1504
+ "entropy": 0.6467163801193238,
1505
+ "epoch": 0.5593348450491308,
1506
+ "grad_norm": 5.4375,
1507
+ "learning_rate": 4.854820746919005e-06,
1508
+ "loss": 0.0069,
1509
+ "mean_token_accuracy": 0.9968555450439454,
1510
+ "num_tokens": 20212064.0,
1511
+ "step": 740
1512
+ },
1513
+ {
1514
+ "entropy": 0.6457946181297303,
1515
+ "epoch": 0.563114134542706,
1516
+ "grad_norm": 5.4375,
1517
+ "learning_rate": 4.78886329229271e-06,
1518
+ "loss": 0.0099,
1519
+ "mean_token_accuracy": 0.9974038600921631,
1520
+ "num_tokens": 20348078.0,
1521
+ "step": 745
1522
+ },
1523
+ {
1524
+ "entropy": 0.6479183673858643,
1525
+ "epoch": 0.5668934240362812,
1526
+ "grad_norm": 1.5390625,
1527
+ "learning_rate": 4.7229426254201504e-06,
1528
+ "loss": 0.0047,
1529
+ "mean_token_accuracy": 0.9984535336494446,
1530
+ "num_tokens": 20483248.0,
1531
+ "step": 750
1532
+ },
1533
+ {
1534
+ "entropy": 0.6519662022590638,
1535
+ "epoch": 0.5706727135298564,
1536
+ "grad_norm": 0.1123046875,
1537
+ "learning_rate": 4.657070232097832e-06,
1538
+ "loss": 0.0005,
1539
+ "mean_token_accuracy": 1.0,
1540
+ "num_tokens": 20619605.0,
1541
+ "step": 755
1542
+ },
1543
+ {
1544
+ "entropy": 0.6520264983177185,
1545
+ "epoch": 0.5744520030234316,
1546
+ "grad_norm": 0.1875,
1547
+ "learning_rate": 4.591257589711233e-06,
1548
+ "loss": 0.0009,
1549
+ "mean_token_accuracy": 1.0,
1550
+ "num_tokens": 20756384.0,
1551
+ "step": 760
1552
+ },
1553
+ {
1554
+ "entropy": 0.6485134243965149,
1555
+ "epoch": 0.5782312925170068,
1556
+ "grad_norm": 0.318359375,
1557
+ "learning_rate": 4.525516165235034e-06,
1558
+ "loss": 0.0041,
1559
+ "mean_token_accuracy": 0.9994832038879394,
1560
+ "num_tokens": 20892551.0,
1561
+ "step": 765
1562
+ },
1563
+ {
1564
+ "entropy": 0.6489724397659302,
1565
+ "epoch": 0.582010582010582,
1566
+ "grad_norm": 0.15234375,
1567
+ "learning_rate": 4.45985741323514e-06,
1568
+ "loss": 0.0002,
1569
+ "mean_token_accuracy": 1.0,
1570
+ "num_tokens": 21028519.0,
1571
+ "step": 770
1572
+ },
1573
+ {
1574
+ "entropy": 0.644879674911499,
1575
+ "epoch": 0.5857898715041572,
1576
+ "grad_norm": 0.146484375,
1577
+ "learning_rate": 4.394292773872882e-06,
1578
+ "loss": 0.0092,
1579
+ "mean_token_accuracy": 0.9979193449020386,
1580
+ "num_tokens": 21165463.0,
1581
+ "step": 775
1582
+ },
1583
+ {
1584
+ "entropy": 0.6453619122505188,
1585
+ "epoch": 0.5895691609977324,
1586
+ "grad_norm": 0.07421875,
1587
+ "learning_rate": 4.3288336709117246e-06,
1588
+ "loss": 0.0019,
1589
+ "mean_token_accuracy": 0.9989473581314087,
1590
+ "num_tokens": 21302633.0,
1591
+ "step": 780
1592
+ },
1593
+ {
1594
+ "entropy": 0.6452991962432861,
1595
+ "epoch": 0.5933484504913077,
1596
+ "grad_norm": 0.6328125,
1597
+ "learning_rate": 4.263491509726812e-06,
1598
+ "loss": 0.0036,
1599
+ "mean_token_accuracy": 0.9989500641822815,
1600
+ "num_tokens": 21440200.0,
1601
+ "step": 785
1602
+ },
1603
+ {
1604
+ "entropy": 0.646388852596283,
1605
+ "epoch": 0.5971277399848829,
1606
+ "grad_norm": 0.06298828125,
1607
+ "learning_rate": 4.198277675317758e-06,
1608
+ "loss": 0.0021,
1609
+ "mean_token_accuracy": 0.9994832038879394,
1610
+ "num_tokens": 21575128.0,
1611
+ "step": 790
1612
+ },
1613
+ {
1614
+ "entropy": 0.6419386029243469,
1615
+ "epoch": 0.6009070294784581,
1616
+ "grad_norm": 0.306640625,
1617
+ "learning_rate": 4.133203530324938e-06,
1618
+ "loss": 0.0028,
1619
+ "mean_token_accuracy": 0.9994871735572814,
1620
+ "num_tokens": 21712652.0,
1621
+ "step": 795
1622
+ },
1623
+ {
1624
+ "entropy": 0.6412655353546143,
1625
+ "epoch": 0.6046863189720333,
1626
+ "grad_norm": 1.0625,
1627
+ "learning_rate": 4.068280413049715e-06,
1628
+ "loss": 0.0025,
1629
+ "mean_token_accuracy": 0.9989610433578491,
1630
+ "num_tokens": 21849921.0,
1631
+ "step": 800
1632
+ },
1633
+ {
1634
+ "entropy": 0.6409095525741577,
1635
+ "epoch": 0.6084656084656085,
1636
+ "grad_norm": 0.1494140625,
1637
+ "learning_rate": 4.003519635478889e-06,
1638
+ "loss": 0.0034,
1639
+ "mean_token_accuracy": 0.9994764447212219,
1640
+ "num_tokens": 21987599.0,
1641
+ "step": 805
1642
+ },
1643
+ {
1644
+ "entropy": 0.6357825875282288,
1645
+ "epoch": 0.6122448979591837,
1646
+ "grad_norm": 0.400390625,
1647
+ "learning_rate": 3.938932481313725e-06,
1648
+ "loss": 0.0004,
1649
+ "mean_token_accuracy": 1.0,
1650
+ "num_tokens": 22125270.0,
1651
+ "step": 810
1652
+ },
1653
+ {
1654
+ "entropy": 0.6361251711845398,
1655
+ "epoch": 0.6160241874527589,
1656
+ "grad_norm": 0.359375,
1657
+ "learning_rate": 3.874530204003937e-06,
1658
+ "loss": 0.0006,
1659
+ "mean_token_accuracy": 1.0,
1660
+ "num_tokens": 22259815.0,
1661
+ "step": 815
1662
+ },
1663
+ {
1664
+ "entropy": 0.6323129415512085,
1665
+ "epoch": 0.6198034769463341,
1666
+ "grad_norm": 1.0234375,
1667
+ "learning_rate": 3.8103240247869077e-06,
1668
+ "loss": 0.0051,
1669
+ "mean_token_accuracy": 0.9979180097579956,
1670
+ "num_tokens": 22395743.0,
1671
+ "step": 820
1672
+ },
1673
+ {
1674
+ "entropy": 0.6306747913360595,
1675
+ "epoch": 0.6235827664399093,
1676
+ "grad_norm": 0.00927734375,
1677
+ "learning_rate": 3.7463251307325432e-06,
1678
+ "loss": 0.0032,
1679
+ "mean_token_accuracy": 0.9994818687438964,
1680
+ "num_tokens": 22531904.0,
1681
+ "step": 825
1682
+ },
1683
+ {
1684
+ "entropy": 0.6301920533180236,
1685
+ "epoch": 0.6273620559334845,
1686
+ "grad_norm": 0.07421875,
1687
+ "learning_rate": 3.682544672794085e-06,
1688
+ "loss": 0.0054,
1689
+ "mean_token_accuracy": 0.998952841758728,
1690
+ "num_tokens": 22668840.0,
1691
+ "step": 830
1692
+ },
1693
+ {
1694
+ "entropy": 0.632958173751831,
1695
+ "epoch": 0.6311413454270597,
1696
+ "grad_norm": 0.859375,
1697
+ "learning_rate": 3.618993763865185e-06,
1698
+ "loss": 0.0024,
1699
+ "mean_token_accuracy": 0.9994778037071228,
1700
+ "num_tokens": 22805120.0,
1701
+ "step": 835
1702
+ },
1703
+ {
1704
+ "entropy": 0.6310474514961243,
1705
+ "epoch": 0.6349206349206349,
1706
+ "grad_norm": 0.1953125,
1707
+ "learning_rate": 3.5556834768436498e-06,
1708
+ "loss": 0.0004,
1709
+ "mean_token_accuracy": 1.0,
1710
+ "num_tokens": 22940501.0,
1711
+ "step": 840
1712
+ },
1713
+ {
1714
+ "entropy": 0.6288277864456177,
1715
+ "epoch": 0.6386999244142101,
1716
+ "grad_norm": 0.05615234375,
1717
+ "learning_rate": 3.4926248427021293e-06,
1718
+ "loss": 0.0026,
1719
+ "mean_token_accuracy": 0.9994910955429077,
1720
+ "num_tokens": 23077572.0,
1721
+ "step": 845
1722
+ },
1723
+ {
1724
+ "entropy": 0.6270468473434448,
1725
+ "epoch": 0.6424792139077853,
1726
+ "grad_norm": 0.095703125,
1727
+ "learning_rate": 3.4298288485661147e-06,
1728
+ "loss": 0.0008,
1729
+ "mean_token_accuracy": 0.9994778037071228,
1730
+ "num_tokens": 23213864.0,
1731
+ "step": 850
1732
+ },
1733
+ {
1734
+ "entropy": 0.6278021454811096,
1735
+ "epoch": 0.6462585034013606,
1736
+ "grad_norm": 2.734375,
1737
+ "learning_rate": 3.3673064357995844e-06,
1738
+ "loss": 0.0067,
1739
+ "mean_token_accuracy": 0.9979260802268982,
1740
+ "num_tokens": 23351632.0,
1741
+ "step": 855
1742
+ },
1743
+ {
1744
+ "entropy": 0.6229908108711243,
1745
+ "epoch": 0.6500377928949358,
1746
+ "grad_norm": 8.0,
1747
+ "learning_rate": 3.3050684980986105e-06,
1748
+ "loss": 0.003,
1749
+ "mean_token_accuracy": 0.9984301805496216,
1750
+ "num_tokens": 23486525.0,
1751
+ "step": 860
1752
+ },
1753
+ {
1754
+ "entropy": 0.6223344326019287,
1755
+ "epoch": 0.653817082388511,
1756
+ "grad_norm": 0.1806640625,
1757
+ "learning_rate": 3.2431258795932863e-06,
1758
+ "loss": 0.0024,
1759
+ "mean_token_accuracy": 0.9984441995620728,
1760
+ "num_tokens": 23624986.0,
1761
+ "step": 865
1762
+ },
1763
+ {
1764
+ "entropy": 0.6258201241493225,
1765
+ "epoch": 0.6575963718820862,
1766
+ "grad_norm": 3.03125,
1767
+ "learning_rate": 3.181489372958272e-06,
1768
+ "loss": 0.0042,
1769
+ "mean_token_accuracy": 0.9989372611045837,
1770
+ "num_tokens": 23762002.0,
1771
+ "step": 870
1772
+ },
1773
+ {
1774
+ "entropy": 0.6200236201286315,
1775
+ "epoch": 0.6613756613756614,
1776
+ "grad_norm": 0.11767578125,
1777
+ "learning_rate": 3.120169717532319e-06,
1778
+ "loss": 0.0047,
1779
+ "mean_token_accuracy": 0.9994845390319824,
1780
+ "num_tokens": 23897588.0,
1781
+ "step": 875
1782
+ },
1783
+ {
1784
+ "entropy": 0.6229296207427979,
1785
+ "epoch": 0.6651549508692366,
1786
+ "grad_norm": 6.90625,
1787
+ "learning_rate": 3.059177597447087e-06,
1788
+ "loss": 0.0063,
1789
+ "mean_token_accuracy": 0.9984667658805847,
1790
+ "num_tokens": 24036641.0,
1791
+ "step": 880
1792
+ },
1793
+ {
1794
+ "entropy": 0.6244211196899414,
1795
+ "epoch": 0.6689342403628118,
1796
+ "grad_norm": 0.029052734375,
1797
+ "learning_rate": 2.9985236397655726e-06,
1798
+ "loss": 0.0131,
1799
+ "mean_token_accuracy": 0.9974199652671814,
1800
+ "num_tokens": 24171544.0,
1801
+ "step": 885
1802
+ },
1803
+ {
1804
+ "entropy": 0.6283087849617004,
1805
+ "epoch": 0.672713529856387,
1806
+ "grad_norm": 0.054931640625,
1807
+ "learning_rate": 2.9382184126304834e-06,
1808
+ "loss": 0.0013,
1809
+ "mean_token_accuracy": 0.9994858622550964,
1810
+ "num_tokens": 24309060.0,
1811
+ "step": 890
1812
+ },
1813
+ {
1814
+ "entropy": 0.6263309836387634,
1815
+ "epoch": 0.6764928193499622,
1816
+ "grad_norm": 0.00958251953125,
1817
+ "learning_rate": 2.8782724234228876e-06,
1818
+ "loss": 0.0029,
1819
+ "mean_token_accuracy": 0.9989610075950622,
1820
+ "num_tokens": 24444653.0,
1821
+ "step": 895
1822
+ },
1823
+ {
1824
+ "entropy": 0.6234323501586914,
1825
+ "epoch": 0.6802721088435374,
1826
+ "grad_norm": 0.05419921875,
1827
+ "learning_rate": 2.818696116931431e-06,
1828
+ "loss": 0.0005,
1829
+ "mean_token_accuracy": 1.0,
1830
+ "num_tokens": 24580885.0,
1831
+ "step": 900
1832
+ },
1833
+ {
1834
+ "entropy": 0.6275152206420899,
1835
+ "epoch": 0.6840513983371126,
1836
+ "grad_norm": 0.00946044921875,
1837
+ "learning_rate": 2.7594998735324905e-06,
1838
+ "loss": 0.0016,
1839
+ "mean_token_accuracy": 0.9994845390319824,
1840
+ "num_tokens": 24719457.0,
1841
+ "step": 905
1842
+ },
1843
+ {
1844
+ "entropy": 0.6263895153999328,
1845
+ "epoch": 0.6878306878306878,
1846
+ "grad_norm": 0.06494140625,
1847
+ "learning_rate": 2.7006940073815136e-06,
1848
+ "loss": 0.0011,
1849
+ "mean_token_accuracy": 1.0,
1850
+ "num_tokens": 24856055.0,
1851
+ "step": 910
1852
+ },
1853
+ {
1854
+ "entropy": 0.6252614855766296,
1855
+ "epoch": 0.691609977324263,
1856
+ "grad_norm": 1.0234375,
1857
+ "learning_rate": 2.6422887646159234e-06,
1858
+ "loss": 0.0063,
1859
+ "mean_token_accuracy": 0.997401237487793,
1860
+ "num_tokens": 24992445.0,
1861
+ "step": 915
1862
+ },
1863
+ {
1864
+ "entropy": 0.6215481519699096,
1865
+ "epoch": 0.6953892668178382,
1866
+ "grad_norm": 11.75,
1867
+ "learning_rate": 2.584294321569862e-06,
1868
+ "loss": 0.0035,
1869
+ "mean_token_accuracy": 0.9989569306373596,
1870
+ "num_tokens": 25128519.0,
1871
+ "step": 920
1872
+ },
1873
+ {
1874
+ "entropy": 0.6217891812324524,
1875
+ "epoch": 0.6991685563114135,
1876
+ "grad_norm": 0.01300048828125,
1877
+ "learning_rate": 2.526720783001107e-06,
1878
+ "loss": 0.0012,
1879
+ "mean_token_accuracy": 0.9994858622550964,
1880
+ "num_tokens": 25266464.0,
1881
+ "step": 925
1882
+ },
1883
+ {
1884
+ "entropy": 0.6230118870735168,
1885
+ "epoch": 0.7029478458049887,
1886
+ "grad_norm": 0.068359375,
1887
+ "learning_rate": 2.469578180330444e-06,
1888
+ "loss": 0.0026,
1889
+ "mean_token_accuracy": 0.998940396308899,
1890
+ "num_tokens": 25402715.0,
1891
+ "step": 930
1892
+ },
1893
+ {
1894
+ "entropy": 0.6251353025436401,
1895
+ "epoch": 0.7067271352985639,
1896
+ "grad_norm": 0.01531982421875,
1897
+ "learning_rate": 2.4128764698938297e-06,
1898
+ "loss": 0.0023,
1899
+ "mean_token_accuracy": 0.9994764447212219,
1900
+ "num_tokens": 25539042.0,
1901
+ "step": 935
1902
+ },
1903
+ {
1904
+ "entropy": 0.6235007047653198,
1905
+ "epoch": 0.7105064247921391,
1906
+ "grad_norm": 7.53125,
1907
+ "learning_rate": 2.356625531207638e-06,
1908
+ "loss": 0.0105,
1909
+ "mean_token_accuracy": 0.9973794341087341,
1910
+ "num_tokens": 25673951.0,
1911
+ "step": 940
1912
+ },
1913
+ {
1914
+ "entropy": 0.6224774122238159,
1915
+ "epoch": 0.7142857142857143,
1916
+ "grad_norm": 0.5234375,
1917
+ "learning_rate": 2.3008351652472714e-06,
1918
+ "loss": 0.0029,
1919
+ "mean_token_accuracy": 0.9989608526229858,
1920
+ "num_tokens": 25809956.0,
1921
+ "step": 945
1922
+ },
1923
+ {
1924
+ "entropy": 0.6207262992858886,
1925
+ "epoch": 0.7180650037792895,
1926
+ "grad_norm": 3.8125,
1927
+ "learning_rate": 2.245515092739488e-06,
1928
+ "loss": 0.0041,
1929
+ "mean_token_accuracy": 0.9984666347503662,
1930
+ "num_tokens": 25946173.0,
1931
+ "step": 950
1932
+ },
1933
+ {
1934
+ "entropy": 0.6216147780418396,
1935
+ "epoch": 0.7218442932728647,
1936
+ "grad_norm": 0.828125,
1937
+ "learning_rate": 2.1906749524686856e-06,
1938
+ "loss": 0.0046,
1939
+ "mean_token_accuracy": 0.9994805216789245,
1940
+ "num_tokens": 26082522.0,
1941
+ "step": 955
1942
+ },
1943
+ {
1944
+ "entropy": 0.6248481035232544,
1945
+ "epoch": 0.7256235827664399,
1946
+ "grad_norm": 0.09814453125,
1947
+ "learning_rate": 2.136324299597474e-06,
1948
+ "loss": 0.0068,
1949
+ "mean_token_accuracy": 0.9989418029785156,
1950
+ "num_tokens": 26220220.0,
1951
+ "step": 960
1952
+ },
1953
+ {
1954
+ "entropy": 0.6239413142204284,
1955
+ "epoch": 0.7294028722600151,
1956
+ "grad_norm": 6.625,
1957
+ "learning_rate": 2.0824726040018174e-06,
1958
+ "loss": 0.005,
1959
+ "mean_token_accuracy": 0.9984360456466674,
1960
+ "num_tokens": 26356547.0,
1961
+ "step": 965
1962
+ },
1963
+ {
1964
+ "entropy": 0.6228319883346558,
1965
+ "epoch": 0.7331821617535903,
1966
+ "grad_norm": 0.03955078125,
1967
+ "learning_rate": 2.0291292486210327e-06,
1968
+ "loss": 0.0025,
1969
+ "mean_token_accuracy": 0.9989743590354919,
1970
+ "num_tokens": 26493643.0,
1971
+ "step": 970
1972
+ },
1973
+ {
1974
+ "entropy": 0.6238638162612915,
1975
+ "epoch": 0.7369614512471655,
1976
+ "grad_norm": 0.04150390625,
1977
+ "learning_rate": 1.976303527822933e-06,
1978
+ "loss": 0.0022,
1979
+ "mean_token_accuracy": 0.9994858622550964,
1980
+ "num_tokens": 26630319.0,
1981
+ "step": 975
1982
+ },
1983
+ {
1984
+ "entropy": 0.6228212952613831,
1985
+ "epoch": 0.7407407407407407,
1986
+ "grad_norm": 0.041748046875,
1987
+ "learning_rate": 1.9240046457844223e-06,
1988
+ "loss": 0.0025,
1989
+ "mean_token_accuracy": 0.9989431500434875,
1990
+ "num_tokens": 26767214.0,
1991
+ "step": 980
1992
+ },
1993
+ {
1994
+ "entropy": 0.6239274144172668,
1995
+ "epoch": 0.7445200302343159,
1996
+ "grad_norm": 2.015625,
1997
+ "learning_rate": 1.8722417148877752e-06,
1998
+ "loss": 0.0053,
1999
+ "mean_token_accuracy": 0.9989583253860473,
2000
+ "num_tokens": 26903449.0,
2001
+ "step": 985
2002
+ },
2003
+ {
2004
+ "entropy": 0.6227865815162659,
2005
+ "epoch": 0.7482993197278912,
2006
+ "grad_norm": 0.046875,
2007
+ "learning_rate": 1.821023754132933e-06,
2008
+ "loss": 0.0081,
2009
+ "mean_token_accuracy": 0.9974170565605164,
2010
+ "num_tokens": 27041212.0,
2011
+ "step": 990
2012
+ },
2013
+ {
2014
+ "entropy": 0.6259052991867066,
2015
+ "epoch": 0.7520786092214664,
2016
+ "grad_norm": 4.03125,
2017
+ "learning_rate": 1.7703596875660645e-06,
2018
+ "loss": 0.0097,
2019
+ "mean_token_accuracy": 0.9979274153709412,
2020
+ "num_tokens": 27177799.0,
2021
+ "step": 995
2022
+ },
2023
+ {
2024
+ "entropy": 0.6243377447128295,
2025
+ "epoch": 0.7558578987150416,
2026
+ "grad_norm": 5.25,
2027
+ "learning_rate": 1.7202583427246633e-06,
2028
+ "loss": 0.0073,
2029
+ "mean_token_accuracy": 0.9968909025192261,
2030
+ "num_tokens": 27315021.0,
2031
+ "step": 1000
2032
+ },
2033
+ {
2034
+ "entropy": 0.6245812058448792,
2035
+ "epoch": 0.7596371882086168,
2036
+ "grad_norm": 2.4375,
2037
+ "learning_rate": 1.6707284490994746e-06,
2038
+ "loss": 0.0078,
2039
+ "mean_token_accuracy": 0.997928786277771,
2040
+ "num_tokens": 27450263.0,
2041
+ "step": 1005
2042
+ },
2043
+ {
2044
+ "entropy": 0.6255474805831909,
2045
+ "epoch": 0.763416477702192,
2046
+ "grad_norm": 2.28125,
2047
+ "learning_rate": 1.6217786366134953e-06,
2048
+ "loss": 0.0041,
2049
+ "mean_token_accuracy": 0.9994791626930237,
2050
+ "num_tokens": 27586816.0,
2051
+ "step": 1010
2052
+ },
2053
+ {
2054
+ "entropy": 0.6291083574295044,
2055
+ "epoch": 0.7671957671957672,
2056
+ "grad_norm": 0.05419921875,
2057
+ "learning_rate": 1.5734174341183284e-06,
2058
+ "loss": 0.0014,
2059
+ "mean_token_accuracy": 0.9994750618934631,
2060
+ "num_tokens": 27722646.0,
2061
+ "step": 1015
2062
+ },
2063
+ {
2064
+ "entropy": 0.6277247428894043,
2065
+ "epoch": 0.7709750566893424,
2066
+ "grad_norm": 0.25,
2067
+ "learning_rate": 1.525653267908132e-06,
2068
+ "loss": 0.0007,
2069
+ "mean_token_accuracy": 1.0,
2070
+ "num_tokens": 27860053.0,
2071
+ "step": 1020
2072
+ },
2073
+ {
2074
+ "entropy": 0.6258515357971192,
2075
+ "epoch": 0.7747543461829176,
2076
+ "grad_norm": 0.1376953125,
2077
+ "learning_rate": 1.4784944602514662e-06,
2078
+ "loss": 0.0039,
2079
+ "mean_token_accuracy": 0.9989677429199219,
2080
+ "num_tokens": 27998175.0,
2081
+ "step": 1025
2082
+ },
2083
+ {
2084
+ "entropy": 0.6294663429260254,
2085
+ "epoch": 0.7785336356764928,
2086
+ "grad_norm": 0.044677734375,
2087
+ "learning_rate": 1.4319492279412388e-06,
2088
+ "loss": 0.0014,
2089
+ "mean_token_accuracy": 0.9994652390480041,
2090
+ "num_tokens": 28132887.0,
2091
+ "step": 1030
2092
+ },
2093
+ {
2094
+ "entropy": 0.6290977954864502,
2095
+ "epoch": 0.782312925170068,
2096
+ "grad_norm": 5.3125,
2097
+ "learning_rate": 1.3860256808630429e-06,
2098
+ "loss": 0.002,
2099
+ "mean_token_accuracy": 0.9994910955429077,
2100
+ "num_tokens": 28268986.0,
2101
+ "step": 1035
2102
+ },
2103
+ {
2104
+ "entropy": 0.6268069744110107,
2105
+ "epoch": 0.7860922146636432,
2106
+ "grad_norm": 4.15625,
2107
+ "learning_rate": 1.340731820582114e-06,
2108
+ "loss": 0.0085,
2109
+ "mean_token_accuracy": 0.9984454393386841,
2110
+ "num_tokens": 28406196.0,
2111
+ "step": 1040
2112
+ },
2113
+ {
2114
+ "entropy": 0.6279515743255615,
2115
+ "epoch": 0.7898715041572184,
2116
+ "grad_norm": 0.12353515625,
2117
+ "learning_rate": 1.2960755389491703e-06,
2118
+ "loss": 0.0057,
2119
+ "mean_token_accuracy": 0.9989556193351745,
2120
+ "num_tokens": 28543139.0,
2121
+ "step": 1045
2122
+ },
2123
+ {
2124
+ "entropy": 0.6263696193695069,
2125
+ "epoch": 0.7936507936507936,
2126
+ "grad_norm": 0.07763671875,
2127
+ "learning_rate": 1.2520646167253514e-06,
2128
+ "loss": 0.0041,
2129
+ "mean_token_accuracy": 0.9994845390319824,
2130
+ "num_tokens": 28679153.0,
2131
+ "step": 1050
2132
+ },
2133
+ {
2134
+ "entropy": 0.6293219327926636,
2135
+ "epoch": 0.7974300831443688,
2136
+ "grad_norm": 0.12158203125,
2137
+ "learning_rate": 1.2087067222265409e-06,
2138
+ "loss": 0.0006,
2139
+ "mean_token_accuracy": 1.0,
2140
+ "num_tokens": 28815977.0,
2141
+ "step": 1055
2142
+ },
2143
+ {
2144
+ "entropy": 0.6283244490623474,
2145
+ "epoch": 0.8012093726379441,
2146
+ "grad_norm": 2.15625,
2147
+ "learning_rate": 1.166009409987251e-06,
2148
+ "loss": 0.0008,
2149
+ "mean_token_accuracy": 1.0,
2150
+ "num_tokens": 28950890.0,
2151
+ "step": 1060
2152
+ },
2153
+ {
2154
+ "entropy": 0.6297204375267029,
2155
+ "epoch": 0.8049886621315193,
2156
+ "grad_norm": 0.86328125,
2157
+ "learning_rate": 1.1239801194443507e-06,
2158
+ "loss": 0.0039,
2159
+ "mean_token_accuracy": 0.998441469669342,
2160
+ "num_tokens": 29088081.0,
2161
+ "step": 1065
2162
+ },
2163
+ {
2164
+ "entropy": 0.6257842779159546,
2165
+ "epoch": 0.8087679516250945,
2166
+ "grad_norm": 1.453125,
2167
+ "learning_rate": 1.082626173640846e-06,
2168
+ "loss": 0.0072,
2169
+ "mean_token_accuracy": 0.9989416837692261,
2170
+ "num_tokens": 29227491.0,
2171
+ "step": 1070
2172
+ },
2173
+ {
2174
+ "entropy": 0.6270438313484192,
2175
+ "epoch": 0.8125472411186697,
2176
+ "grad_norm": 2.34375,
2177
+ "learning_rate": 1.0419547779499283e-06,
2178
+ "loss": 0.0065,
2179
+ "mean_token_accuracy": 0.9984452605247498,
2180
+ "num_tokens": 29363388.0,
2181
+ "step": 1075
2182
+ },
2183
+ {
2184
+ "entropy": 0.625947892665863,
2185
+ "epoch": 0.8163265306122449,
2186
+ "grad_norm": 5.09375,
2187
+ "learning_rate": 1.0019730188195464e-06,
2188
+ "loss": 0.0055,
2189
+ "mean_token_accuracy": 0.9984427571296692,
2190
+ "num_tokens": 29498004.0,
2191
+ "step": 1080
2192
+ },
2193
+ {
2194
+ "entropy": 0.6268860936164856,
2195
+ "epoch": 0.8201058201058201,
2196
+ "grad_norm": 3.921875,
2197
+ "learning_rate": 9.626878625376784e-07,
2198
+ "loss": 0.0076,
2199
+ "mean_token_accuracy": 0.9984326839447022,
2200
+ "num_tokens": 29634999.0,
2201
+ "step": 1085
2202
+ },
2203
+ {
2204
+ "entropy": 0.6276703119277954,
2205
+ "epoch": 0.8238851095993953,
2206
+ "grad_norm": 0.0291748046875,
2207
+ "learning_rate": 9.241061540185547e-07,
2208
+ "loss": 0.0009,
2209
+ "mean_token_accuracy": 0.9994778037071228,
2210
+ "num_tokens": 29773857.0,
2211
+ "step": 1090
2212
+ },
2213
+ {
2214
+ "entropy": 0.6304038643836976,
2215
+ "epoch": 0.8276643990929705,
2216
+ "grad_norm": 0.0185546875,
2217
+ "learning_rate": 8.862346156100188e-07,
2218
+ "loss": 0.0021,
2219
+ "mean_token_accuracy": 0.9989361643791199,
2220
+ "num_tokens": 29910413.0,
2221
+ "step": 1095
2222
+ },
2223
+ {
2224
+ "entropy": 0.6274094462394715,
2225
+ "epoch": 0.8314436885865457,
2226
+ "grad_norm": 2.328125,
2227
+ "learning_rate": 8.490798459222477e-07,
2228
+ "loss": 0.0006,
2229
+ "mean_token_accuracy": 1.0,
2230
+ "num_tokens": 30047404.0,
2231
+ "step": 1100
2232
+ },
2233
+ {
2234
+ "entropy": 0.6275372743606568,
2235
+ "epoch": 0.8352229780801209,
2236
+ "grad_norm": 0.33984375,
2237
+ "learning_rate": 8.12648318678036e-07,
2238
+ "loss": 0.0061,
2239
+ "mean_token_accuracy": 0.9984293103218078,
2240
+ "num_tokens": 30183552.0,
2241
+ "step": 1105
2242
+ },
2243
+ {
2244
+ "entropy": 0.6304641485214233,
2245
+ "epoch": 0.8390022675736961,
2246
+ "grad_norm": 0.09228515625,
2247
+ "learning_rate": 7.769463815848344e-07,
2248
+ "loss": 0.0038,
2249
+ "mean_token_accuracy": 0.9994736790657044,
2250
+ "num_tokens": 30319430.0,
2251
+ "step": 1110
2252
+ },
2253
+ {
2254
+ "entropy": 0.6279330015182495,
2255
+ "epoch": 0.8427815570672713,
2256
+ "grad_norm": 10.1875,
2257
+ "learning_rate": 7.419802552287453e-07,
2258
+ "loss": 0.011,
2259
+ "mean_token_accuracy": 0.9979191064834595,
2260
+ "num_tokens": 30457112.0,
2261
+ "step": 1115
2262
+ },
2263
+ {
2264
+ "entropy": 0.6278607010841369,
2265
+ "epoch": 0.8465608465608465,
2266
+ "grad_norm": 0.73046875,
2267
+ "learning_rate": 7.077560319906696e-07,
2268
+ "loss": 0.0092,
2269
+ "mean_token_accuracy": 0.9979125261306763,
2270
+ "num_tokens": 30593910.0,
2271
+ "step": 1120
2272
+ },
2273
+ {
2274
+ "entropy": 0.6293578505516052,
2275
+ "epoch": 0.8503401360544217,
2276
+ "grad_norm": 0.427734375,
2277
+ "learning_rate": 6.742796749847908e-07,
2278
+ "loss": 0.0017,
2279
+ "mean_token_accuracy": 0.9994884967803955,
2280
+ "num_tokens": 30730370.0,
2281
+ "step": 1125
2282
+ },
2283
+ {
2284
+ "entropy": 0.6294125199317933,
2285
+ "epoch": 0.854119425547997,
2286
+ "grad_norm": 0.01611328125,
2287
+ "learning_rate": 6.415570170195801e-07,
2288
+ "loss": 0.0001,
2289
+ "mean_token_accuracy": 1.0,
2290
+ "num_tokens": 30864681.0,
2291
+ "step": 1130
2292
+ },
2293
+ {
2294
+ "entropy": 0.6303628087043762,
2295
+ "epoch": 0.8578987150415722,
2296
+ "grad_norm": 0.5859375,
2297
+ "learning_rate": 6.095937595815104e-07,
2298
+ "loss": 0.0005,
2299
+ "mean_token_accuracy": 1.0,
2300
+ "num_tokens": 30999618.0,
2301
+ "step": 1135
2302
+ },
2303
+ {
2304
+ "entropy": 0.6290588498115539,
2305
+ "epoch": 0.8616780045351474,
2306
+ "grad_norm": 20.875,
2307
+ "learning_rate": 5.783954718416468e-07,
2308
+ "loss": 0.012,
2309
+ "mean_token_accuracy": 0.9984236717224121,
2310
+ "num_tokens": 31136806.0,
2311
+ "step": 1140
2312
+ },
2313
+ {
2314
+ "entropy": 0.6272745132446289,
2315
+ "epoch": 0.8654572940287226,
2316
+ "grad_norm": 0.08447265625,
2317
+ "learning_rate": 5.47967589685292e-07,
2318
+ "loss": 0.0034,
2319
+ "mean_token_accuracy": 0.9994764447212219,
2320
+ "num_tokens": 31272561.0,
2321
+ "step": 1145
2322
+ },
2323
+ {
2324
+ "entropy": 0.6306499123573304,
2325
+ "epoch": 0.8692365835222978,
2326
+ "grad_norm": 3.265625,
2327
+ "learning_rate": 5.183154147648578e-07,
2328
+ "loss": 0.0006,
2329
+ "mean_token_accuracy": 1.0,
2330
+ "num_tokens": 31407553.0,
2331
+ "step": 1150
2332
+ },
2333
+ {
2334
+ "entropy": 0.6274000406265259,
2335
+ "epoch": 0.873015873015873,
2336
+ "grad_norm": 2.140625,
2337
+ "learning_rate": 4.894441135761197e-07,
2338
+ "loss": 0.0025,
2339
+ "mean_token_accuracy": 0.998951506614685,
2340
+ "num_tokens": 31543669.0,
2341
+ "step": 1155
2342
+ },
2343
+ {
2344
+ "entropy": 0.6274757504463195,
2345
+ "epoch": 0.8767951625094482,
2346
+ "grad_norm": 6.3125,
2347
+ "learning_rate": 4.613587165580269e-07,
2348
+ "loss": 0.0027,
2349
+ "mean_token_accuracy": 0.998970365524292,
2350
+ "num_tokens": 31679842.0,
2351
+ "step": 1160
2352
+ },
2353
+ {
2354
+ "entropy": 0.6302230358123779,
2355
+ "epoch": 0.8805744520030234,
2356
+ "grad_norm": 0.10107421875,
2357
+ "learning_rate": 4.34064117216218e-07,
2358
+ "loss": 0.0002,
2359
+ "mean_token_accuracy": 1.0,
2360
+ "num_tokens": 31817091.0,
2361
+ "step": 1165
2362
+ },
2363
+ {
2364
+ "entropy": 0.6279751658439636,
2365
+ "epoch": 0.8843537414965986,
2366
+ "grad_norm": 3.921875,
2367
+ "learning_rate": 4.0756507127038494e-07,
2368
+ "loss": 0.0075,
2369
+ "mean_token_accuracy": 0.9979239463806152,
2370
+ "num_tokens": 31953781.0,
2371
+ "step": 1170
2372
+ },
2373
+ {
2374
+ "entropy": 0.627412760257721,
2375
+ "epoch": 0.8881330309901738,
2376
+ "grad_norm": 1.171875,
2377
+ "learning_rate": 3.8186619582565974e-07,
2378
+ "loss": 0.0023,
2379
+ "mean_token_accuracy": 0.998963725566864,
2380
+ "num_tokens": 32091277.0,
2381
+ "step": 1175
2382
+ },
2383
+ {
2384
+ "entropy": 0.6293660879135132,
2385
+ "epoch": 0.891912320483749,
2386
+ "grad_norm": 0.06640625,
2387
+ "learning_rate": 3.569719685681405e-07,
2388
+ "loss": 0.0017,
2389
+ "mean_token_accuracy": 0.9989610433578491,
2390
+ "num_tokens": 32227162.0,
2391
+ "step": 1180
2392
+ },
2393
+ {
2394
+ "entropy": 0.6281276702880859,
2395
+ "epoch": 0.8956916099773242,
2396
+ "grad_norm": 0.98828125,
2397
+ "learning_rate": 3.3288672698471804e-07,
2398
+ "loss": 0.0029,
2399
+ "mean_token_accuracy": 0.9989610433578491,
2400
+ "num_tokens": 32364049.0,
2401
+ "step": 1185
2402
+ },
2403
+ {
2404
+ "entropy": 0.6279310822486878,
2405
+ "epoch": 0.8994708994708994,
2406
+ "grad_norm": 1.71875,
2407
+ "learning_rate": 3.096146676073253e-07,
2408
+ "loss": 0.0046,
2409
+ "mean_token_accuracy": 0.9978629589080811,
2410
+ "num_tokens": 32500177.0,
2411
+ "step": 1190
2412
+ },
2413
+ {
2414
+ "entropy": 0.6304468154907227,
2415
+ "epoch": 0.9032501889644747,
2416
+ "grad_norm": 2.078125,
2417
+ "learning_rate": 2.8715984528174757e-07,
2418
+ "loss": 0.0034,
2419
+ "mean_token_accuracy": 0.9979340195655823,
2420
+ "num_tokens": 32637884.0,
2421
+ "step": 1195
2422
+ },
2423
+ {
2424
+ "entropy": 0.6291935801506042,
2425
+ "epoch": 0.9070294784580499,
2426
+ "grad_norm": 0.173828125,
2427
+ "learning_rate": 2.6552617246111966e-07,
2428
+ "loss": 0.0013,
2429
+ "mean_token_accuracy": 0.9994791626930237,
2430
+ "num_tokens": 32775860.0,
2431
+ "step": 1200
2432
+ },
2433
+ {
2434
+ "entropy": 0.628238570690155,
2435
+ "epoch": 0.9108087679516251,
2436
+ "grad_norm": 0.25390625,
2437
+ "learning_rate": 2.447174185242324e-07,
2438
+ "loss": 0.0054,
2439
+ "mean_token_accuracy": 0.9984263181686401,
2440
+ "num_tokens": 32915185.0,
2441
+ "step": 1205
2442
+ },
2443
+ {
2444
+ "entropy": 0.6287149310111999,
2445
+ "epoch": 0.9145880574452003,
2446
+ "grad_norm": 0.3671875,
2447
+ "learning_rate": 2.247372091187705e-07,
2448
+ "loss": 0.0023,
2449
+ "mean_token_accuracy": 0.9994910955429077,
2450
+ "num_tokens": 33051666.0,
2451
+ "step": 1210
2452
+ },
2453
+ {
2454
+ "entropy": 0.6283146739006042,
2455
+ "epoch": 0.9183673469387755,
2456
+ "grad_norm": 1.78125,
2457
+ "learning_rate": 2.0558902552959058e-07,
2458
+ "loss": 0.0021,
2459
+ "mean_token_accuracy": 0.9994652390480041,
2460
+ "num_tokens": 33187522.0,
2461
+ "step": 1215
2462
+ },
2463
+ {
2464
+ "entropy": 0.6290141940116882,
2465
+ "epoch": 0.9221466364323507,
2466
+ "grad_norm": 0.384765625,
2467
+ "learning_rate": 1.8727620407215375e-07,
2468
+ "loss": 0.0009,
2469
+ "mean_token_accuracy": 1.0,
2470
+ "num_tokens": 33325395.0,
2471
+ "step": 1220
2472
+ },
2473
+ {
2474
+ "entropy": 0.6285704016685486,
2475
+ "epoch": 0.9259259259259259,
2476
+ "grad_norm": 4.90625,
2477
+ "learning_rate": 1.6980193551121848e-07,
2478
+ "loss": 0.0042,
2479
+ "mean_token_accuracy": 0.9984615325927735,
2480
+ "num_tokens": 33461743.0,
2481
+ "step": 1225
2482
+ },
2483
+ {
2484
+ "entropy": 0.6292840957641601,
2485
+ "epoch": 0.9297052154195011,
2486
+ "grad_norm": 1.1171875,
2487
+ "learning_rate": 1.5316926450488878e-07,
2488
+ "loss": 0.0031,
2489
+ "mean_token_accuracy": 0.9994845390319824,
2490
+ "num_tokens": 33599818.0,
2491
+ "step": 1230
2492
+ },
2493
+ {
2494
+ "entropy": 0.6308977961540222,
2495
+ "epoch": 0.9334845049130763,
2496
+ "grad_norm": 1.5390625,
2497
+ "learning_rate": 1.373810890741284e-07,
2498
+ "loss": 0.0121,
2499
+ "mean_token_accuracy": 0.9984320402145386,
2500
+ "num_tokens": 33736511.0,
2501
+ "step": 1235
2502
+ },
2503
+ {
2504
+ "entropy": 0.6314628601074219,
2505
+ "epoch": 0.9372637944066515,
2506
+ "grad_norm": 0.05224609375,
2507
+ "learning_rate": 1.22440160097817e-07,
2508
+ "loss": 0.0004,
2509
+ "mean_token_accuracy": 1.0,
2510
+ "num_tokens": 33875284.0,
2511
+ "step": 1240
2512
+ },
2513
+ {
2514
+ "entropy": 0.6279171347618103,
2515
+ "epoch": 0.9410430839002267,
2516
+ "grad_norm": 0.12353515625,
2517
+ "learning_rate": 1.0834908083344253e-07,
2518
+ "loss": 0.0004,
2519
+ "mean_token_accuracy": 1.0,
2520
+ "num_tokens": 34010539.0,
2521
+ "step": 1245
2522
+ },
2523
+ {
2524
+ "entropy": 0.6286433458328247,
2525
+ "epoch": 0.9448223733938019,
2526
+ "grad_norm": 0.09033203125,
2527
+ "learning_rate": 9.511030646352615e-08,
2528
+ "loss": 0.0057,
2529
+ "mean_token_accuracy": 0.9989795923233032,
2530
+ "num_tokens": 34146309.0,
2531
+ "step": 1250
2532
+ },
2533
+ {
2534
+ "entropy": 0.6301220536231995,
2535
+ "epoch": 0.9486016628873771,
2536
+ "grad_norm": 4.09375,
2537
+ "learning_rate": 8.27261436678306e-08,
2538
+ "loss": 0.0084,
2539
+ "mean_token_accuracy": 0.9979233145713806,
2540
+ "num_tokens": 34280942.0,
2541
+ "step": 1255
2542
+ },
2543
+ {
2544
+ "entropy": 0.6293729186058045,
2545
+ "epoch": 0.9523809523809523,
2546
+ "grad_norm": 0.265625,
2547
+ "learning_rate": 7.11987502214595e-08,
2548
+ "loss": 0.0006,
2549
+ "mean_token_accuracy": 1.0,
2550
+ "num_tokens": 34417174.0,
2551
+ "step": 1260
2552
+ },
2553
+ {
2554
+ "entropy": 0.6268362164497375,
2555
+ "epoch": 0.9561602418745276,
2556
+ "grad_norm": 1.4296875,
2557
+ "learning_rate": 6.053013461889023e-08,
2558
+ "loss": 0.0013,
2559
+ "mean_token_accuracy": 0.9994764447212219,
2560
+ "num_tokens": 34553146.0,
2561
+ "step": 1265
2562
+ },
2563
+ {
2564
+ "entropy": 0.626698911190033,
2565
+ "epoch": 0.9599395313681028,
2566
+ "grad_norm": 0.0306396484375,
2567
+ "learning_rate": 5.072215572402006e-08,
2568
+ "loss": 0.0009,
2569
+ "mean_token_accuracy": 0.9994832038879394,
2570
+ "num_tokens": 34690822.0,
2571
+ "step": 1270
2572
+ },
2573
+ {
2574
+ "entropy": 0.6287196755409241,
2575
+ "epoch": 0.963718820861678,
2576
+ "grad_norm": 1.0859375,
2577
+ "learning_rate": 4.177652244628627e-08,
2578
+ "loss": 0.0045,
2579
+ "mean_token_accuracy": 0.9984400272369385,
2580
+ "num_tokens": 34828094.0,
2581
+ "step": 1275
2582
+ },
2583
+ {
2584
+ "entropy": 0.6283414721488952,
2585
+ "epoch": 0.9674981103552532,
2586
+ "grad_norm": 0.765625,
2587
+ "learning_rate": 3.369479344290938e-08,
2588
+ "loss": 0.0036,
2589
+ "mean_token_accuracy": 0.9989580750465393,
2590
+ "num_tokens": 34965771.0,
2591
+ "step": 1280
2592
+ },
2593
+ {
2594
+ "entropy": 0.6298310279846191,
2595
+ "epoch": 0.9712773998488284,
2596
+ "grad_norm": 0.0947265625,
2597
+ "learning_rate": 2.6478376847318687e-08,
2598
+ "loss": 0.004,
2599
+ "mean_token_accuracy": 0.9989501357078552,
2600
+ "num_tokens": 35102076.0,
2601
+ "step": 1285
2602
+ },
2603
+ {
2604
+ "entropy": 0.6282585024833679,
2605
+ "epoch": 0.9750566893424036,
2606
+ "grad_norm": 0.0296630859375,
2607
+ "learning_rate": 2.012853002380466e-08,
2608
+ "loss": 0.0061,
2609
+ "mean_token_accuracy": 0.9994832038879394,
2610
+ "num_tokens": 35237791.0,
2611
+ "step": 1290
2612
+ },
2613
+ {
2614
+ "entropy": 0.6272670507431031,
2615
+ "epoch": 0.9788359788359788,
2616
+ "grad_norm": 5.25,
2617
+ "learning_rate": 1.4646359348439165e-08,
2618
+ "loss": 0.0024,
2619
+ "mean_token_accuracy": 0.9994832038879394,
2620
+ "num_tokens": 35375040.0,
2621
+ "step": 1295
2622
+ },
2623
+ {
2624
+ "entropy": 0.629937744140625,
2625
+ "epoch": 0.982615268329554,
2626
+ "grad_norm": 3.078125,
2627
+ "learning_rate": 1.0032820016302458e-08,
2628
+ "loss": 0.0056,
2629
+ "mean_token_accuracy": 0.997934103012085,
2630
+ "num_tokens": 35512390.0,
2631
+ "step": 1300
2632
+ },
2633
+ {
2634
+ "entropy": 0.6293532013893127,
2635
+ "epoch": 0.9863945578231292,
2636
+ "grad_norm": 0.291015625,
2637
+ "learning_rate": 6.288715875057416e-09,
2638
+ "loss": 0.0021,
2639
+ "mean_token_accuracy": 0.9984112977981567,
2640
+ "num_tokens": 35651051.0,
2641
+ "step": 1305
2642
+ },
2643
+ {
2644
+ "entropy": 0.629024076461792,
2645
+ "epoch": 0.9901738473167044,
2646
+ "grad_norm": 0.083984375,
2647
+ "learning_rate": 3.41469928488547e-09,
2648
+ "loss": 0.0043,
2649
+ "mean_token_accuracy": 0.9984126091003418,
2650
+ "num_tokens": 35790379.0,
2651
+ "step": 1310
2652
+ },
2653
+ {
2654
+ "entropy": 0.6272889018058777,
2655
+ "epoch": 0.9939531368102796,
2656
+ "grad_norm": 2.140625,
2657
+ "learning_rate": 1.4112710048247436e-09,
2658
+ "loss": 0.0053,
2659
+ "mean_token_accuracy": 0.9989569664001465,
2660
+ "num_tokens": 35926573.0,
2661
+ "step": 1315
2662
+ },
2663
+ {
2664
+ "entropy": 0.6294793486595154,
2665
+ "epoch": 0.9977324263038548,
2666
+ "grad_norm": 2.828125,
2667
+ "learning_rate": 2.787801055181838e-10,
2668
+ "loss": 0.0093,
2669
+ "mean_token_accuracy": 0.9979259252548218,
2670
+ "num_tokens": 36062587.0,
2671
+ "step": 1320
2672
+ }
2673
+ ],
2674
+ "logging_steps": 5,
2675
+ "max_steps": 1323,
2676
+ "num_input_tokens_seen": 0,
2677
+ "num_train_epochs": 1,
2678
+ "save_steps": 1,
2679
+ "stateful_callbacks": {
2680
+ "TrainerControl": {
2681
+ "args": {
2682
+ "should_epoch_stop": false,
2683
+ "should_evaluate": false,
2684
+ "should_log": false,
2685
+ "should_save": true,
2686
+ "should_training_stop": true
2687
+ },
2688
+ "attributes": {}
2689
+ }
2690
+ },
2691
+ "total_flos": 7.85961760271106e+17,
2692
+ "train_batch_size": 2,
2693
+ "trial_name": null,
2694
+ "trial_params": null
2695
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e92b7ff4c9e2438b0e73ac23303f60236c8cd784fdcf00d6d4757cb29865cb14
3
+ size 6417