tony24254 committed on
Commit
5b60e22
·
verified ·
1 Parent(s): 8ea0712

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50) hide show
  1. Mu-Math/group_01/adapter/README.md +202 -0
  2. Mu-Math/group_01/adapter/adapter_config.json +34 -0
  3. Mu-Math/group_01/adapter/added_tokens.json +24 -0
  4. Mu-Math/group_01/adapter/chat_template.jinja +54 -0
  5. Mu-Math/group_01/adapter/merges.txt +0 -0
  6. Mu-Math/group_01/adapter/special_tokens_map.json +31 -0
  7. Mu-Math/group_01/adapter/tokenizer_config.json +207 -0
  8. Mu-Math/group_01/adapter/vocab.json +0 -0
  9. Mu-Math/group_01/checkpoints/checkpoint-1200/README.md +202 -0
  10. Mu-Math/group_01/checkpoints/checkpoint-1200/adapter_config.json +34 -0
  11. Mu-Math/group_01/checkpoints/checkpoint-1200/adapter_model.safetensors +3 -0
  12. Mu-Math/group_01/checkpoints/checkpoint-1200/added_tokens.json +24 -0
  13. Mu-Math/group_01/checkpoints/checkpoint-1200/chat_template.jinja +54 -0
  14. Mu-Math/group_01/checkpoints/checkpoint-1200/merges.txt +0 -0
  15. Mu-Math/group_01/checkpoints/checkpoint-1200/special_tokens_map.json +31 -0
  16. Mu-Math/group_01/checkpoints/checkpoint-1200/tokenizer_config.json +207 -0
  17. Mu-Math/group_01/checkpoints/checkpoint-1200/trainer_state.json +1721 -0
  18. Mu-Math/group_01/checkpoints/checkpoint-1200/vocab.json +0 -0
  19. Mu-Math/group_01/checkpoints/checkpoint-1500/trainer_state.json +2141 -0
  20. Mu-Math/group_01/checkpoints/checkpoint-1500/vocab.json +0 -0
  21. Mu-Math/group_01/checkpoints/checkpoint-1800/README.md +202 -0
  22. Mu-Math/group_01/checkpoints/checkpoint-1800/adapter_config.json +34 -0
  23. Mu-Math/group_01/checkpoints/checkpoint-1800/added_tokens.json +24 -0
  24. Mu-Math/group_01/checkpoints/checkpoint-1800/chat_template.jinja +54 -0
  25. Mu-Math/group_01/checkpoints/checkpoint-1800/merges.txt +0 -0
  26. Mu-Math/group_01/checkpoints/checkpoint-1800/special_tokens_map.json +31 -0
  27. Mu-Math/group_01/checkpoints/checkpoint-1800/tokenizer_config.json +207 -0
  28. Mu-Math/group_01/checkpoints/checkpoint-1800/trainer_state.json +2561 -0
  29. Mu-Math/group_01/checkpoints/checkpoint-1800/vocab.json +0 -0
  30. Mu-Math/group_01/checkpoints/checkpoint-300/README.md +202 -0
  31. Mu-Math/group_01/checkpoints/checkpoint-300/adapter_config.json +34 -0
  32. Mu-Math/group_01/checkpoints/checkpoint-300/added_tokens.json +24 -0
  33. Mu-Math/group_01/checkpoints/checkpoint-300/chat_template.jinja +54 -0
  34. Mu-Math/group_01/checkpoints/checkpoint-300/merges.txt +0 -0
  35. Mu-Math/group_01/checkpoints/checkpoint-300/special_tokens_map.json +31 -0
  36. Mu-Math/group_01/checkpoints/checkpoint-300/tokenizer_config.json +207 -0
  37. Mu-Math/group_01/checkpoints/checkpoint-300/trainer_state.json +461 -0
  38. Mu-Math/group_01/checkpoints/checkpoint-300/vocab.json +0 -0
  39. Mu-Math/group_01/checkpoints/checkpoint-600/README.md +202 -0
  40. Mu-Math/group_01/checkpoints/checkpoint-600/adapter_config.json +34 -0
  41. Mu-Math/group_01/checkpoints/checkpoint-600/added_tokens.json +24 -0
  42. Mu-Math/group_01/checkpoints/checkpoint-600/chat_template.jinja +54 -0
  43. Mu-Math/group_01/checkpoints/checkpoint-600/merges.txt +0 -0
  44. Mu-Math/group_01/checkpoints/checkpoint-600/special_tokens_map.json +31 -0
  45. Mu-Math/group_01/checkpoints/checkpoint-600/tokenizer_config.json +207 -0
  46. Mu-Math/group_01/checkpoints/checkpoint-600/trainer_state.json +881 -0
  47. Mu-Math/group_01/checkpoints/checkpoint-600/vocab.json +0 -0
  48. Mu-Math/group_01/metadata.json +2718 -0
  49. Mu-Math/group_01/prompt_group.json +613 -0
  50. Mu-Math/group_01/tokenizer/added_tokens.json +24 -0
Mu-Math/group_01/adapter/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/不冻结Qwen训练/models/Qwen2.5-1.5B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
Mu-Math/group_01/adapter/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/\u4e0d\u51bb\u7ed3Qwen\u8bad\u7ec3/models/Qwen2.5-1.5B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 128,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "down_proj",
24
+ "up_proj",
25
+ "gate_proj",
26
+ "q_proj",
27
+ "o_proj",
28
+ "v_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
Mu-Math/group_01/adapter/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
Mu-Math/group_01/adapter/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
Mu-Math/group_01/adapter/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
Mu-Math/group_01/adapter/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
Mu-Math/group_01/adapter/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
Mu-Math/group_01/adapter/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Mu-Math/group_01/checkpoints/checkpoint-1200/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/不冻结Qwen训练/models/Qwen2.5-1.5B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
Mu-Math/group_01/checkpoints/checkpoint-1200/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/\u4e0d\u51bb\u7ed3Qwen\u8bad\u7ec3/models/Qwen2.5-1.5B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 128,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "down_proj",
24
+ "up_proj",
25
+ "gate_proj",
26
+ "q_proj",
27
+ "o_proj",
28
+ "v_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
Mu-Math/group_01/checkpoints/checkpoint-1200/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95fc4eb548728c9534569935cd9c99cb7491d84a7d08ebbb7a0ac5682ac912b5
3
+ size 295488936
Mu-Math/group_01/checkpoints/checkpoint-1200/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
Mu-Math/group_01/checkpoints/checkpoint-1200/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
Mu-Math/group_01/checkpoints/checkpoint-1200/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
Mu-Math/group_01/checkpoints/checkpoint-1200/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
Mu-Math/group_01/checkpoints/checkpoint-1200/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
Mu-Math/group_01/checkpoints/checkpoint-1200/trainer_state.json ADDED
@@ -0,0 +1,1721 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 300.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.32,
14
+ "grad_norm": 11.867908477783203,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.9204,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 1.32,
21
+ "grad_norm": 7.492858409881592,
22
+ "learning_rate": 7.4074074074074075e-06,
23
+ "loss": 1.8831,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 2.64,
28
+ "grad_norm": 3.9426615238189697,
29
+ "learning_rate": 1.6666666666666667e-05,
30
+ "loss": 1.6453,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 3.96,
35
+ "grad_norm": 1.769984483718872,
36
+ "learning_rate": 2.5925925925925925e-05,
37
+ "loss": 1.2506,
38
+ "step": 15
39
+ },
40
+ {
41
+ "epoch": 5.0,
42
+ "grad_norm": 1.108256220817566,
43
+ "learning_rate": 3.518518518518519e-05,
44
+ "loss": 1.0012,
45
+ "step": 20
46
+ },
47
+ {
48
+ "epoch": 6.32,
49
+ "grad_norm": 0.5219796299934387,
50
+ "learning_rate": 4.4444444444444447e-05,
51
+ "loss": 0.8034,
52
+ "step": 25
53
+ },
54
+ {
55
+ "epoch": 7.64,
56
+ "grad_norm": 0.6449305415153503,
57
+ "learning_rate": 5.370370370370371e-05,
58
+ "loss": 0.6539,
59
+ "step": 30
60
+ },
61
+ {
62
+ "epoch": 8.96,
63
+ "grad_norm": 0.580233633518219,
64
+ "learning_rate": 6.296296296296296e-05,
65
+ "loss": 0.5474,
66
+ "step": 35
67
+ },
68
+ {
69
+ "epoch": 10.0,
70
+ "grad_norm": 1.5570186376571655,
71
+ "learning_rate": 7.222222222222222e-05,
72
+ "loss": 0.4811,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 11.32,
77
+ "grad_norm": 0.5841688513755798,
78
+ "learning_rate": 8.148148148148148e-05,
79
+ "loss": 0.3477,
80
+ "step": 45
81
+ },
82
+ {
83
+ "epoch": 12.64,
84
+ "grad_norm": 0.7968279719352722,
85
+ "learning_rate": 9.074074074074075e-05,
86
+ "loss": 0.2089,
87
+ "step": 50
88
+ },
89
+ {
90
+ "epoch": 13.96,
91
+ "grad_norm": 0.8396451473236084,
92
+ "learning_rate": 0.0001,
93
+ "loss": 0.1357,
94
+ "step": 55
95
+ },
96
+ {
97
+ "epoch": 15.0,
98
+ "grad_norm": 2.7755286693573,
99
+ "learning_rate": 9.971363115693013e-05,
100
+ "loss": 0.1235,
101
+ "step": 60
102
+ },
103
+ {
104
+ "epoch": 16.32,
105
+ "grad_norm": 0.6953228116035461,
106
+ "learning_rate": 9.942726231386026e-05,
107
+ "loss": 0.0755,
108
+ "step": 65
109
+ },
110
+ {
111
+ "epoch": 17.64,
112
+ "grad_norm": 1.1248857975006104,
113
+ "learning_rate": 9.914089347079038e-05,
114
+ "loss": 0.0546,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 18.96,
119
+ "grad_norm": 0.5247378945350647,
120
+ "learning_rate": 9.885452462772051e-05,
121
+ "loss": 0.0602,
122
+ "step": 75
123
+ },
124
+ {
125
+ "epoch": 20.0,
126
+ "grad_norm": 1.827890157699585,
127
+ "learning_rate": 9.856815578465064e-05,
128
+ "loss": 0.0501,
129
+ "step": 80
130
+ },
131
+ {
132
+ "epoch": 21.32,
133
+ "grad_norm": 0.23602119088172913,
134
+ "learning_rate": 9.828178694158075e-05,
135
+ "loss": 0.0443,
136
+ "step": 85
137
+ },
138
+ {
139
+ "epoch": 22.64,
140
+ "grad_norm": 0.2811133861541748,
141
+ "learning_rate": 9.799541809851088e-05,
142
+ "loss": 0.0448,
143
+ "step": 90
144
+ },
145
+ {
146
+ "epoch": 23.96,
147
+ "grad_norm": 0.29150959849357605,
148
+ "learning_rate": 9.7709049255441e-05,
149
+ "loss": 0.0426,
150
+ "step": 95
151
+ },
152
+ {
153
+ "epoch": 25.0,
154
+ "grad_norm": 1.4590457677841187,
155
+ "learning_rate": 9.742268041237114e-05,
156
+ "loss": 0.04,
157
+ "step": 100
158
+ },
159
+ {
160
+ "epoch": 26.32,
161
+ "grad_norm": 0.15335209667682648,
162
+ "learning_rate": 9.713631156930127e-05,
163
+ "loss": 0.0374,
164
+ "step": 105
165
+ },
166
+ {
167
+ "epoch": 27.64,
168
+ "grad_norm": 0.3241201639175415,
169
+ "learning_rate": 9.68499427262314e-05,
170
+ "loss": 0.0378,
171
+ "step": 110
172
+ },
173
+ {
174
+ "epoch": 28.96,
175
+ "grad_norm": 0.18619631230831146,
176
+ "learning_rate": 9.656357388316152e-05,
177
+ "loss": 0.0374,
178
+ "step": 115
179
+ },
180
+ {
181
+ "epoch": 30.0,
182
+ "grad_norm": 0.4512801170349121,
183
+ "learning_rate": 9.627720504009165e-05,
184
+ "loss": 0.0342,
185
+ "step": 120
186
+ },
187
+ {
188
+ "epoch": 31.32,
189
+ "grad_norm": 0.21706914901733398,
190
+ "learning_rate": 9.599083619702178e-05,
191
+ "loss": 0.0369,
192
+ "step": 125
193
+ },
194
+ {
195
+ "epoch": 32.64,
196
+ "grad_norm": 0.42762166261672974,
197
+ "learning_rate": 9.57044673539519e-05,
198
+ "loss": 0.0355,
199
+ "step": 130
200
+ },
201
+ {
202
+ "epoch": 33.96,
203
+ "grad_norm": 0.1793977916240692,
204
+ "learning_rate": 9.541809851088203e-05,
205
+ "loss": 0.0347,
206
+ "step": 135
207
+ },
208
+ {
209
+ "epoch": 35.0,
210
+ "grad_norm": 1.866305947303772,
211
+ "learning_rate": 9.513172966781214e-05,
212
+ "loss": 0.0368,
213
+ "step": 140
214
+ },
215
+ {
216
+ "epoch": 36.32,
217
+ "grad_norm": 0.09879657626152039,
218
+ "learning_rate": 9.484536082474227e-05,
219
+ "loss": 0.0347,
220
+ "step": 145
221
+ },
222
+ {
223
+ "epoch": 37.64,
224
+ "grad_norm": 0.09229481220245361,
225
+ "learning_rate": 9.45589919816724e-05,
226
+ "loss": 0.0338,
227
+ "step": 150
228
+ },
229
+ {
230
+ "epoch": 38.96,
231
+ "grad_norm": 0.11409584432840347,
232
+ "learning_rate": 9.427262313860252e-05,
233
+ "loss": 0.0339,
234
+ "step": 155
235
+ },
236
+ {
237
+ "epoch": 40.0,
238
+ "grad_norm": 0.35678204894065857,
239
+ "learning_rate": 9.398625429553265e-05,
240
+ "loss": 0.0347,
241
+ "step": 160
242
+ },
243
+ {
244
+ "epoch": 41.32,
245
+ "grad_norm": 0.08212767541408539,
246
+ "learning_rate": 9.369988545246277e-05,
247
+ "loss": 0.0345,
248
+ "step": 165
249
+ },
250
+ {
251
+ "epoch": 42.64,
252
+ "grad_norm": 0.07271627336740494,
253
+ "learning_rate": 9.34135166093929e-05,
254
+ "loss": 0.0305,
255
+ "step": 170
256
+ },
257
+ {
258
+ "epoch": 43.96,
259
+ "grad_norm": 0.23211534321308136,
260
+ "learning_rate": 9.312714776632303e-05,
261
+ "loss": 0.0321,
262
+ "step": 175
263
+ },
264
+ {
265
+ "epoch": 45.0,
266
+ "grad_norm": 0.23425568640232086,
267
+ "learning_rate": 9.284077892325315e-05,
268
+ "loss": 0.0334,
269
+ "step": 180
270
+ },
271
+ {
272
+ "epoch": 46.32,
273
+ "grad_norm": 0.07825004309415817,
274
+ "learning_rate": 9.255441008018328e-05,
275
+ "loss": 0.0349,
276
+ "step": 185
277
+ },
278
+ {
279
+ "epoch": 47.64,
280
+ "grad_norm": 0.06621824949979782,
281
+ "learning_rate": 9.22680412371134e-05,
282
+ "loss": 0.0302,
283
+ "step": 190
284
+ },
285
+ {
286
+ "epoch": 48.96,
287
+ "grad_norm": 0.0967830941081047,
288
+ "learning_rate": 9.198167239404353e-05,
289
+ "loss": 0.0316,
290
+ "step": 195
291
+ },
292
+ {
293
+ "epoch": 50.0,
294
+ "grad_norm": 0.39718347787857056,
295
+ "learning_rate": 9.169530355097366e-05,
296
+ "loss": 0.0307,
297
+ "step": 200
298
+ },
299
+ {
300
+ "epoch": 51.32,
301
+ "grad_norm": 0.06881817430257797,
302
+ "learning_rate": 9.140893470790379e-05,
303
+ "loss": 0.0291,
304
+ "step": 205
305
+ },
306
+ {
307
+ "epoch": 52.64,
308
+ "grad_norm": 0.07241260260343552,
309
+ "learning_rate": 9.112256586483391e-05,
310
+ "loss": 0.032,
311
+ "step": 210
312
+ },
313
+ {
314
+ "epoch": 53.96,
315
+ "grad_norm": 0.08191649615764618,
316
+ "learning_rate": 9.083619702176404e-05,
317
+ "loss": 0.0293,
318
+ "step": 215
319
+ },
320
+ {
321
+ "epoch": 55.0,
322
+ "grad_norm": 0.20381148159503937,
323
+ "learning_rate": 9.054982817869416e-05,
324
+ "loss": 0.033,
325
+ "step": 220
326
+ },
327
+ {
328
+ "epoch": 56.32,
329
+ "grad_norm": 0.0765785425901413,
330
+ "learning_rate": 9.026345933562429e-05,
331
+ "loss": 0.0323,
332
+ "step": 225
333
+ },
334
+ {
335
+ "epoch": 57.64,
336
+ "grad_norm": 0.0698801577091217,
337
+ "learning_rate": 8.997709049255442e-05,
338
+ "loss": 0.0324,
339
+ "step": 230
340
+ },
341
+ {
342
+ "epoch": 58.96,
343
+ "grad_norm": 0.08089473098516464,
344
+ "learning_rate": 8.969072164948454e-05,
345
+ "loss": 0.0314,
346
+ "step": 235
347
+ },
348
+ {
349
+ "epoch": 60.0,
350
+ "grad_norm": 0.22270062565803528,
351
+ "learning_rate": 8.940435280641467e-05,
352
+ "loss": 0.0303,
353
+ "step": 240
354
+ },
355
+ {
356
+ "epoch": 61.32,
357
+ "grad_norm": 0.07712433487176895,
358
+ "learning_rate": 8.91179839633448e-05,
359
+ "loss": 0.0324,
360
+ "step": 245
361
+ },
362
+ {
363
+ "epoch": 62.64,
364
+ "grad_norm": 0.05860769376158714,
365
+ "learning_rate": 8.883161512027491e-05,
366
+ "loss": 0.0321,
367
+ "step": 250
368
+ },
369
+ {
370
+ "epoch": 63.96,
371
+ "grad_norm": 0.05999445170164108,
372
+ "learning_rate": 8.854524627720504e-05,
373
+ "loss": 0.0315,
374
+ "step": 255
375
+ },
376
+ {
377
+ "epoch": 65.0,
378
+ "grad_norm": 0.20564565062522888,
379
+ "learning_rate": 8.825887743413516e-05,
380
+ "loss": 0.0364,
381
+ "step": 260
382
+ },
383
+ {
384
+ "epoch": 66.32,
385
+ "grad_norm": 0.0610821433365345,
386
+ "learning_rate": 8.797250859106529e-05,
387
+ "loss": 0.0311,
388
+ "step": 265
389
+ },
390
+ {
391
+ "epoch": 67.64,
392
+ "grad_norm": 0.05693706497550011,
393
+ "learning_rate": 8.768613974799542e-05,
394
+ "loss": 0.0294,
395
+ "step": 270
396
+ },
397
+ {
398
+ "epoch": 68.96,
399
+ "grad_norm": 0.06817185133695602,
400
+ "learning_rate": 8.739977090492554e-05,
401
+ "loss": 0.0284,
402
+ "step": 275
403
+ },
404
+ {
405
+ "epoch": 70.0,
406
+ "grad_norm": 0.17458151280879974,
407
+ "learning_rate": 8.711340206185567e-05,
408
+ "loss": 0.0291,
409
+ "step": 280
410
+ },
411
+ {
412
+ "epoch": 71.32,
413
+ "grad_norm": 0.07353579252958298,
414
+ "learning_rate": 8.682703321878581e-05,
415
+ "loss": 0.0324,
416
+ "step": 285
417
+ },
418
+ {
419
+ "epoch": 72.64,
420
+ "grad_norm": 0.061573781073093414,
421
+ "learning_rate": 8.654066437571594e-05,
422
+ "loss": 0.0305,
423
+ "step": 290
424
+ },
425
+ {
426
+ "epoch": 73.96,
427
+ "grad_norm": 0.07544506341218948,
428
+ "learning_rate": 8.625429553264606e-05,
429
+ "loss": 0.0294,
430
+ "step": 295
431
+ },
432
+ {
433
+ "epoch": 75.0,
434
+ "grad_norm": 0.11100324243307114,
435
+ "learning_rate": 8.596792668957619e-05,
436
+ "loss": 0.03,
437
+ "step": 300
438
+ },
439
+ {
440
+ "epoch": 76.32,
441
+ "grad_norm": 0.0491141714155674,
442
+ "learning_rate": 8.56815578465063e-05,
443
+ "loss": 0.0296,
444
+ "step": 305
445
+ },
446
+ {
447
+ "epoch": 77.64,
448
+ "grad_norm": 0.07668624073266983,
449
+ "learning_rate": 8.539518900343643e-05,
450
+ "loss": 0.0345,
451
+ "step": 310
452
+ },
453
+ {
454
+ "epoch": 78.96,
455
+ "grad_norm": 0.07898231595754623,
456
+ "learning_rate": 8.510882016036655e-05,
457
+ "loss": 0.0306,
458
+ "step": 315
459
+ },
460
+ {
461
+ "epoch": 80.0,
462
+ "grad_norm": 0.3391458988189697,
463
+ "learning_rate": 8.482245131729668e-05,
464
+ "loss": 0.0334,
465
+ "step": 320
466
+ },
467
+ {
468
+ "epoch": 81.32,
469
+ "grad_norm": 0.05544694885611534,
470
+ "learning_rate": 8.453608247422681e-05,
471
+ "loss": 0.0282,
472
+ "step": 325
473
+ },
474
+ {
475
+ "epoch": 82.64,
476
+ "grad_norm": 0.05032579228281975,
477
+ "learning_rate": 8.424971363115693e-05,
478
+ "loss": 0.0323,
479
+ "step": 330
480
+ },
481
+ {
482
+ "epoch": 83.96,
483
+ "grad_norm": 0.05664476007223129,
484
+ "learning_rate": 8.396334478808706e-05,
485
+ "loss": 0.0295,
486
+ "step": 335
487
+ },
488
+ {
489
+ "epoch": 85.0,
490
+ "grad_norm": 0.24190960824489594,
491
+ "learning_rate": 8.367697594501719e-05,
492
+ "loss": 0.033,
493
+ "step": 340
494
+ },
495
+ {
496
+ "epoch": 86.32,
497
+ "grad_norm": 0.05068003758788109,
498
+ "learning_rate": 8.339060710194731e-05,
499
+ "loss": 0.0294,
500
+ "step": 345
501
+ },
502
+ {
503
+ "epoch": 87.64,
504
+ "grad_norm": 0.06719321757555008,
505
+ "learning_rate": 8.310423825887744e-05,
506
+ "loss": 0.0297,
507
+ "step": 350
508
+ },
509
+ {
510
+ "epoch": 88.96,
511
+ "grad_norm": 0.05750493332743645,
512
+ "learning_rate": 8.281786941580757e-05,
513
+ "loss": 0.0276,
514
+ "step": 355
515
+ },
516
+ {
517
+ "epoch": 90.0,
518
+ "grad_norm": 0.17318210005760193,
519
+ "learning_rate": 8.253150057273768e-05,
520
+ "loss": 0.0309,
521
+ "step": 360
522
+ },
523
+ {
524
+ "epoch": 91.32,
525
+ "grad_norm": 0.05168261379003525,
526
+ "learning_rate": 8.224513172966782e-05,
527
+ "loss": 0.0284,
528
+ "step": 365
529
+ },
530
+ {
531
+ "epoch": 92.64,
532
+ "grad_norm": 0.053040292114019394,
533
+ "learning_rate": 8.195876288659795e-05,
534
+ "loss": 0.0314,
535
+ "step": 370
536
+ },
537
+ {
538
+ "epoch": 93.96,
539
+ "grad_norm": 0.06162334978580475,
540
+ "learning_rate": 8.167239404352807e-05,
541
+ "loss": 0.0297,
542
+ "step": 375
543
+ },
544
+ {
545
+ "epoch": 95.0,
546
+ "grad_norm": 0.13474801182746887,
547
+ "learning_rate": 8.13860252004582e-05,
548
+ "loss": 0.0271,
549
+ "step": 380
550
+ },
551
+ {
552
+ "epoch": 96.32,
553
+ "grad_norm": 0.05177682265639305,
554
+ "learning_rate": 8.109965635738833e-05,
555
+ "loss": 0.0301,
556
+ "step": 385
557
+ },
558
+ {
559
+ "epoch": 97.64,
560
+ "grad_norm": 0.04276576265692711,
561
+ "learning_rate": 8.081328751431845e-05,
562
+ "loss": 0.0286,
563
+ "step": 390
564
+ },
565
+ {
566
+ "epoch": 98.96,
567
+ "grad_norm": 0.04698758199810982,
568
+ "learning_rate": 8.052691867124858e-05,
569
+ "loss": 0.0284,
570
+ "step": 395
571
+ },
572
+ {
573
+ "epoch": 100.0,
574
+ "grad_norm": 0.14094208180904388,
575
+ "learning_rate": 8.02405498281787e-05,
576
+ "loss": 0.0302,
577
+ "step": 400
578
+ },
579
+ {
580
+ "epoch": 101.32,
581
+ "grad_norm": 0.0528222993016243,
582
+ "learning_rate": 7.995418098510883e-05,
583
+ "loss": 0.0304,
584
+ "step": 405
585
+ },
586
+ {
587
+ "epoch": 102.64,
588
+ "grad_norm": 0.053034182637929916,
589
+ "learning_rate": 7.966781214203894e-05,
590
+ "loss": 0.0316,
591
+ "step": 410
592
+ },
593
+ {
594
+ "epoch": 103.96,
595
+ "grad_norm": 0.05732697248458862,
596
+ "learning_rate": 7.938144329896907e-05,
597
+ "loss": 0.0295,
598
+ "step": 415
599
+ },
600
+ {
601
+ "epoch": 105.0,
602
+ "grad_norm": 0.17511749267578125,
603
+ "learning_rate": 7.90950744558992e-05,
604
+ "loss": 0.0317,
605
+ "step": 420
606
+ },
607
+ {
608
+ "epoch": 106.32,
609
+ "grad_norm": 0.04588017240166664,
610
+ "learning_rate": 7.880870561282932e-05,
611
+ "loss": 0.0305,
612
+ "step": 425
613
+ },
614
+ {
615
+ "epoch": 107.64,
616
+ "grad_norm": 0.049282800406217575,
617
+ "learning_rate": 7.852233676975945e-05,
618
+ "loss": 0.031,
619
+ "step": 430
620
+ },
621
+ {
622
+ "epoch": 108.96,
623
+ "grad_norm": 0.04937691241502762,
624
+ "learning_rate": 7.823596792668958e-05,
625
+ "loss": 0.0278,
626
+ "step": 435
627
+ },
628
+ {
629
+ "epoch": 110.0,
630
+ "grad_norm": 0.11863432824611664,
631
+ "learning_rate": 7.79495990836197e-05,
632
+ "loss": 0.0316,
633
+ "step": 440
634
+ },
635
+ {
636
+ "epoch": 111.32,
637
+ "grad_norm": 0.04387475177645683,
638
+ "learning_rate": 7.766323024054983e-05,
639
+ "loss": 0.0283,
640
+ "step": 445
641
+ },
642
+ {
643
+ "epoch": 112.64,
644
+ "grad_norm": 0.04409867897629738,
645
+ "learning_rate": 7.737686139747996e-05,
646
+ "loss": 0.0306,
647
+ "step": 450
648
+ },
649
+ {
650
+ "epoch": 113.96,
651
+ "grad_norm": 0.04834749549627304,
652
+ "learning_rate": 7.709049255441008e-05,
653
+ "loss": 0.0302,
654
+ "step": 455
655
+ },
656
+ {
657
+ "epoch": 115.0,
658
+ "grad_norm": 0.1553424447774887,
659
+ "learning_rate": 7.680412371134021e-05,
660
+ "loss": 0.0326,
661
+ "step": 460
662
+ },
663
+ {
664
+ "epoch": 116.32,
665
+ "grad_norm": 0.05963806435465813,
666
+ "learning_rate": 7.651775486827034e-05,
667
+ "loss": 0.0291,
668
+ "step": 465
669
+ },
670
+ {
671
+ "epoch": 117.64,
672
+ "grad_norm": 0.04697559028863907,
673
+ "learning_rate": 7.623138602520046e-05,
674
+ "loss": 0.027,
675
+ "step": 470
676
+ },
677
+ {
678
+ "epoch": 118.96,
679
+ "grad_norm": 0.04225379601120949,
680
+ "learning_rate": 7.594501718213059e-05,
681
+ "loss": 0.0343,
682
+ "step": 475
683
+ },
684
+ {
685
+ "epoch": 120.0,
686
+ "grad_norm": 0.1076933965086937,
687
+ "learning_rate": 7.565864833906071e-05,
688
+ "loss": 0.0288,
689
+ "step": 480
690
+ },
691
+ {
692
+ "epoch": 121.32,
693
+ "grad_norm": 0.04540383443236351,
694
+ "learning_rate": 7.537227949599084e-05,
695
+ "loss": 0.0291,
696
+ "step": 485
697
+ },
698
+ {
699
+ "epoch": 122.64,
700
+ "grad_norm": 0.05459335818886757,
701
+ "learning_rate": 7.508591065292097e-05,
702
+ "loss": 0.0289,
703
+ "step": 490
704
+ },
705
+ {
706
+ "epoch": 123.96,
707
+ "grad_norm": 0.05171333625912666,
708
+ "learning_rate": 7.47995418098511e-05,
709
+ "loss": 0.0284,
710
+ "step": 495
711
+ },
712
+ {
713
+ "epoch": 125.0,
714
+ "grad_norm": 0.08606769144535065,
715
+ "learning_rate": 7.451317296678122e-05,
716
+ "loss": 0.0314,
717
+ "step": 500
718
+ },
719
+ {
720
+ "epoch": 126.32,
721
+ "grad_norm": 0.040535662323236465,
722
+ "learning_rate": 7.422680412371135e-05,
723
+ "loss": 0.028,
724
+ "step": 505
725
+ },
726
+ {
727
+ "epoch": 127.64,
728
+ "grad_norm": 0.04621696099638939,
729
+ "learning_rate": 7.394043528064147e-05,
730
+ "loss": 0.0281,
731
+ "step": 510
732
+ },
733
+ {
734
+ "epoch": 128.96,
735
+ "grad_norm": 0.04407593980431557,
736
+ "learning_rate": 7.36540664375716e-05,
737
+ "loss": 0.0309,
738
+ "step": 515
739
+ },
740
+ {
741
+ "epoch": 130.0,
742
+ "grad_norm": 0.24090737104415894,
743
+ "learning_rate": 7.336769759450171e-05,
744
+ "loss": 0.0302,
745
+ "step": 520
746
+ },
747
+ {
748
+ "epoch": 131.32,
749
+ "grad_norm": 0.051712971180677414,
750
+ "learning_rate": 7.308132875143184e-05,
751
+ "loss": 0.0305,
752
+ "step": 525
753
+ },
754
+ {
755
+ "epoch": 132.64,
756
+ "grad_norm": 0.0373610258102417,
757
+ "learning_rate": 7.279495990836197e-05,
758
+ "loss": 0.0283,
759
+ "step": 530
760
+ },
761
+ {
762
+ "epoch": 133.96,
763
+ "grad_norm": 0.04424213245511055,
764
+ "learning_rate": 7.250859106529209e-05,
765
+ "loss": 0.0317,
766
+ "step": 535
767
+ },
768
+ {
769
+ "epoch": 135.0,
770
+ "grad_norm": 0.09113436192274094,
771
+ "learning_rate": 7.222222222222222e-05,
772
+ "loss": 0.0302,
773
+ "step": 540
774
+ },
775
+ {
776
+ "epoch": 136.32,
777
+ "grad_norm": 0.03745009005069733,
778
+ "learning_rate": 7.193585337915235e-05,
779
+ "loss": 0.0315,
780
+ "step": 545
781
+ },
782
+ {
783
+ "epoch": 137.64,
784
+ "grad_norm": 0.04058730602264404,
785
+ "learning_rate": 7.164948453608247e-05,
786
+ "loss": 0.0312,
787
+ "step": 550
788
+ },
789
+ {
790
+ "epoch": 138.96,
791
+ "grad_norm": 0.046279069036245346,
792
+ "learning_rate": 7.136311569301261e-05,
793
+ "loss": 0.0295,
794
+ "step": 555
795
+ },
796
+ {
797
+ "epoch": 140.0,
798
+ "grad_norm": 0.17239141464233398,
799
+ "learning_rate": 7.107674684994274e-05,
800
+ "loss": 0.0307,
801
+ "step": 560
802
+ },
803
+ {
804
+ "epoch": 141.32,
805
+ "grad_norm": 0.036460030823946,
806
+ "learning_rate": 7.079037800687286e-05,
807
+ "loss": 0.0284,
808
+ "step": 565
809
+ },
810
+ {
811
+ "epoch": 142.64,
812
+ "grad_norm": 0.03434258699417114,
813
+ "learning_rate": 7.050400916380299e-05,
814
+ "loss": 0.0283,
815
+ "step": 570
816
+ },
817
+ {
818
+ "epoch": 143.96,
819
+ "grad_norm": 0.0470467284321785,
820
+ "learning_rate": 7.02176403207331e-05,
821
+ "loss": 0.0296,
822
+ "step": 575
823
+ },
824
+ {
825
+ "epoch": 145.0,
826
+ "grad_norm": 0.07163394242525101,
827
+ "learning_rate": 6.993127147766323e-05,
828
+ "loss": 0.0256,
829
+ "step": 580
830
+ },
831
+ {
832
+ "epoch": 146.32,
833
+ "grad_norm": 0.042208388447761536,
834
+ "learning_rate": 6.964490263459336e-05,
835
+ "loss": 0.0298,
836
+ "step": 585
837
+ },
838
+ {
839
+ "epoch": 147.64,
840
+ "grad_norm": 0.04421050846576691,
841
+ "learning_rate": 6.935853379152348e-05,
842
+ "loss": 0.0274,
843
+ "step": 590
844
+ },
845
+ {
846
+ "epoch": 148.96,
847
+ "grad_norm": 0.047223106026649475,
848
+ "learning_rate": 6.907216494845361e-05,
849
+ "loss": 0.0311,
850
+ "step": 595
851
+ },
852
+ {
853
+ "epoch": 150.0,
854
+ "grad_norm": 0.1724609136581421,
855
+ "learning_rate": 6.878579610538374e-05,
856
+ "loss": 0.0302,
857
+ "step": 600
858
+ },
859
+ {
860
+ "epoch": 151.32,
861
+ "grad_norm": 0.042247697710990906,
862
+ "learning_rate": 6.849942726231386e-05,
863
+ "loss": 0.0287,
864
+ "step": 605
865
+ },
866
+ {
867
+ "epoch": 152.64,
868
+ "grad_norm": 0.05167734622955322,
869
+ "learning_rate": 6.821305841924399e-05,
870
+ "loss": 0.0279,
871
+ "step": 610
872
+ },
873
+ {
874
+ "epoch": 153.96,
875
+ "grad_norm": 0.03621920198202133,
876
+ "learning_rate": 6.792668957617412e-05,
877
+ "loss": 0.0291,
878
+ "step": 615
879
+ },
880
+ {
881
+ "epoch": 155.0,
882
+ "grad_norm": 0.22533060610294342,
883
+ "learning_rate": 6.764032073310424e-05,
884
+ "loss": 0.0354,
885
+ "step": 620
886
+ },
887
+ {
888
+ "epoch": 156.32,
889
+ "grad_norm": 0.03441638499498367,
890
+ "learning_rate": 6.735395189003437e-05,
891
+ "loss": 0.0285,
892
+ "step": 625
893
+ },
894
+ {
895
+ "epoch": 157.64,
896
+ "grad_norm": 0.03728373721241951,
897
+ "learning_rate": 6.706758304696448e-05,
898
+ "loss": 0.0299,
899
+ "step": 630
900
+ },
901
+ {
902
+ "epoch": 158.96,
903
+ "grad_norm": 0.043604422360658646,
904
+ "learning_rate": 6.678121420389462e-05,
905
+ "loss": 0.0292,
906
+ "step": 635
907
+ },
908
+ {
909
+ "epoch": 160.0,
910
+ "grad_norm": 0.17142102122306824,
911
+ "learning_rate": 6.649484536082475e-05,
912
+ "loss": 0.0331,
913
+ "step": 640
914
+ },
915
+ {
916
+ "epoch": 161.32,
917
+ "grad_norm": 0.03554172441363335,
918
+ "learning_rate": 6.620847651775487e-05,
919
+ "loss": 0.0305,
920
+ "step": 645
921
+ },
922
+ {
923
+ "epoch": 162.64,
924
+ "grad_norm": 0.043817318975925446,
925
+ "learning_rate": 6.5922107674685e-05,
926
+ "loss": 0.0305,
927
+ "step": 650
928
+ },
929
+ {
930
+ "epoch": 163.96,
931
+ "grad_norm": 0.04247381538152695,
932
+ "learning_rate": 6.563573883161513e-05,
933
+ "loss": 0.0302,
934
+ "step": 655
935
+ },
936
+ {
937
+ "epoch": 165.0,
938
+ "grad_norm": 0.09436971694231033,
939
+ "learning_rate": 6.534936998854525e-05,
940
+ "loss": 0.0292,
941
+ "step": 660
942
+ },
943
+ {
944
+ "epoch": 166.32,
945
+ "grad_norm": 0.04177865758538246,
946
+ "learning_rate": 6.506300114547538e-05,
947
+ "loss": 0.0281,
948
+ "step": 665
949
+ },
950
+ {
951
+ "epoch": 167.64,
952
+ "grad_norm": 0.04079804942011833,
953
+ "learning_rate": 6.477663230240551e-05,
954
+ "loss": 0.0288,
955
+ "step": 670
956
+ },
957
+ {
958
+ "epoch": 168.96,
959
+ "grad_norm": 0.039114974439144135,
960
+ "learning_rate": 6.449026345933563e-05,
961
+ "loss": 0.0275,
962
+ "step": 675
963
+ },
964
+ {
965
+ "epoch": 170.0,
966
+ "grad_norm": 0.11661799997091293,
967
+ "learning_rate": 6.420389461626576e-05,
968
+ "loss": 0.0288,
969
+ "step": 680
970
+ },
971
+ {
972
+ "epoch": 171.32,
973
+ "grad_norm": 0.03423461318016052,
974
+ "learning_rate": 6.391752577319587e-05,
975
+ "loss": 0.0306,
976
+ "step": 685
977
+ },
978
+ {
979
+ "epoch": 172.64,
980
+ "grad_norm": 0.03615871071815491,
981
+ "learning_rate": 6.3631156930126e-05,
982
+ "loss": 0.031,
983
+ "step": 690
984
+ },
985
+ {
986
+ "epoch": 173.96,
987
+ "grad_norm": 0.04067518189549446,
988
+ "learning_rate": 6.334478808705613e-05,
989
+ "loss": 0.0312,
990
+ "step": 695
991
+ },
992
+ {
993
+ "epoch": 175.0,
994
+ "grad_norm": 0.11094173789024353,
995
+ "learning_rate": 6.305841924398625e-05,
996
+ "loss": 0.0287,
997
+ "step": 700
998
+ },
999
+ {
1000
+ "epoch": 176.32,
1001
+ "grad_norm": 0.03173477575182915,
1002
+ "learning_rate": 6.277205040091638e-05,
1003
+ "loss": 0.0296,
1004
+ "step": 705
1005
+ },
1006
+ {
1007
+ "epoch": 177.64,
1008
+ "grad_norm": 0.0374116450548172,
1009
+ "learning_rate": 6.24856815578465e-05,
1010
+ "loss": 0.0292,
1011
+ "step": 710
1012
+ },
1013
+ {
1014
+ "epoch": 178.96,
1015
+ "grad_norm": 0.03814936801791191,
1016
+ "learning_rate": 6.219931271477663e-05,
1017
+ "loss": 0.0273,
1018
+ "step": 715
1019
+ },
1020
+ {
1021
+ "epoch": 180.0,
1022
+ "grad_norm": 0.14255362749099731,
1023
+ "learning_rate": 6.191294387170676e-05,
1024
+ "loss": 0.0335,
1025
+ "step": 720
1026
+ },
1027
+ {
1028
+ "epoch": 181.32,
1029
+ "grad_norm": 0.04104507714509964,
1030
+ "learning_rate": 6.162657502863689e-05,
1031
+ "loss": 0.0303,
1032
+ "step": 725
1033
+ },
1034
+ {
1035
+ "epoch": 182.64,
1036
+ "grad_norm": 0.037353888154029846,
1037
+ "learning_rate": 6.134020618556701e-05,
1038
+ "loss": 0.0271,
1039
+ "step": 730
1040
+ },
1041
+ {
1042
+ "epoch": 183.96,
1043
+ "grad_norm": 0.03552788123488426,
1044
+ "learning_rate": 6.105383734249714e-05,
1045
+ "loss": 0.0288,
1046
+ "step": 735
1047
+ },
1048
+ {
1049
+ "epoch": 185.0,
1050
+ "grad_norm": 0.09345243126153946,
1051
+ "learning_rate": 6.076746849942726e-05,
1052
+ "loss": 0.0271,
1053
+ "step": 740
1054
+ },
1055
+ {
1056
+ "epoch": 186.32,
1057
+ "grad_norm": 0.031304650008678436,
1058
+ "learning_rate": 6.0481099656357384e-05,
1059
+ "loss": 0.0287,
1060
+ "step": 745
1061
+ },
1062
+ {
1063
+ "epoch": 187.64,
1064
+ "grad_norm": 0.03588686138391495,
1065
+ "learning_rate": 6.019473081328752e-05,
1066
+ "loss": 0.0292,
1067
+ "step": 750
1068
+ },
1069
+ {
1070
+ "epoch": 188.96,
1071
+ "grad_norm": 0.03166257590055466,
1072
+ "learning_rate": 5.9908361970217644e-05,
1073
+ "loss": 0.0277,
1074
+ "step": 755
1075
+ },
1076
+ {
1077
+ "epoch": 190.0,
1078
+ "grad_norm": 0.09115266799926758,
1079
+ "learning_rate": 5.962199312714777e-05,
1080
+ "loss": 0.0286,
1081
+ "step": 760
1082
+ },
1083
+ {
1084
+ "epoch": 191.32,
1085
+ "grad_norm": 0.028432967141270638,
1086
+ "learning_rate": 5.93356242840779e-05,
1087
+ "loss": 0.0277,
1088
+ "step": 765
1089
+ },
1090
+ {
1091
+ "epoch": 192.64,
1092
+ "grad_norm": 0.04126034304499626,
1093
+ "learning_rate": 5.904925544100802e-05,
1094
+ "loss": 0.0315,
1095
+ "step": 770
1096
+ },
1097
+ {
1098
+ "epoch": 193.96,
1099
+ "grad_norm": 0.04166596010327339,
1100
+ "learning_rate": 5.876288659793815e-05,
1101
+ "loss": 0.0281,
1102
+ "step": 775
1103
+ },
1104
+ {
1105
+ "epoch": 195.0,
1106
+ "grad_norm": 0.11017812788486481,
1107
+ "learning_rate": 5.8476517754868276e-05,
1108
+ "loss": 0.0285,
1109
+ "step": 780
1110
+ },
1111
+ {
1112
+ "epoch": 196.32,
1113
+ "grad_norm": 0.04071119427680969,
1114
+ "learning_rate": 5.81901489117984e-05,
1115
+ "loss": 0.0289,
1116
+ "step": 785
1117
+ },
1118
+ {
1119
+ "epoch": 197.64,
1120
+ "grad_norm": 0.03756481036543846,
1121
+ "learning_rate": 5.790378006872853e-05,
1122
+ "loss": 0.0276,
1123
+ "step": 790
1124
+ },
1125
+ {
1126
+ "epoch": 198.96,
1127
+ "grad_norm": 0.039780210703611374,
1128
+ "learning_rate": 5.761741122565865e-05,
1129
+ "loss": 0.0282,
1130
+ "step": 795
1131
+ },
1132
+ {
1133
+ "epoch": 200.0,
1134
+ "grad_norm": 0.12418342381715775,
1135
+ "learning_rate": 5.7331042382588775e-05,
1136
+ "loss": 0.0296,
1137
+ "step": 800
1138
+ },
1139
+ {
1140
+ "epoch": 201.32,
1141
+ "grad_norm": 0.0338447242975235,
1142
+ "learning_rate": 5.70446735395189e-05,
1143
+ "loss": 0.0286,
1144
+ "step": 805
1145
+ },
1146
+ {
1147
+ "epoch": 202.64,
1148
+ "grad_norm": 0.03490043804049492,
1149
+ "learning_rate": 5.675830469644903e-05,
1150
+ "loss": 0.0306,
1151
+ "step": 810
1152
+ },
1153
+ {
1154
+ "epoch": 203.96,
1155
+ "grad_norm": 0.03847096487879753,
1156
+ "learning_rate": 5.6471935853379155e-05,
1157
+ "loss": 0.0283,
1158
+ "step": 815
1159
+ },
1160
+ {
1161
+ "epoch": 205.0,
1162
+ "grad_norm": 0.10988269001245499,
1163
+ "learning_rate": 5.618556701030928e-05,
1164
+ "loss": 0.0278,
1165
+ "step": 820
1166
+ },
1167
+ {
1168
+ "epoch": 206.32,
1169
+ "grad_norm": 0.034018851816654205,
1170
+ "learning_rate": 5.589919816723941e-05,
1171
+ "loss": 0.03,
1172
+ "step": 825
1173
+ },
1174
+ {
1175
+ "epoch": 207.64,
1176
+ "grad_norm": 0.032927289605140686,
1177
+ "learning_rate": 5.5612829324169534e-05,
1178
+ "loss": 0.0293,
1179
+ "step": 830
1180
+ },
1181
+ {
1182
+ "epoch": 208.96,
1183
+ "grad_norm": 0.03604916110634804,
1184
+ "learning_rate": 5.532646048109966e-05,
1185
+ "loss": 0.0285,
1186
+ "step": 835
1187
+ },
1188
+ {
1189
+ "epoch": 210.0,
1190
+ "grad_norm": 0.09708557277917862,
1191
+ "learning_rate": 5.504009163802979e-05,
1192
+ "loss": 0.0305,
1193
+ "step": 840
1194
+ },
1195
+ {
1196
+ "epoch": 211.32,
1197
+ "grad_norm": 0.03745417296886444,
1198
+ "learning_rate": 5.4753722794959914e-05,
1199
+ "loss": 0.0296,
1200
+ "step": 845
1201
+ },
1202
+ {
1203
+ "epoch": 212.64,
1204
+ "grad_norm": 0.028906095772981644,
1205
+ "learning_rate": 5.4467353951890033e-05,
1206
+ "loss": 0.0277,
1207
+ "step": 850
1208
+ },
1209
+ {
1210
+ "epoch": 213.96,
1211
+ "grad_norm": 0.03228568285703659,
1212
+ "learning_rate": 5.418098510882016e-05,
1213
+ "loss": 0.0295,
1214
+ "step": 855
1215
+ },
1216
+ {
1217
+ "epoch": 215.0,
1218
+ "grad_norm": 0.1302802711725235,
1219
+ "learning_rate": 5.3894616265750286e-05,
1220
+ "loss": 0.031,
1221
+ "step": 860
1222
+ },
1223
+ {
1224
+ "epoch": 216.32,
1225
+ "grad_norm": 0.031472526490688324,
1226
+ "learning_rate": 5.360824742268041e-05,
1227
+ "loss": 0.0286,
1228
+ "step": 865
1229
+ },
1230
+ {
1231
+ "epoch": 217.64,
1232
+ "grad_norm": 0.03589686006307602,
1233
+ "learning_rate": 5.332187857961054e-05,
1234
+ "loss": 0.0308,
1235
+ "step": 870
1236
+ },
1237
+ {
1238
+ "epoch": 218.96,
1239
+ "grad_norm": 0.04117952659726143,
1240
+ "learning_rate": 5.3035509736540666e-05,
1241
+ "loss": 0.0298,
1242
+ "step": 875
1243
+ },
1244
+ {
1245
+ "epoch": 220.0,
1246
+ "grad_norm": 0.16901935636997223,
1247
+ "learning_rate": 5.274914089347079e-05,
1248
+ "loss": 0.0316,
1249
+ "step": 880
1250
+ },
1251
+ {
1252
+ "epoch": 221.32,
1253
+ "grad_norm": 0.03608705848455429,
1254
+ "learning_rate": 5.246277205040092e-05,
1255
+ "loss": 0.0297,
1256
+ "step": 885
1257
+ },
1258
+ {
1259
+ "epoch": 222.64,
1260
+ "grad_norm": 0.028423065319657326,
1261
+ "learning_rate": 5.2176403207331045e-05,
1262
+ "loss": 0.029,
1263
+ "step": 890
1264
+ },
1265
+ {
1266
+ "epoch": 223.96,
1267
+ "grad_norm": 0.03328604996204376,
1268
+ "learning_rate": 5.189003436426118e-05,
1269
+ "loss": 0.0304,
1270
+ "step": 895
1271
+ },
1272
+ {
1273
+ "epoch": 225.0,
1274
+ "grad_norm": 0.1140102967619896,
1275
+ "learning_rate": 5.1603665521191305e-05,
1276
+ "loss": 0.0295,
1277
+ "step": 900
1278
+ },
1279
+ {
1280
+ "epoch": 226.32,
1281
+ "grad_norm": 0.03379100188612938,
1282
+ "learning_rate": 5.131729667812142e-05,
1283
+ "loss": 0.0289,
1284
+ "step": 905
1285
+ },
1286
+ {
1287
+ "epoch": 227.64,
1288
+ "grad_norm": 0.03175675496459007,
1289
+ "learning_rate": 5.1030927835051544e-05,
1290
+ "loss": 0.0272,
1291
+ "step": 910
1292
+ },
1293
+ {
1294
+ "epoch": 228.96,
1295
+ "grad_norm": 0.0344826877117157,
1296
+ "learning_rate": 5.074455899198167e-05,
1297
+ "loss": 0.0308,
1298
+ "step": 915
1299
+ },
1300
+ {
1301
+ "epoch": 230.0,
1302
+ "grad_norm": 0.1841171830892563,
1303
+ "learning_rate": 5.04581901489118e-05,
1304
+ "loss": 0.0308,
1305
+ "step": 920
1306
+ },
1307
+ {
1308
+ "epoch": 231.32,
1309
+ "grad_norm": 0.03660387173295021,
1310
+ "learning_rate": 5.0171821305841924e-05,
1311
+ "loss": 0.0326,
1312
+ "step": 925
1313
+ },
1314
+ {
1315
+ "epoch": 232.64,
1316
+ "grad_norm": 0.03065328672528267,
1317
+ "learning_rate": 4.988545246277205e-05,
1318
+ "loss": 0.0295,
1319
+ "step": 930
1320
+ },
1321
+ {
1322
+ "epoch": 233.96,
1323
+ "grad_norm": 0.03993593156337738,
1324
+ "learning_rate": 4.9599083619702184e-05,
1325
+ "loss": 0.0292,
1326
+ "step": 935
1327
+ },
1328
+ {
1329
+ "epoch": 235.0,
1330
+ "grad_norm": 0.10738981515169144,
1331
+ "learning_rate": 4.931271477663231e-05,
1332
+ "loss": 0.0302,
1333
+ "step": 940
1334
+ },
1335
+ {
1336
+ "epoch": 236.32,
1337
+ "grad_norm": 0.03143048286437988,
1338
+ "learning_rate": 4.902634593356243e-05,
1339
+ "loss": 0.0278,
1340
+ "step": 945
1341
+ },
1342
+ {
1343
+ "epoch": 237.64,
1344
+ "grad_norm": 0.028968214988708496,
1345
+ "learning_rate": 4.8739977090492556e-05,
1346
+ "loss": 0.0271,
1347
+ "step": 950
1348
+ },
1349
+ {
1350
+ "epoch": 238.96,
1351
+ "grad_norm": 0.038674987852573395,
1352
+ "learning_rate": 4.845360824742268e-05,
1353
+ "loss": 0.0297,
1354
+ "step": 955
1355
+ },
1356
+ {
1357
+ "epoch": 240.0,
1358
+ "grad_norm": 0.10797161608934402,
1359
+ "learning_rate": 4.816723940435281e-05,
1360
+ "loss": 0.0278,
1361
+ "step": 960
1362
+ },
1363
+ {
1364
+ "epoch": 241.32,
1365
+ "grad_norm": 0.03592285141348839,
1366
+ "learning_rate": 4.7880870561282936e-05,
1367
+ "loss": 0.0281,
1368
+ "step": 965
1369
+ },
1370
+ {
1371
+ "epoch": 242.64,
1372
+ "grad_norm": 0.031206540763378143,
1373
+ "learning_rate": 4.7594501718213055e-05,
1374
+ "loss": 0.031,
1375
+ "step": 970
1376
+ },
1377
+ {
1378
+ "epoch": 243.96,
1379
+ "grad_norm": 0.03692101314663887,
1380
+ "learning_rate": 4.730813287514318e-05,
1381
+ "loss": 0.0276,
1382
+ "step": 975
1383
+ },
1384
+ {
1385
+ "epoch": 245.0,
1386
+ "grad_norm": 0.1415632963180542,
1387
+ "learning_rate": 4.7021764032073315e-05,
1388
+ "loss": 0.0325,
1389
+ "step": 980
1390
+ },
1391
+ {
1392
+ "epoch": 246.32,
1393
+ "grad_norm": 0.0346578024327755,
1394
+ "learning_rate": 4.673539518900344e-05,
1395
+ "loss": 0.0291,
1396
+ "step": 985
1397
+ },
1398
+ {
1399
+ "epoch": 247.64,
1400
+ "grad_norm": 0.036887165158987045,
1401
+ "learning_rate": 4.644902634593357e-05,
1402
+ "loss": 0.0279,
1403
+ "step": 990
1404
+ },
1405
+ {
1406
+ "epoch": 248.96,
1407
+ "grad_norm": 0.03107571043074131,
1408
+ "learning_rate": 4.6162657502863694e-05,
1409
+ "loss": 0.0277,
1410
+ "step": 995
1411
+ },
1412
+ {
1413
+ "epoch": 250.0,
1414
+ "grad_norm": 0.13857436180114746,
1415
+ "learning_rate": 4.5876288659793814e-05,
1416
+ "loss": 0.0287,
1417
+ "step": 1000
1418
+ },
1419
+ {
1420
+ "epoch": 251.32,
1421
+ "grad_norm": 0.03328908607363701,
1422
+ "learning_rate": 4.558991981672394e-05,
1423
+ "loss": 0.0275,
1424
+ "step": 1005
1425
+ },
1426
+ {
1427
+ "epoch": 252.64,
1428
+ "grad_norm": 0.03218206763267517,
1429
+ "learning_rate": 4.530355097365407e-05,
1430
+ "loss": 0.0295,
1431
+ "step": 1010
1432
+ },
1433
+ {
1434
+ "epoch": 253.96,
1435
+ "grad_norm": 0.030677294358611107,
1436
+ "learning_rate": 4.5017182130584194e-05,
1437
+ "loss": 0.0288,
1438
+ "step": 1015
1439
+ },
1440
+ {
1441
+ "epoch": 255.0,
1442
+ "grad_norm": 0.08906098455190659,
1443
+ "learning_rate": 4.473081328751432e-05,
1444
+ "loss": 0.0283,
1445
+ "step": 1020
1446
+ },
1447
+ {
1448
+ "epoch": 256.32,
1449
+ "grad_norm": 0.0315646268427372,
1450
+ "learning_rate": 4.4444444444444447e-05,
1451
+ "loss": 0.0292,
1452
+ "step": 1025
1453
+ },
1454
+ {
1455
+ "epoch": 257.64,
1456
+ "grad_norm": 0.0322076752781868,
1457
+ "learning_rate": 4.415807560137457e-05,
1458
+ "loss": 0.0286,
1459
+ "step": 1030
1460
+ },
1461
+ {
1462
+ "epoch": 258.96,
1463
+ "grad_norm": 0.03561684116721153,
1464
+ "learning_rate": 4.38717067583047e-05,
1465
+ "loss": 0.0266,
1466
+ "step": 1035
1467
+ },
1468
+ {
1469
+ "epoch": 260.0,
1470
+ "grad_norm": 0.1383010447025299,
1471
+ "learning_rate": 4.3585337915234826e-05,
1472
+ "loss": 0.0291,
1473
+ "step": 1040
1474
+ },
1475
+ {
1476
+ "epoch": 261.32,
1477
+ "grad_norm": 0.02982248179614544,
1478
+ "learning_rate": 4.329896907216495e-05,
1479
+ "loss": 0.0281,
1480
+ "step": 1045
1481
+ },
1482
+ {
1483
+ "epoch": 262.64,
1484
+ "grad_norm": 0.03563191369175911,
1485
+ "learning_rate": 4.301260022909508e-05,
1486
+ "loss": 0.0285,
1487
+ "step": 1050
1488
+ },
1489
+ {
1490
+ "epoch": 263.96,
1491
+ "grad_norm": 0.03730940818786621,
1492
+ "learning_rate": 4.27262313860252e-05,
1493
+ "loss": 0.0298,
1494
+ "step": 1055
1495
+ },
1496
+ {
1497
+ "epoch": 265.0,
1498
+ "grad_norm": 0.12043489515781403,
1499
+ "learning_rate": 4.2439862542955325e-05,
1500
+ "loss": 0.029,
1501
+ "step": 1060
1502
+ },
1503
+ {
1504
+ "epoch": 266.32,
1505
+ "grad_norm": 0.03577538579702377,
1506
+ "learning_rate": 4.215349369988545e-05,
1507
+ "loss": 0.0304,
1508
+ "step": 1065
1509
+ },
1510
+ {
1511
+ "epoch": 267.64,
1512
+ "grad_norm": 0.035051047801971436,
1513
+ "learning_rate": 4.1867124856815585e-05,
1514
+ "loss": 0.0292,
1515
+ "step": 1070
1516
+ },
1517
+ {
1518
+ "epoch": 268.96,
1519
+ "grad_norm": 0.03524423763155937,
1520
+ "learning_rate": 4.158075601374571e-05,
1521
+ "loss": 0.029,
1522
+ "step": 1075
1523
+ },
1524
+ {
1525
+ "epoch": 270.0,
1526
+ "grad_norm": 0.11722230911254883,
1527
+ "learning_rate": 4.129438717067583e-05,
1528
+ "loss": 0.0286,
1529
+ "step": 1080
1530
+ },
1531
+ {
1532
+ "epoch": 271.32,
1533
+ "grad_norm": 0.0350823737680912,
1534
+ "learning_rate": 4.100801832760596e-05,
1535
+ "loss": 0.0295,
1536
+ "step": 1085
1537
+ },
1538
+ {
1539
+ "epoch": 272.64,
1540
+ "grad_norm": 0.03372941538691521,
1541
+ "learning_rate": 4.0721649484536084e-05,
1542
+ "loss": 0.0288,
1543
+ "step": 1090
1544
+ },
1545
+ {
1546
+ "epoch": 273.96,
1547
+ "grad_norm": 0.028644917532801628,
1548
+ "learning_rate": 4.043528064146621e-05,
1549
+ "loss": 0.0326,
1550
+ "step": 1095
1551
+ },
1552
+ {
1553
+ "epoch": 275.0,
1554
+ "grad_norm": 0.10958810150623322,
1555
+ "learning_rate": 4.014891179839634e-05,
1556
+ "loss": 0.0293,
1557
+ "step": 1100
1558
+ },
1559
+ {
1560
+ "epoch": 276.32,
1561
+ "grad_norm": 0.03524491935968399,
1562
+ "learning_rate": 3.9862542955326463e-05,
1563
+ "loss": 0.0289,
1564
+ "step": 1105
1565
+ },
1566
+ {
1567
+ "epoch": 277.64,
1568
+ "grad_norm": 0.028043361380696297,
1569
+ "learning_rate": 3.957617411225659e-05,
1570
+ "loss": 0.0292,
1571
+ "step": 1110
1572
+ },
1573
+ {
1574
+ "epoch": 278.96,
1575
+ "grad_norm": 0.03574656322598457,
1576
+ "learning_rate": 3.9289805269186716e-05,
1577
+ "loss": 0.028,
1578
+ "step": 1115
1579
+ },
1580
+ {
1581
+ "epoch": 280.0,
1582
+ "grad_norm": 0.12416456639766693,
1583
+ "learning_rate": 3.900343642611684e-05,
1584
+ "loss": 0.0278,
1585
+ "step": 1120
1586
+ },
1587
+ {
1588
+ "epoch": 281.32,
1589
+ "grad_norm": 0.02984347939491272,
1590
+ "learning_rate": 3.871706758304697e-05,
1591
+ "loss": 0.0324,
1592
+ "step": 1125
1593
+ },
1594
+ {
1595
+ "epoch": 282.64,
1596
+ "grad_norm": 0.03649289906024933,
1597
+ "learning_rate": 3.8430698739977096e-05,
1598
+ "loss": 0.0281,
1599
+ "step": 1130
1600
+ },
1601
+ {
1602
+ "epoch": 283.96,
1603
+ "grad_norm": 0.03943822532892227,
1604
+ "learning_rate": 3.8144329896907216e-05,
1605
+ "loss": 0.0268,
1606
+ "step": 1135
1607
+ },
1608
+ {
1609
+ "epoch": 285.0,
1610
+ "grad_norm": 0.14334431290626526,
1611
+ "learning_rate": 3.785796105383734e-05,
1612
+ "loss": 0.0305,
1613
+ "step": 1140
1614
+ },
1615
+ {
1616
+ "epoch": 286.32,
1617
+ "grad_norm": 0.030261779204010963,
1618
+ "learning_rate": 3.757159221076747e-05,
1619
+ "loss": 0.028,
1620
+ "step": 1145
1621
+ },
1622
+ {
1623
+ "epoch": 287.64,
1624
+ "grad_norm": 0.03134704381227493,
1625
+ "learning_rate": 3.7285223367697595e-05,
1626
+ "loss": 0.0267,
1627
+ "step": 1150
1628
+ },
1629
+ {
1630
+ "epoch": 288.96,
1631
+ "grad_norm": 0.031728796660900116,
1632
+ "learning_rate": 3.699885452462772e-05,
1633
+ "loss": 0.028,
1634
+ "step": 1155
1635
+ },
1636
+ {
1637
+ "epoch": 290.0,
1638
+ "grad_norm": 0.15487806499004364,
1639
+ "learning_rate": 3.671248568155785e-05,
1640
+ "loss": 0.031,
1641
+ "step": 1160
1642
+ },
1643
+ {
1644
+ "epoch": 291.32,
1645
+ "grad_norm": 0.033745523542165756,
1646
+ "learning_rate": 3.6426116838487974e-05,
1647
+ "loss": 0.0298,
1648
+ "step": 1165
1649
+ },
1650
+ {
1651
+ "epoch": 292.64,
1652
+ "grad_norm": 0.026857230812311172,
1653
+ "learning_rate": 3.61397479954181e-05,
1654
+ "loss": 0.0265,
1655
+ "step": 1170
1656
+ },
1657
+ {
1658
+ "epoch": 293.96,
1659
+ "grad_norm": 0.03467594459652901,
1660
+ "learning_rate": 3.585337915234823e-05,
1661
+ "loss": 0.0291,
1662
+ "step": 1175
1663
+ },
1664
+ {
1665
+ "epoch": 295.0,
1666
+ "grad_norm": 0.1255461573600769,
1667
+ "learning_rate": 3.5567010309278354e-05,
1668
+ "loss": 0.0304,
1669
+ "step": 1180
1670
+ },
1671
+ {
1672
+ "epoch": 296.32,
1673
+ "grad_norm": 0.03569836914539337,
1674
+ "learning_rate": 3.528064146620848e-05,
1675
+ "loss": 0.0275,
1676
+ "step": 1185
1677
+ },
1678
+ {
1679
+ "epoch": 297.64,
1680
+ "grad_norm": 0.03207559511065483,
1681
+ "learning_rate": 3.49942726231386e-05,
1682
+ "loss": 0.0288,
1683
+ "step": 1190
1684
+ },
1685
+ {
1686
+ "epoch": 298.96,
1687
+ "grad_norm": 0.03445427492260933,
1688
+ "learning_rate": 3.4707903780068726e-05,
1689
+ "loss": 0.0274,
1690
+ "step": 1195
1691
+ },
1692
+ {
1693
+ "epoch": 300.0,
1694
+ "grad_norm": 0.11089900881052017,
1695
+ "learning_rate": 3.442153493699885e-05,
1696
+ "loss": 0.0268,
1697
+ "step": 1200
1698
+ }
1699
+ ],
1700
+ "logging_steps": 5,
1701
+ "max_steps": 1800,
1702
+ "num_input_tokens_seen": 0,
1703
+ "num_train_epochs": 450,
1704
+ "save_steps": 300,
1705
+ "stateful_callbacks": {
1706
+ "TrainerControl": {
1707
+ "args": {
1708
+ "should_epoch_stop": false,
1709
+ "should_evaluate": false,
1710
+ "should_log": false,
1711
+ "should_save": true,
1712
+ "should_training_stop": false
1713
+ },
1714
+ "attributes": {}
1715
+ }
1716
+ },
1717
+ "total_flos": 1.0205427400704e+18,
1718
+ "train_batch_size": 2,
1719
+ "trial_name": null,
1720
+ "trial_params": null
1721
+ }
Mu-Math/group_01/checkpoints/checkpoint-1200/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Mu-Math/group_01/checkpoints/checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,2141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 375.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.32,
14
+ "grad_norm": 11.867908477783203,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.9204,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 1.32,
21
+ "grad_norm": 7.492858409881592,
22
+ "learning_rate": 7.4074074074074075e-06,
23
+ "loss": 1.8831,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 2.64,
28
+ "grad_norm": 3.9426615238189697,
29
+ "learning_rate": 1.6666666666666667e-05,
30
+ "loss": 1.6453,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 3.96,
35
+ "grad_norm": 1.769984483718872,
36
+ "learning_rate": 2.5925925925925925e-05,
37
+ "loss": 1.2506,
38
+ "step": 15
39
+ },
40
+ {
41
+ "epoch": 5.0,
42
+ "grad_norm": 1.108256220817566,
43
+ "learning_rate": 3.518518518518519e-05,
44
+ "loss": 1.0012,
45
+ "step": 20
46
+ },
47
+ {
48
+ "epoch": 6.32,
49
+ "grad_norm": 0.5219796299934387,
50
+ "learning_rate": 4.4444444444444447e-05,
51
+ "loss": 0.8034,
52
+ "step": 25
53
+ },
54
+ {
55
+ "epoch": 7.64,
56
+ "grad_norm": 0.6449305415153503,
57
+ "learning_rate": 5.370370370370371e-05,
58
+ "loss": 0.6539,
59
+ "step": 30
60
+ },
61
+ {
62
+ "epoch": 8.96,
63
+ "grad_norm": 0.580233633518219,
64
+ "learning_rate": 6.296296296296296e-05,
65
+ "loss": 0.5474,
66
+ "step": 35
67
+ },
68
+ {
69
+ "epoch": 10.0,
70
+ "grad_norm": 1.5570186376571655,
71
+ "learning_rate": 7.222222222222222e-05,
72
+ "loss": 0.4811,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 11.32,
77
+ "grad_norm": 0.5841688513755798,
78
+ "learning_rate": 8.148148148148148e-05,
79
+ "loss": 0.3477,
80
+ "step": 45
81
+ },
82
+ {
83
+ "epoch": 12.64,
84
+ "grad_norm": 0.7968279719352722,
85
+ "learning_rate": 9.074074074074075e-05,
86
+ "loss": 0.2089,
87
+ "step": 50
88
+ },
89
+ {
90
+ "epoch": 13.96,
91
+ "grad_norm": 0.8396451473236084,
92
+ "learning_rate": 0.0001,
93
+ "loss": 0.1357,
94
+ "step": 55
95
+ },
96
+ {
97
+ "epoch": 15.0,
98
+ "grad_norm": 2.7755286693573,
99
+ "learning_rate": 9.971363115693013e-05,
100
+ "loss": 0.1235,
101
+ "step": 60
102
+ },
103
+ {
104
+ "epoch": 16.32,
105
+ "grad_norm": 0.6953228116035461,
106
+ "learning_rate": 9.942726231386026e-05,
107
+ "loss": 0.0755,
108
+ "step": 65
109
+ },
110
+ {
111
+ "epoch": 17.64,
112
+ "grad_norm": 1.1248857975006104,
113
+ "learning_rate": 9.914089347079038e-05,
114
+ "loss": 0.0546,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 18.96,
119
+ "grad_norm": 0.5247378945350647,
120
+ "learning_rate": 9.885452462772051e-05,
121
+ "loss": 0.0602,
122
+ "step": 75
123
+ },
124
+ {
125
+ "epoch": 20.0,
126
+ "grad_norm": 1.827890157699585,
127
+ "learning_rate": 9.856815578465064e-05,
128
+ "loss": 0.0501,
129
+ "step": 80
130
+ },
131
+ {
132
+ "epoch": 21.32,
133
+ "grad_norm": 0.23602119088172913,
134
+ "learning_rate": 9.828178694158075e-05,
135
+ "loss": 0.0443,
136
+ "step": 85
137
+ },
138
+ {
139
+ "epoch": 22.64,
140
+ "grad_norm": 0.2811133861541748,
141
+ "learning_rate": 9.799541809851088e-05,
142
+ "loss": 0.0448,
143
+ "step": 90
144
+ },
145
+ {
146
+ "epoch": 23.96,
147
+ "grad_norm": 0.29150959849357605,
148
+ "learning_rate": 9.7709049255441e-05,
149
+ "loss": 0.0426,
150
+ "step": 95
151
+ },
152
+ {
153
+ "epoch": 25.0,
154
+ "grad_norm": 1.4590457677841187,
155
+ "learning_rate": 9.742268041237114e-05,
156
+ "loss": 0.04,
157
+ "step": 100
158
+ },
159
+ {
160
+ "epoch": 26.32,
161
+ "grad_norm": 0.15335209667682648,
162
+ "learning_rate": 9.713631156930127e-05,
163
+ "loss": 0.0374,
164
+ "step": 105
165
+ },
166
+ {
167
+ "epoch": 27.64,
168
+ "grad_norm": 0.3241201639175415,
169
+ "learning_rate": 9.68499427262314e-05,
170
+ "loss": 0.0378,
171
+ "step": 110
172
+ },
173
+ {
174
+ "epoch": 28.96,
175
+ "grad_norm": 0.18619631230831146,
176
+ "learning_rate": 9.656357388316152e-05,
177
+ "loss": 0.0374,
178
+ "step": 115
179
+ },
180
+ {
181
+ "epoch": 30.0,
182
+ "grad_norm": 0.4512801170349121,
183
+ "learning_rate": 9.627720504009165e-05,
184
+ "loss": 0.0342,
185
+ "step": 120
186
+ },
187
+ {
188
+ "epoch": 31.32,
189
+ "grad_norm": 0.21706914901733398,
190
+ "learning_rate": 9.599083619702178e-05,
191
+ "loss": 0.0369,
192
+ "step": 125
193
+ },
194
+ {
195
+ "epoch": 32.64,
196
+ "grad_norm": 0.42762166261672974,
197
+ "learning_rate": 9.57044673539519e-05,
198
+ "loss": 0.0355,
199
+ "step": 130
200
+ },
201
+ {
202
+ "epoch": 33.96,
203
+ "grad_norm": 0.1793977916240692,
204
+ "learning_rate": 9.541809851088203e-05,
205
+ "loss": 0.0347,
206
+ "step": 135
207
+ },
208
+ {
209
+ "epoch": 35.0,
210
+ "grad_norm": 1.866305947303772,
211
+ "learning_rate": 9.513172966781214e-05,
212
+ "loss": 0.0368,
213
+ "step": 140
214
+ },
215
+ {
216
+ "epoch": 36.32,
217
+ "grad_norm": 0.09879657626152039,
218
+ "learning_rate": 9.484536082474227e-05,
219
+ "loss": 0.0347,
220
+ "step": 145
221
+ },
222
+ {
223
+ "epoch": 37.64,
224
+ "grad_norm": 0.09229481220245361,
225
+ "learning_rate": 9.45589919816724e-05,
226
+ "loss": 0.0338,
227
+ "step": 150
228
+ },
229
+ {
230
+ "epoch": 38.96,
231
+ "grad_norm": 0.11409584432840347,
232
+ "learning_rate": 9.427262313860252e-05,
233
+ "loss": 0.0339,
234
+ "step": 155
235
+ },
236
+ {
237
+ "epoch": 40.0,
238
+ "grad_norm": 0.35678204894065857,
239
+ "learning_rate": 9.398625429553265e-05,
240
+ "loss": 0.0347,
241
+ "step": 160
242
+ },
243
+ {
244
+ "epoch": 41.32,
245
+ "grad_norm": 0.08212767541408539,
246
+ "learning_rate": 9.369988545246277e-05,
247
+ "loss": 0.0345,
248
+ "step": 165
249
+ },
250
+ {
251
+ "epoch": 42.64,
252
+ "grad_norm": 0.07271627336740494,
253
+ "learning_rate": 9.34135166093929e-05,
254
+ "loss": 0.0305,
255
+ "step": 170
256
+ },
257
+ {
258
+ "epoch": 43.96,
259
+ "grad_norm": 0.23211534321308136,
260
+ "learning_rate": 9.312714776632303e-05,
261
+ "loss": 0.0321,
262
+ "step": 175
263
+ },
264
+ {
265
+ "epoch": 45.0,
266
+ "grad_norm": 0.23425568640232086,
267
+ "learning_rate": 9.284077892325315e-05,
268
+ "loss": 0.0334,
269
+ "step": 180
270
+ },
271
+ {
272
+ "epoch": 46.32,
273
+ "grad_norm": 0.07825004309415817,
274
+ "learning_rate": 9.255441008018328e-05,
275
+ "loss": 0.0349,
276
+ "step": 185
277
+ },
278
+ {
279
+ "epoch": 47.64,
280
+ "grad_norm": 0.06621824949979782,
281
+ "learning_rate": 9.22680412371134e-05,
282
+ "loss": 0.0302,
283
+ "step": 190
284
+ },
285
+ {
286
+ "epoch": 48.96,
287
+ "grad_norm": 0.0967830941081047,
288
+ "learning_rate": 9.198167239404353e-05,
289
+ "loss": 0.0316,
290
+ "step": 195
291
+ },
292
+ {
293
+ "epoch": 50.0,
294
+ "grad_norm": 0.39718347787857056,
295
+ "learning_rate": 9.169530355097366e-05,
296
+ "loss": 0.0307,
297
+ "step": 200
298
+ },
299
+ {
300
+ "epoch": 51.32,
301
+ "grad_norm": 0.06881817430257797,
302
+ "learning_rate": 9.140893470790379e-05,
303
+ "loss": 0.0291,
304
+ "step": 205
305
+ },
306
+ {
307
+ "epoch": 52.64,
308
+ "grad_norm": 0.07241260260343552,
309
+ "learning_rate": 9.112256586483391e-05,
310
+ "loss": 0.032,
311
+ "step": 210
312
+ },
313
+ {
314
+ "epoch": 53.96,
315
+ "grad_norm": 0.08191649615764618,
316
+ "learning_rate": 9.083619702176404e-05,
317
+ "loss": 0.0293,
318
+ "step": 215
319
+ },
320
+ {
321
+ "epoch": 55.0,
322
+ "grad_norm": 0.20381148159503937,
323
+ "learning_rate": 9.054982817869416e-05,
324
+ "loss": 0.033,
325
+ "step": 220
326
+ },
327
+ {
328
+ "epoch": 56.32,
329
+ "grad_norm": 0.0765785425901413,
330
+ "learning_rate": 9.026345933562429e-05,
331
+ "loss": 0.0323,
332
+ "step": 225
333
+ },
334
+ {
335
+ "epoch": 57.64,
336
+ "grad_norm": 0.0698801577091217,
337
+ "learning_rate": 8.997709049255442e-05,
338
+ "loss": 0.0324,
339
+ "step": 230
340
+ },
341
+ {
342
+ "epoch": 58.96,
343
+ "grad_norm": 0.08089473098516464,
344
+ "learning_rate": 8.969072164948454e-05,
345
+ "loss": 0.0314,
346
+ "step": 235
347
+ },
348
+ {
349
+ "epoch": 60.0,
350
+ "grad_norm": 0.22270062565803528,
351
+ "learning_rate": 8.940435280641467e-05,
352
+ "loss": 0.0303,
353
+ "step": 240
354
+ },
355
+ {
356
+ "epoch": 61.32,
357
+ "grad_norm": 0.07712433487176895,
358
+ "learning_rate": 8.91179839633448e-05,
359
+ "loss": 0.0324,
360
+ "step": 245
361
+ },
362
+ {
363
+ "epoch": 62.64,
364
+ "grad_norm": 0.05860769376158714,
365
+ "learning_rate": 8.883161512027491e-05,
366
+ "loss": 0.0321,
367
+ "step": 250
368
+ },
369
+ {
370
+ "epoch": 63.96,
371
+ "grad_norm": 0.05999445170164108,
372
+ "learning_rate": 8.854524627720504e-05,
373
+ "loss": 0.0315,
374
+ "step": 255
375
+ },
376
+ {
377
+ "epoch": 65.0,
378
+ "grad_norm": 0.20564565062522888,
379
+ "learning_rate": 8.825887743413516e-05,
380
+ "loss": 0.0364,
381
+ "step": 260
382
+ },
383
+ {
384
+ "epoch": 66.32,
385
+ "grad_norm": 0.0610821433365345,
386
+ "learning_rate": 8.797250859106529e-05,
387
+ "loss": 0.0311,
388
+ "step": 265
389
+ },
390
+ {
391
+ "epoch": 67.64,
392
+ "grad_norm": 0.05693706497550011,
393
+ "learning_rate": 8.768613974799542e-05,
394
+ "loss": 0.0294,
395
+ "step": 270
396
+ },
397
+ {
398
+ "epoch": 68.96,
399
+ "grad_norm": 0.06817185133695602,
400
+ "learning_rate": 8.739977090492554e-05,
401
+ "loss": 0.0284,
402
+ "step": 275
403
+ },
404
+ {
405
+ "epoch": 70.0,
406
+ "grad_norm": 0.17458151280879974,
407
+ "learning_rate": 8.711340206185567e-05,
408
+ "loss": 0.0291,
409
+ "step": 280
410
+ },
411
+ {
412
+ "epoch": 71.32,
413
+ "grad_norm": 0.07353579252958298,
414
+ "learning_rate": 8.682703321878581e-05,
415
+ "loss": 0.0324,
416
+ "step": 285
417
+ },
418
+ {
419
+ "epoch": 72.64,
420
+ "grad_norm": 0.061573781073093414,
421
+ "learning_rate": 8.654066437571594e-05,
422
+ "loss": 0.0305,
423
+ "step": 290
424
+ },
425
+ {
426
+ "epoch": 73.96,
427
+ "grad_norm": 0.07544506341218948,
428
+ "learning_rate": 8.625429553264606e-05,
429
+ "loss": 0.0294,
430
+ "step": 295
431
+ },
432
+ {
433
+ "epoch": 75.0,
434
+ "grad_norm": 0.11100324243307114,
435
+ "learning_rate": 8.596792668957619e-05,
436
+ "loss": 0.03,
437
+ "step": 300
438
+ },
439
+ {
440
+ "epoch": 76.32,
441
+ "grad_norm": 0.0491141714155674,
442
+ "learning_rate": 8.56815578465063e-05,
443
+ "loss": 0.0296,
444
+ "step": 305
445
+ },
446
+ {
447
+ "epoch": 77.64,
448
+ "grad_norm": 0.07668624073266983,
449
+ "learning_rate": 8.539518900343643e-05,
450
+ "loss": 0.0345,
451
+ "step": 310
452
+ },
453
+ {
454
+ "epoch": 78.96,
455
+ "grad_norm": 0.07898231595754623,
456
+ "learning_rate": 8.510882016036655e-05,
457
+ "loss": 0.0306,
458
+ "step": 315
459
+ },
460
+ {
461
+ "epoch": 80.0,
462
+ "grad_norm": 0.3391458988189697,
463
+ "learning_rate": 8.482245131729668e-05,
464
+ "loss": 0.0334,
465
+ "step": 320
466
+ },
467
+ {
468
+ "epoch": 81.32,
469
+ "grad_norm": 0.05544694885611534,
470
+ "learning_rate": 8.453608247422681e-05,
471
+ "loss": 0.0282,
472
+ "step": 325
473
+ },
474
+ {
475
+ "epoch": 82.64,
476
+ "grad_norm": 0.05032579228281975,
477
+ "learning_rate": 8.424971363115693e-05,
478
+ "loss": 0.0323,
479
+ "step": 330
480
+ },
481
+ {
482
+ "epoch": 83.96,
483
+ "grad_norm": 0.05664476007223129,
484
+ "learning_rate": 8.396334478808706e-05,
485
+ "loss": 0.0295,
486
+ "step": 335
487
+ },
488
+ {
489
+ "epoch": 85.0,
490
+ "grad_norm": 0.24190960824489594,
491
+ "learning_rate": 8.367697594501719e-05,
492
+ "loss": 0.033,
493
+ "step": 340
494
+ },
495
+ {
496
+ "epoch": 86.32,
497
+ "grad_norm": 0.05068003758788109,
498
+ "learning_rate": 8.339060710194731e-05,
499
+ "loss": 0.0294,
500
+ "step": 345
501
+ },
502
+ {
503
+ "epoch": 87.64,
504
+ "grad_norm": 0.06719321757555008,
505
+ "learning_rate": 8.310423825887744e-05,
506
+ "loss": 0.0297,
507
+ "step": 350
508
+ },
509
+ {
510
+ "epoch": 88.96,
511
+ "grad_norm": 0.05750493332743645,
512
+ "learning_rate": 8.281786941580757e-05,
513
+ "loss": 0.0276,
514
+ "step": 355
515
+ },
516
+ {
517
+ "epoch": 90.0,
518
+ "grad_norm": 0.17318210005760193,
519
+ "learning_rate": 8.253150057273768e-05,
520
+ "loss": 0.0309,
521
+ "step": 360
522
+ },
523
+ {
524
+ "epoch": 91.32,
525
+ "grad_norm": 0.05168261379003525,
526
+ "learning_rate": 8.224513172966782e-05,
527
+ "loss": 0.0284,
528
+ "step": 365
529
+ },
530
+ {
531
+ "epoch": 92.64,
532
+ "grad_norm": 0.053040292114019394,
533
+ "learning_rate": 8.195876288659795e-05,
534
+ "loss": 0.0314,
535
+ "step": 370
536
+ },
537
+ {
538
+ "epoch": 93.96,
539
+ "grad_norm": 0.06162334978580475,
540
+ "learning_rate": 8.167239404352807e-05,
541
+ "loss": 0.0297,
542
+ "step": 375
543
+ },
544
+ {
545
+ "epoch": 95.0,
546
+ "grad_norm": 0.13474801182746887,
547
+ "learning_rate": 8.13860252004582e-05,
548
+ "loss": 0.0271,
549
+ "step": 380
550
+ },
551
+ {
552
+ "epoch": 96.32,
553
+ "grad_norm": 0.05177682265639305,
554
+ "learning_rate": 8.109965635738833e-05,
555
+ "loss": 0.0301,
556
+ "step": 385
557
+ },
558
+ {
559
+ "epoch": 97.64,
560
+ "grad_norm": 0.04276576265692711,
561
+ "learning_rate": 8.081328751431845e-05,
562
+ "loss": 0.0286,
563
+ "step": 390
564
+ },
565
+ {
566
+ "epoch": 98.96,
567
+ "grad_norm": 0.04698758199810982,
568
+ "learning_rate": 8.052691867124858e-05,
569
+ "loss": 0.0284,
570
+ "step": 395
571
+ },
572
+ {
573
+ "epoch": 100.0,
574
+ "grad_norm": 0.14094208180904388,
575
+ "learning_rate": 8.02405498281787e-05,
576
+ "loss": 0.0302,
577
+ "step": 400
578
+ },
579
+ {
580
+ "epoch": 101.32,
581
+ "grad_norm": 0.0528222993016243,
582
+ "learning_rate": 7.995418098510883e-05,
583
+ "loss": 0.0304,
584
+ "step": 405
585
+ },
586
+ {
587
+ "epoch": 102.64,
588
+ "grad_norm": 0.053034182637929916,
589
+ "learning_rate": 7.966781214203894e-05,
590
+ "loss": 0.0316,
591
+ "step": 410
592
+ },
593
+ {
594
+ "epoch": 103.96,
595
+ "grad_norm": 0.05732697248458862,
596
+ "learning_rate": 7.938144329896907e-05,
597
+ "loss": 0.0295,
598
+ "step": 415
599
+ },
600
+ {
601
+ "epoch": 105.0,
602
+ "grad_norm": 0.17511749267578125,
603
+ "learning_rate": 7.90950744558992e-05,
604
+ "loss": 0.0317,
605
+ "step": 420
606
+ },
607
+ {
608
+ "epoch": 106.32,
609
+ "grad_norm": 0.04588017240166664,
610
+ "learning_rate": 7.880870561282932e-05,
611
+ "loss": 0.0305,
612
+ "step": 425
613
+ },
614
+ {
615
+ "epoch": 107.64,
616
+ "grad_norm": 0.049282800406217575,
617
+ "learning_rate": 7.852233676975945e-05,
618
+ "loss": 0.031,
619
+ "step": 430
620
+ },
621
+ {
622
+ "epoch": 108.96,
623
+ "grad_norm": 0.04937691241502762,
624
+ "learning_rate": 7.823596792668958e-05,
625
+ "loss": 0.0278,
626
+ "step": 435
627
+ },
628
+ {
629
+ "epoch": 110.0,
630
+ "grad_norm": 0.11863432824611664,
631
+ "learning_rate": 7.79495990836197e-05,
632
+ "loss": 0.0316,
633
+ "step": 440
634
+ },
635
+ {
636
+ "epoch": 111.32,
637
+ "grad_norm": 0.04387475177645683,
638
+ "learning_rate": 7.766323024054983e-05,
639
+ "loss": 0.0283,
640
+ "step": 445
641
+ },
642
+ {
643
+ "epoch": 112.64,
644
+ "grad_norm": 0.04409867897629738,
645
+ "learning_rate": 7.737686139747996e-05,
646
+ "loss": 0.0306,
647
+ "step": 450
648
+ },
649
+ {
650
+ "epoch": 113.96,
651
+ "grad_norm": 0.04834749549627304,
652
+ "learning_rate": 7.709049255441008e-05,
653
+ "loss": 0.0302,
654
+ "step": 455
655
+ },
656
+ {
657
+ "epoch": 115.0,
658
+ "grad_norm": 0.1553424447774887,
659
+ "learning_rate": 7.680412371134021e-05,
660
+ "loss": 0.0326,
661
+ "step": 460
662
+ },
663
+ {
664
+ "epoch": 116.32,
665
+ "grad_norm": 0.05963806435465813,
666
+ "learning_rate": 7.651775486827034e-05,
667
+ "loss": 0.0291,
668
+ "step": 465
669
+ },
670
+ {
671
+ "epoch": 117.64,
672
+ "grad_norm": 0.04697559028863907,
673
+ "learning_rate": 7.623138602520046e-05,
674
+ "loss": 0.027,
675
+ "step": 470
676
+ },
677
+ {
678
+ "epoch": 118.96,
679
+ "grad_norm": 0.04225379601120949,
680
+ "learning_rate": 7.594501718213059e-05,
681
+ "loss": 0.0343,
682
+ "step": 475
683
+ },
684
+ {
685
+ "epoch": 120.0,
686
+ "grad_norm": 0.1076933965086937,
687
+ "learning_rate": 7.565864833906071e-05,
688
+ "loss": 0.0288,
689
+ "step": 480
690
+ },
691
+ {
692
+ "epoch": 121.32,
693
+ "grad_norm": 0.04540383443236351,
694
+ "learning_rate": 7.537227949599084e-05,
695
+ "loss": 0.0291,
696
+ "step": 485
697
+ },
698
+ {
699
+ "epoch": 122.64,
700
+ "grad_norm": 0.05459335818886757,
701
+ "learning_rate": 7.508591065292097e-05,
702
+ "loss": 0.0289,
703
+ "step": 490
704
+ },
705
+ {
706
+ "epoch": 123.96,
707
+ "grad_norm": 0.05171333625912666,
708
+ "learning_rate": 7.47995418098511e-05,
709
+ "loss": 0.0284,
710
+ "step": 495
711
+ },
712
+ {
713
+ "epoch": 125.0,
714
+ "grad_norm": 0.08606769144535065,
715
+ "learning_rate": 7.451317296678122e-05,
716
+ "loss": 0.0314,
717
+ "step": 500
718
+ },
719
+ {
720
+ "epoch": 126.32,
721
+ "grad_norm": 0.040535662323236465,
722
+ "learning_rate": 7.422680412371135e-05,
723
+ "loss": 0.028,
724
+ "step": 505
725
+ },
726
+ {
727
+ "epoch": 127.64,
728
+ "grad_norm": 0.04621696099638939,
729
+ "learning_rate": 7.394043528064147e-05,
730
+ "loss": 0.0281,
731
+ "step": 510
732
+ },
733
+ {
734
+ "epoch": 128.96,
735
+ "grad_norm": 0.04407593980431557,
736
+ "learning_rate": 7.36540664375716e-05,
737
+ "loss": 0.0309,
738
+ "step": 515
739
+ },
740
+ {
741
+ "epoch": 130.0,
742
+ "grad_norm": 0.24090737104415894,
743
+ "learning_rate": 7.336769759450171e-05,
744
+ "loss": 0.0302,
745
+ "step": 520
746
+ },
747
+ {
748
+ "epoch": 131.32,
749
+ "grad_norm": 0.051712971180677414,
750
+ "learning_rate": 7.308132875143184e-05,
751
+ "loss": 0.0305,
752
+ "step": 525
753
+ },
754
+ {
755
+ "epoch": 132.64,
756
+ "grad_norm": 0.0373610258102417,
757
+ "learning_rate": 7.279495990836197e-05,
758
+ "loss": 0.0283,
759
+ "step": 530
760
+ },
761
+ {
762
+ "epoch": 133.96,
763
+ "grad_norm": 0.04424213245511055,
764
+ "learning_rate": 7.250859106529209e-05,
765
+ "loss": 0.0317,
766
+ "step": 535
767
+ },
768
+ {
769
+ "epoch": 135.0,
770
+ "grad_norm": 0.09113436192274094,
771
+ "learning_rate": 7.222222222222222e-05,
772
+ "loss": 0.0302,
773
+ "step": 540
774
+ },
775
+ {
776
+ "epoch": 136.32,
777
+ "grad_norm": 0.03745009005069733,
778
+ "learning_rate": 7.193585337915235e-05,
779
+ "loss": 0.0315,
780
+ "step": 545
781
+ },
782
+ {
783
+ "epoch": 137.64,
784
+ "grad_norm": 0.04058730602264404,
785
+ "learning_rate": 7.164948453608247e-05,
786
+ "loss": 0.0312,
787
+ "step": 550
788
+ },
789
+ {
790
+ "epoch": 138.96,
791
+ "grad_norm": 0.046279069036245346,
792
+ "learning_rate": 7.136311569301261e-05,
793
+ "loss": 0.0295,
794
+ "step": 555
795
+ },
796
+ {
797
+ "epoch": 140.0,
798
+ "grad_norm": 0.17239141464233398,
799
+ "learning_rate": 7.107674684994274e-05,
800
+ "loss": 0.0307,
801
+ "step": 560
802
+ },
803
+ {
804
+ "epoch": 141.32,
805
+ "grad_norm": 0.036460030823946,
806
+ "learning_rate": 7.079037800687286e-05,
807
+ "loss": 0.0284,
808
+ "step": 565
809
+ },
810
+ {
811
+ "epoch": 142.64,
812
+ "grad_norm": 0.03434258699417114,
813
+ "learning_rate": 7.050400916380299e-05,
814
+ "loss": 0.0283,
815
+ "step": 570
816
+ },
817
+ {
818
+ "epoch": 143.96,
819
+ "grad_norm": 0.0470467284321785,
820
+ "learning_rate": 7.02176403207331e-05,
821
+ "loss": 0.0296,
822
+ "step": 575
823
+ },
824
+ {
825
+ "epoch": 145.0,
826
+ "grad_norm": 0.07163394242525101,
827
+ "learning_rate": 6.993127147766323e-05,
828
+ "loss": 0.0256,
829
+ "step": 580
830
+ },
831
+ {
832
+ "epoch": 146.32,
833
+ "grad_norm": 0.042208388447761536,
834
+ "learning_rate": 6.964490263459336e-05,
835
+ "loss": 0.0298,
836
+ "step": 585
837
+ },
838
+ {
839
+ "epoch": 147.64,
840
+ "grad_norm": 0.04421050846576691,
841
+ "learning_rate": 6.935853379152348e-05,
842
+ "loss": 0.0274,
843
+ "step": 590
844
+ },
845
+ {
846
+ "epoch": 148.96,
847
+ "grad_norm": 0.047223106026649475,
848
+ "learning_rate": 6.907216494845361e-05,
849
+ "loss": 0.0311,
850
+ "step": 595
851
+ },
852
+ {
853
+ "epoch": 150.0,
854
+ "grad_norm": 0.1724609136581421,
855
+ "learning_rate": 6.878579610538374e-05,
856
+ "loss": 0.0302,
857
+ "step": 600
858
+ },
859
+ {
860
+ "epoch": 151.32,
861
+ "grad_norm": 0.042247697710990906,
862
+ "learning_rate": 6.849942726231386e-05,
863
+ "loss": 0.0287,
864
+ "step": 605
865
+ },
866
+ {
867
+ "epoch": 152.64,
868
+ "grad_norm": 0.05167734622955322,
869
+ "learning_rate": 6.821305841924399e-05,
870
+ "loss": 0.0279,
871
+ "step": 610
872
+ },
873
+ {
874
+ "epoch": 153.96,
875
+ "grad_norm": 0.03621920198202133,
876
+ "learning_rate": 6.792668957617412e-05,
877
+ "loss": 0.0291,
878
+ "step": 615
879
+ },
880
+ {
881
+ "epoch": 155.0,
882
+ "grad_norm": 0.22533060610294342,
883
+ "learning_rate": 6.764032073310424e-05,
884
+ "loss": 0.0354,
885
+ "step": 620
886
+ },
887
+ {
888
+ "epoch": 156.32,
889
+ "grad_norm": 0.03441638499498367,
890
+ "learning_rate": 6.735395189003437e-05,
891
+ "loss": 0.0285,
892
+ "step": 625
893
+ },
894
+ {
895
+ "epoch": 157.64,
896
+ "grad_norm": 0.03728373721241951,
897
+ "learning_rate": 6.706758304696448e-05,
898
+ "loss": 0.0299,
899
+ "step": 630
900
+ },
901
+ {
902
+ "epoch": 158.96,
903
+ "grad_norm": 0.043604422360658646,
904
+ "learning_rate": 6.678121420389462e-05,
905
+ "loss": 0.0292,
906
+ "step": 635
907
+ },
908
+ {
909
+ "epoch": 160.0,
910
+ "grad_norm": 0.17142102122306824,
911
+ "learning_rate": 6.649484536082475e-05,
912
+ "loss": 0.0331,
913
+ "step": 640
914
+ },
915
+ {
916
+ "epoch": 161.32,
917
+ "grad_norm": 0.03554172441363335,
918
+ "learning_rate": 6.620847651775487e-05,
919
+ "loss": 0.0305,
920
+ "step": 645
921
+ },
922
+ {
923
+ "epoch": 162.64,
924
+ "grad_norm": 0.043817318975925446,
925
+ "learning_rate": 6.5922107674685e-05,
926
+ "loss": 0.0305,
927
+ "step": 650
928
+ },
929
+ {
930
+ "epoch": 163.96,
931
+ "grad_norm": 0.04247381538152695,
932
+ "learning_rate": 6.563573883161513e-05,
933
+ "loss": 0.0302,
934
+ "step": 655
935
+ },
936
+ {
937
+ "epoch": 165.0,
938
+ "grad_norm": 0.09436971694231033,
939
+ "learning_rate": 6.534936998854525e-05,
940
+ "loss": 0.0292,
941
+ "step": 660
942
+ },
943
+ {
944
+ "epoch": 166.32,
945
+ "grad_norm": 0.04177865758538246,
946
+ "learning_rate": 6.506300114547538e-05,
947
+ "loss": 0.0281,
948
+ "step": 665
949
+ },
950
+ {
951
+ "epoch": 167.64,
952
+ "grad_norm": 0.04079804942011833,
953
+ "learning_rate": 6.477663230240551e-05,
954
+ "loss": 0.0288,
955
+ "step": 670
956
+ },
957
+ {
958
+ "epoch": 168.96,
959
+ "grad_norm": 0.039114974439144135,
960
+ "learning_rate": 6.449026345933563e-05,
961
+ "loss": 0.0275,
962
+ "step": 675
963
+ },
964
+ {
965
+ "epoch": 170.0,
966
+ "grad_norm": 0.11661799997091293,
967
+ "learning_rate": 6.420389461626576e-05,
968
+ "loss": 0.0288,
969
+ "step": 680
970
+ },
971
+ {
972
+ "epoch": 171.32,
973
+ "grad_norm": 0.03423461318016052,
974
+ "learning_rate": 6.391752577319587e-05,
975
+ "loss": 0.0306,
976
+ "step": 685
977
+ },
978
+ {
979
+ "epoch": 172.64,
980
+ "grad_norm": 0.03615871071815491,
981
+ "learning_rate": 6.3631156930126e-05,
982
+ "loss": 0.031,
983
+ "step": 690
984
+ },
985
+ {
986
+ "epoch": 173.96,
987
+ "grad_norm": 0.04067518189549446,
988
+ "learning_rate": 6.334478808705613e-05,
989
+ "loss": 0.0312,
990
+ "step": 695
991
+ },
992
+ {
993
+ "epoch": 175.0,
994
+ "grad_norm": 0.11094173789024353,
995
+ "learning_rate": 6.305841924398625e-05,
996
+ "loss": 0.0287,
997
+ "step": 700
998
+ },
999
+ {
1000
+ "epoch": 176.32,
1001
+ "grad_norm": 0.03173477575182915,
1002
+ "learning_rate": 6.277205040091638e-05,
1003
+ "loss": 0.0296,
1004
+ "step": 705
1005
+ },
1006
+ {
1007
+ "epoch": 177.64,
1008
+ "grad_norm": 0.0374116450548172,
1009
+ "learning_rate": 6.24856815578465e-05,
1010
+ "loss": 0.0292,
1011
+ "step": 710
1012
+ },
1013
+ {
1014
+ "epoch": 178.96,
1015
+ "grad_norm": 0.03814936801791191,
1016
+ "learning_rate": 6.219931271477663e-05,
1017
+ "loss": 0.0273,
1018
+ "step": 715
1019
+ },
1020
+ {
1021
+ "epoch": 180.0,
1022
+ "grad_norm": 0.14255362749099731,
1023
+ "learning_rate": 6.191294387170676e-05,
1024
+ "loss": 0.0335,
1025
+ "step": 720
1026
+ },
1027
+ {
1028
+ "epoch": 181.32,
1029
+ "grad_norm": 0.04104507714509964,
1030
+ "learning_rate": 6.162657502863689e-05,
1031
+ "loss": 0.0303,
1032
+ "step": 725
1033
+ },
1034
+ {
1035
+ "epoch": 182.64,
1036
+ "grad_norm": 0.037353888154029846,
1037
+ "learning_rate": 6.134020618556701e-05,
1038
+ "loss": 0.0271,
1039
+ "step": 730
1040
+ },
1041
+ {
1042
+ "epoch": 183.96,
1043
+ "grad_norm": 0.03552788123488426,
1044
+ "learning_rate": 6.105383734249714e-05,
1045
+ "loss": 0.0288,
1046
+ "step": 735
1047
+ },
1048
+ {
1049
+ "epoch": 185.0,
1050
+ "grad_norm": 0.09345243126153946,
1051
+ "learning_rate": 6.076746849942726e-05,
1052
+ "loss": 0.0271,
1053
+ "step": 740
1054
+ },
1055
+ {
1056
+ "epoch": 186.32,
1057
+ "grad_norm": 0.031304650008678436,
1058
+ "learning_rate": 6.0481099656357384e-05,
1059
+ "loss": 0.0287,
1060
+ "step": 745
1061
+ },
1062
+ {
1063
+ "epoch": 187.64,
1064
+ "grad_norm": 0.03588686138391495,
1065
+ "learning_rate": 6.019473081328752e-05,
1066
+ "loss": 0.0292,
1067
+ "step": 750
1068
+ },
1069
+ {
1070
+ "epoch": 188.96,
1071
+ "grad_norm": 0.03166257590055466,
1072
+ "learning_rate": 5.9908361970217644e-05,
1073
+ "loss": 0.0277,
1074
+ "step": 755
1075
+ },
1076
+ {
1077
+ "epoch": 190.0,
1078
+ "grad_norm": 0.09115266799926758,
1079
+ "learning_rate": 5.962199312714777e-05,
1080
+ "loss": 0.0286,
1081
+ "step": 760
1082
+ },
1083
+ {
1084
+ "epoch": 191.32,
1085
+ "grad_norm": 0.028432967141270638,
1086
+ "learning_rate": 5.93356242840779e-05,
1087
+ "loss": 0.0277,
1088
+ "step": 765
1089
+ },
1090
+ {
1091
+ "epoch": 192.64,
1092
+ "grad_norm": 0.04126034304499626,
1093
+ "learning_rate": 5.904925544100802e-05,
1094
+ "loss": 0.0315,
1095
+ "step": 770
1096
+ },
1097
+ {
1098
+ "epoch": 193.96,
1099
+ "grad_norm": 0.04166596010327339,
1100
+ "learning_rate": 5.876288659793815e-05,
1101
+ "loss": 0.0281,
1102
+ "step": 775
1103
+ },
1104
+ {
1105
+ "epoch": 195.0,
1106
+ "grad_norm": 0.11017812788486481,
1107
+ "learning_rate": 5.8476517754868276e-05,
1108
+ "loss": 0.0285,
1109
+ "step": 780
1110
+ },
1111
+ {
1112
+ "epoch": 196.32,
1113
+ "grad_norm": 0.04071119427680969,
1114
+ "learning_rate": 5.81901489117984e-05,
1115
+ "loss": 0.0289,
1116
+ "step": 785
1117
+ },
1118
+ {
1119
+ "epoch": 197.64,
1120
+ "grad_norm": 0.03756481036543846,
1121
+ "learning_rate": 5.790378006872853e-05,
1122
+ "loss": 0.0276,
1123
+ "step": 790
1124
+ },
1125
+ {
1126
+ "epoch": 198.96,
1127
+ "grad_norm": 0.039780210703611374,
1128
+ "learning_rate": 5.761741122565865e-05,
1129
+ "loss": 0.0282,
1130
+ "step": 795
1131
+ },
1132
+ {
1133
+ "epoch": 200.0,
1134
+ "grad_norm": 0.12418342381715775,
1135
+ "learning_rate": 5.7331042382588775e-05,
1136
+ "loss": 0.0296,
1137
+ "step": 800
1138
+ },
1139
+ {
1140
+ "epoch": 201.32,
1141
+ "grad_norm": 0.0338447242975235,
1142
+ "learning_rate": 5.70446735395189e-05,
1143
+ "loss": 0.0286,
1144
+ "step": 805
1145
+ },
1146
+ {
1147
+ "epoch": 202.64,
1148
+ "grad_norm": 0.03490043804049492,
1149
+ "learning_rate": 5.675830469644903e-05,
1150
+ "loss": 0.0306,
1151
+ "step": 810
1152
+ },
1153
+ {
1154
+ "epoch": 203.96,
1155
+ "grad_norm": 0.03847096487879753,
1156
+ "learning_rate": 5.6471935853379155e-05,
1157
+ "loss": 0.0283,
1158
+ "step": 815
1159
+ },
1160
+ {
1161
+ "epoch": 205.0,
1162
+ "grad_norm": 0.10988269001245499,
1163
+ "learning_rate": 5.618556701030928e-05,
1164
+ "loss": 0.0278,
1165
+ "step": 820
1166
+ },
1167
+ {
1168
+ "epoch": 206.32,
1169
+ "grad_norm": 0.034018851816654205,
1170
+ "learning_rate": 5.589919816723941e-05,
1171
+ "loss": 0.03,
1172
+ "step": 825
1173
+ },
1174
+ {
1175
+ "epoch": 207.64,
1176
+ "grad_norm": 0.032927289605140686,
1177
+ "learning_rate": 5.5612829324169534e-05,
1178
+ "loss": 0.0293,
1179
+ "step": 830
1180
+ },
1181
+ {
1182
+ "epoch": 208.96,
1183
+ "grad_norm": 0.03604916110634804,
1184
+ "learning_rate": 5.532646048109966e-05,
1185
+ "loss": 0.0285,
1186
+ "step": 835
1187
+ },
1188
+ {
1189
+ "epoch": 210.0,
1190
+ "grad_norm": 0.09708557277917862,
1191
+ "learning_rate": 5.504009163802979e-05,
1192
+ "loss": 0.0305,
1193
+ "step": 840
1194
+ },
1195
+ {
1196
+ "epoch": 211.32,
1197
+ "grad_norm": 0.03745417296886444,
1198
+ "learning_rate": 5.4753722794959914e-05,
1199
+ "loss": 0.0296,
1200
+ "step": 845
1201
+ },
1202
+ {
1203
+ "epoch": 212.64,
1204
+ "grad_norm": 0.028906095772981644,
1205
+ "learning_rate": 5.4467353951890033e-05,
1206
+ "loss": 0.0277,
1207
+ "step": 850
1208
+ },
1209
+ {
1210
+ "epoch": 213.96,
1211
+ "grad_norm": 0.03228568285703659,
1212
+ "learning_rate": 5.418098510882016e-05,
1213
+ "loss": 0.0295,
1214
+ "step": 855
1215
+ },
1216
+ {
1217
+ "epoch": 215.0,
1218
+ "grad_norm": 0.1302802711725235,
1219
+ "learning_rate": 5.3894616265750286e-05,
1220
+ "loss": 0.031,
1221
+ "step": 860
1222
+ },
1223
+ {
1224
+ "epoch": 216.32,
1225
+ "grad_norm": 0.031472526490688324,
1226
+ "learning_rate": 5.360824742268041e-05,
1227
+ "loss": 0.0286,
1228
+ "step": 865
1229
+ },
1230
+ {
1231
+ "epoch": 217.64,
1232
+ "grad_norm": 0.03589686006307602,
1233
+ "learning_rate": 5.332187857961054e-05,
1234
+ "loss": 0.0308,
1235
+ "step": 870
1236
+ },
1237
+ {
1238
+ "epoch": 218.96,
1239
+ "grad_norm": 0.04117952659726143,
1240
+ "learning_rate": 5.3035509736540666e-05,
1241
+ "loss": 0.0298,
1242
+ "step": 875
1243
+ },
1244
+ {
1245
+ "epoch": 220.0,
1246
+ "grad_norm": 0.16901935636997223,
1247
+ "learning_rate": 5.274914089347079e-05,
1248
+ "loss": 0.0316,
1249
+ "step": 880
1250
+ },
1251
+ {
1252
+ "epoch": 221.32,
1253
+ "grad_norm": 0.03608705848455429,
1254
+ "learning_rate": 5.246277205040092e-05,
1255
+ "loss": 0.0297,
1256
+ "step": 885
1257
+ },
1258
+ {
1259
+ "epoch": 222.64,
1260
+ "grad_norm": 0.028423065319657326,
1261
+ "learning_rate": 5.2176403207331045e-05,
1262
+ "loss": 0.029,
1263
+ "step": 890
1264
+ },
1265
+ {
1266
+ "epoch": 223.96,
1267
+ "grad_norm": 0.03328604996204376,
1268
+ "learning_rate": 5.189003436426118e-05,
1269
+ "loss": 0.0304,
1270
+ "step": 895
1271
+ },
1272
+ {
1273
+ "epoch": 225.0,
1274
+ "grad_norm": 0.1140102967619896,
1275
+ "learning_rate": 5.1603665521191305e-05,
1276
+ "loss": 0.0295,
1277
+ "step": 900
1278
+ },
1279
+ {
1280
+ "epoch": 226.32,
1281
+ "grad_norm": 0.03379100188612938,
1282
+ "learning_rate": 5.131729667812142e-05,
1283
+ "loss": 0.0289,
1284
+ "step": 905
1285
+ },
1286
+ {
1287
+ "epoch": 227.64,
1288
+ "grad_norm": 0.03175675496459007,
1289
+ "learning_rate": 5.1030927835051544e-05,
1290
+ "loss": 0.0272,
1291
+ "step": 910
1292
+ },
1293
+ {
1294
+ "epoch": 228.96,
1295
+ "grad_norm": 0.0344826877117157,
1296
+ "learning_rate": 5.074455899198167e-05,
1297
+ "loss": 0.0308,
1298
+ "step": 915
1299
+ },
1300
+ {
1301
+ "epoch": 230.0,
1302
+ "grad_norm": 0.1841171830892563,
1303
+ "learning_rate": 5.04581901489118e-05,
1304
+ "loss": 0.0308,
1305
+ "step": 920
1306
+ },
1307
+ {
1308
+ "epoch": 231.32,
1309
+ "grad_norm": 0.03660387173295021,
1310
+ "learning_rate": 5.0171821305841924e-05,
1311
+ "loss": 0.0326,
1312
+ "step": 925
1313
+ },
1314
+ {
1315
+ "epoch": 232.64,
1316
+ "grad_norm": 0.03065328672528267,
1317
+ "learning_rate": 4.988545246277205e-05,
1318
+ "loss": 0.0295,
1319
+ "step": 930
1320
+ },
1321
+ {
1322
+ "epoch": 233.96,
1323
+ "grad_norm": 0.03993593156337738,
1324
+ "learning_rate": 4.9599083619702184e-05,
1325
+ "loss": 0.0292,
1326
+ "step": 935
1327
+ },
1328
+ {
1329
+ "epoch": 235.0,
1330
+ "grad_norm": 0.10738981515169144,
1331
+ "learning_rate": 4.931271477663231e-05,
1332
+ "loss": 0.0302,
1333
+ "step": 940
1334
+ },
1335
+ {
1336
+ "epoch": 236.32,
1337
+ "grad_norm": 0.03143048286437988,
1338
+ "learning_rate": 4.902634593356243e-05,
1339
+ "loss": 0.0278,
1340
+ "step": 945
1341
+ },
1342
+ {
1343
+ "epoch": 237.64,
1344
+ "grad_norm": 0.028968214988708496,
1345
+ "learning_rate": 4.8739977090492556e-05,
1346
+ "loss": 0.0271,
1347
+ "step": 950
1348
+ },
1349
+ {
1350
+ "epoch": 238.96,
1351
+ "grad_norm": 0.038674987852573395,
1352
+ "learning_rate": 4.845360824742268e-05,
1353
+ "loss": 0.0297,
1354
+ "step": 955
1355
+ },
1356
+ {
1357
+ "epoch": 240.0,
1358
+ "grad_norm": 0.10797161608934402,
1359
+ "learning_rate": 4.816723940435281e-05,
1360
+ "loss": 0.0278,
1361
+ "step": 960
1362
+ },
1363
+ {
1364
+ "epoch": 241.32,
1365
+ "grad_norm": 0.03592285141348839,
1366
+ "learning_rate": 4.7880870561282936e-05,
1367
+ "loss": 0.0281,
1368
+ "step": 965
1369
+ },
1370
+ {
1371
+ "epoch": 242.64,
1372
+ "grad_norm": 0.031206540763378143,
1373
+ "learning_rate": 4.7594501718213055e-05,
1374
+ "loss": 0.031,
1375
+ "step": 970
1376
+ },
1377
+ {
1378
+ "epoch": 243.96,
1379
+ "grad_norm": 0.03692101314663887,
1380
+ "learning_rate": 4.730813287514318e-05,
1381
+ "loss": 0.0276,
1382
+ "step": 975
1383
+ },
1384
+ {
1385
+ "epoch": 245.0,
1386
+ "grad_norm": 0.1415632963180542,
1387
+ "learning_rate": 4.7021764032073315e-05,
1388
+ "loss": 0.0325,
1389
+ "step": 980
1390
+ },
1391
+ {
1392
+ "epoch": 246.32,
1393
+ "grad_norm": 0.0346578024327755,
1394
+ "learning_rate": 4.673539518900344e-05,
1395
+ "loss": 0.0291,
1396
+ "step": 985
1397
+ },
1398
+ {
1399
+ "epoch": 247.64,
1400
+ "grad_norm": 0.036887165158987045,
1401
+ "learning_rate": 4.644902634593357e-05,
1402
+ "loss": 0.0279,
1403
+ "step": 990
1404
+ },
1405
+ {
1406
+ "epoch": 248.96,
1407
+ "grad_norm": 0.03107571043074131,
1408
+ "learning_rate": 4.6162657502863694e-05,
1409
+ "loss": 0.0277,
1410
+ "step": 995
1411
+ },
1412
+ {
1413
+ "epoch": 250.0,
1414
+ "grad_norm": 0.13857436180114746,
1415
+ "learning_rate": 4.5876288659793814e-05,
1416
+ "loss": 0.0287,
1417
+ "step": 1000
1418
+ },
1419
+ {
1420
+ "epoch": 251.32,
1421
+ "grad_norm": 0.03328908607363701,
1422
+ "learning_rate": 4.558991981672394e-05,
1423
+ "loss": 0.0275,
1424
+ "step": 1005
1425
+ },
1426
+ {
1427
+ "epoch": 252.64,
1428
+ "grad_norm": 0.03218206763267517,
1429
+ "learning_rate": 4.530355097365407e-05,
1430
+ "loss": 0.0295,
1431
+ "step": 1010
1432
+ },
1433
+ {
1434
+ "epoch": 253.96,
1435
+ "grad_norm": 0.030677294358611107,
1436
+ "learning_rate": 4.5017182130584194e-05,
1437
+ "loss": 0.0288,
1438
+ "step": 1015
1439
+ },
1440
+ {
1441
+ "epoch": 255.0,
1442
+ "grad_norm": 0.08906098455190659,
1443
+ "learning_rate": 4.473081328751432e-05,
1444
+ "loss": 0.0283,
1445
+ "step": 1020
1446
+ },
1447
+ {
1448
+ "epoch": 256.32,
1449
+ "grad_norm": 0.0315646268427372,
1450
+ "learning_rate": 4.4444444444444447e-05,
1451
+ "loss": 0.0292,
1452
+ "step": 1025
1453
+ },
1454
+ {
1455
+ "epoch": 257.64,
1456
+ "grad_norm": 0.0322076752781868,
1457
+ "learning_rate": 4.415807560137457e-05,
1458
+ "loss": 0.0286,
1459
+ "step": 1030
1460
+ },
1461
+ {
1462
+ "epoch": 258.96,
1463
+ "grad_norm": 0.03561684116721153,
1464
+ "learning_rate": 4.38717067583047e-05,
1465
+ "loss": 0.0266,
1466
+ "step": 1035
1467
+ },
1468
+ {
1469
+ "epoch": 260.0,
1470
+ "grad_norm": 0.1383010447025299,
1471
+ "learning_rate": 4.3585337915234826e-05,
1472
+ "loss": 0.0291,
1473
+ "step": 1040
1474
+ },
1475
+ {
1476
+ "epoch": 261.32,
1477
+ "grad_norm": 0.02982248179614544,
1478
+ "learning_rate": 4.329896907216495e-05,
1479
+ "loss": 0.0281,
1480
+ "step": 1045
1481
+ },
1482
+ {
1483
+ "epoch": 262.64,
1484
+ "grad_norm": 0.03563191369175911,
1485
+ "learning_rate": 4.301260022909508e-05,
1486
+ "loss": 0.0285,
1487
+ "step": 1050
1488
+ },
1489
+ {
1490
+ "epoch": 263.96,
1491
+ "grad_norm": 0.03730940818786621,
1492
+ "learning_rate": 4.27262313860252e-05,
1493
+ "loss": 0.0298,
1494
+ "step": 1055
1495
+ },
1496
+ {
1497
+ "epoch": 265.0,
1498
+ "grad_norm": 0.12043489515781403,
1499
+ "learning_rate": 4.2439862542955325e-05,
1500
+ "loss": 0.029,
1501
+ "step": 1060
1502
+ },
1503
+ {
1504
+ "epoch": 266.32,
1505
+ "grad_norm": 0.03577538579702377,
1506
+ "learning_rate": 4.215349369988545e-05,
1507
+ "loss": 0.0304,
1508
+ "step": 1065
1509
+ },
1510
+ {
1511
+ "epoch": 267.64,
1512
+ "grad_norm": 0.035051047801971436,
1513
+ "learning_rate": 4.1867124856815585e-05,
1514
+ "loss": 0.0292,
1515
+ "step": 1070
1516
+ },
1517
+ {
1518
+ "epoch": 268.96,
1519
+ "grad_norm": 0.03524423763155937,
1520
+ "learning_rate": 4.158075601374571e-05,
1521
+ "loss": 0.029,
1522
+ "step": 1075
1523
+ },
1524
+ {
1525
+ "epoch": 270.0,
1526
+ "grad_norm": 0.11722230911254883,
1527
+ "learning_rate": 4.129438717067583e-05,
1528
+ "loss": 0.0286,
1529
+ "step": 1080
1530
+ },
1531
+ {
1532
+ "epoch": 271.32,
1533
+ "grad_norm": 0.0350823737680912,
1534
+ "learning_rate": 4.100801832760596e-05,
1535
+ "loss": 0.0295,
1536
+ "step": 1085
1537
+ },
1538
+ {
1539
+ "epoch": 272.64,
1540
+ "grad_norm": 0.03372941538691521,
1541
+ "learning_rate": 4.0721649484536084e-05,
1542
+ "loss": 0.0288,
1543
+ "step": 1090
1544
+ },
1545
+ {
1546
+ "epoch": 273.96,
1547
+ "grad_norm": 0.028644917532801628,
1548
+ "learning_rate": 4.043528064146621e-05,
1549
+ "loss": 0.0326,
1550
+ "step": 1095
1551
+ },
1552
+ {
1553
+ "epoch": 275.0,
1554
+ "grad_norm": 0.10958810150623322,
1555
+ "learning_rate": 4.014891179839634e-05,
1556
+ "loss": 0.0293,
1557
+ "step": 1100
1558
+ },
1559
+ {
1560
+ "epoch": 276.32,
1561
+ "grad_norm": 0.03524491935968399,
1562
+ "learning_rate": 3.9862542955326463e-05,
1563
+ "loss": 0.0289,
1564
+ "step": 1105
1565
+ },
1566
+ {
1567
+ "epoch": 277.64,
1568
+ "grad_norm": 0.028043361380696297,
1569
+ "learning_rate": 3.957617411225659e-05,
1570
+ "loss": 0.0292,
1571
+ "step": 1110
1572
+ },
1573
+ {
1574
+ "epoch": 278.96,
1575
+ "grad_norm": 0.03574656322598457,
1576
+ "learning_rate": 3.9289805269186716e-05,
1577
+ "loss": 0.028,
1578
+ "step": 1115
1579
+ },
1580
+ {
1581
+ "epoch": 280.0,
1582
+ "grad_norm": 0.12416456639766693,
1583
+ "learning_rate": 3.900343642611684e-05,
1584
+ "loss": 0.0278,
1585
+ "step": 1120
1586
+ },
1587
+ {
1588
+ "epoch": 281.32,
1589
+ "grad_norm": 0.02984347939491272,
1590
+ "learning_rate": 3.871706758304697e-05,
1591
+ "loss": 0.0324,
1592
+ "step": 1125
1593
+ },
1594
+ {
1595
+ "epoch": 282.64,
1596
+ "grad_norm": 0.03649289906024933,
1597
+ "learning_rate": 3.8430698739977096e-05,
1598
+ "loss": 0.0281,
1599
+ "step": 1130
1600
+ },
1601
+ {
1602
+ "epoch": 283.96,
1603
+ "grad_norm": 0.03943822532892227,
1604
+ "learning_rate": 3.8144329896907216e-05,
1605
+ "loss": 0.0268,
1606
+ "step": 1135
1607
+ },
1608
+ {
1609
+ "epoch": 285.0,
1610
+ "grad_norm": 0.14334431290626526,
1611
+ "learning_rate": 3.785796105383734e-05,
1612
+ "loss": 0.0305,
1613
+ "step": 1140
1614
+ },
1615
+ {
1616
+ "epoch": 286.32,
1617
+ "grad_norm": 0.030261779204010963,
1618
+ "learning_rate": 3.757159221076747e-05,
1619
+ "loss": 0.028,
1620
+ "step": 1145
1621
+ },
1622
+ {
1623
+ "epoch": 287.64,
1624
+ "grad_norm": 0.03134704381227493,
1625
+ "learning_rate": 3.7285223367697595e-05,
1626
+ "loss": 0.0267,
1627
+ "step": 1150
1628
+ },
1629
+ {
1630
+ "epoch": 288.96,
1631
+ "grad_norm": 0.031728796660900116,
1632
+ "learning_rate": 3.699885452462772e-05,
1633
+ "loss": 0.028,
1634
+ "step": 1155
1635
+ },
1636
+ {
1637
+ "epoch": 290.0,
1638
+ "grad_norm": 0.15487806499004364,
1639
+ "learning_rate": 3.671248568155785e-05,
1640
+ "loss": 0.031,
1641
+ "step": 1160
1642
+ },
1643
+ {
1644
+ "epoch": 291.32,
1645
+ "grad_norm": 0.033745523542165756,
1646
+ "learning_rate": 3.6426116838487974e-05,
1647
+ "loss": 0.0298,
1648
+ "step": 1165
1649
+ },
1650
+ {
1651
+ "epoch": 292.64,
1652
+ "grad_norm": 0.026857230812311172,
1653
+ "learning_rate": 3.61397479954181e-05,
1654
+ "loss": 0.0265,
1655
+ "step": 1170
1656
+ },
1657
+ {
1658
+ "epoch": 293.96,
1659
+ "grad_norm": 0.03467594459652901,
1660
+ "learning_rate": 3.585337915234823e-05,
1661
+ "loss": 0.0291,
1662
+ "step": 1175
1663
+ },
1664
+ {
1665
+ "epoch": 295.0,
1666
+ "grad_norm": 0.1255461573600769,
1667
+ "learning_rate": 3.5567010309278354e-05,
1668
+ "loss": 0.0304,
1669
+ "step": 1180
1670
+ },
1671
+ {
1672
+ "epoch": 296.32,
1673
+ "grad_norm": 0.03569836914539337,
1674
+ "learning_rate": 3.528064146620848e-05,
1675
+ "loss": 0.0275,
1676
+ "step": 1185
1677
+ },
1678
+ {
1679
+ "epoch": 297.64,
1680
+ "grad_norm": 0.03207559511065483,
1681
+ "learning_rate": 3.49942726231386e-05,
1682
+ "loss": 0.0288,
1683
+ "step": 1190
1684
+ },
1685
+ {
1686
+ "epoch": 298.96,
1687
+ "grad_norm": 0.03445427492260933,
1688
+ "learning_rate": 3.4707903780068726e-05,
1689
+ "loss": 0.0274,
1690
+ "step": 1195
1691
+ },
1692
+ {
1693
+ "epoch": 300.0,
1694
+ "grad_norm": 0.11089900881052017,
1695
+ "learning_rate": 3.442153493699885e-05,
1696
+ "loss": 0.0268,
1697
+ "step": 1200
1698
+ },
1699
+ {
1700
+ "epoch": 301.32,
1701
+ "grad_norm": 0.030901776626706123,
1702
+ "learning_rate": 3.4135166093928986e-05,
1703
+ "loss": 0.0285,
1704
+ "step": 1205
1705
+ },
1706
+ {
1707
+ "epoch": 302.64,
1708
+ "grad_norm": 0.03404972329735756,
1709
+ "learning_rate": 3.384879725085911e-05,
1710
+ "loss": 0.0282,
1711
+ "step": 1210
1712
+ },
1713
+ {
1714
+ "epoch": 303.96,
1715
+ "grad_norm": 0.03297970071434975,
1716
+ "learning_rate": 3.356242840778923e-05,
1717
+ "loss": 0.0289,
1718
+ "step": 1215
1719
+ },
1720
+ {
1721
+ "epoch": 305.0,
1722
+ "grad_norm": 0.08513491600751877,
1723
+ "learning_rate": 3.327605956471936e-05,
1724
+ "loss": 0.0271,
1725
+ "step": 1220
1726
+ },
1727
+ {
1728
+ "epoch": 306.32,
1729
+ "grad_norm": 0.02815438061952591,
1730
+ "learning_rate": 3.2989690721649485e-05,
1731
+ "loss": 0.028,
1732
+ "step": 1225
1733
+ },
1734
+ {
1735
+ "epoch": 307.64,
1736
+ "grad_norm": 0.031231220811605453,
1737
+ "learning_rate": 3.270332187857961e-05,
1738
+ "loss": 0.0308,
1739
+ "step": 1230
1740
+ },
1741
+ {
1742
+ "epoch": 308.96,
1743
+ "grad_norm": 0.03579903766512871,
1744
+ "learning_rate": 3.241695303550974e-05,
1745
+ "loss": 0.0256,
1746
+ "step": 1235
1747
+ },
1748
+ {
1749
+ "epoch": 310.0,
1750
+ "grad_norm": 0.1284906268119812,
1751
+ "learning_rate": 3.2130584192439865e-05,
1752
+ "loss": 0.029,
1753
+ "step": 1240
1754
+ },
1755
+ {
1756
+ "epoch": 311.32,
1757
+ "grad_norm": 0.02885010838508606,
1758
+ "learning_rate": 3.184421534936999e-05,
1759
+ "loss": 0.0267,
1760
+ "step": 1245
1761
+ },
1762
+ {
1763
+ "epoch": 312.64,
1764
+ "grad_norm": 0.040551669895648956,
1765
+ "learning_rate": 3.155784650630012e-05,
1766
+ "loss": 0.0277,
1767
+ "step": 1250
1768
+ },
1769
+ {
1770
+ "epoch": 313.96,
1771
+ "grad_norm": 0.024676747620105743,
1772
+ "learning_rate": 3.1271477663230244e-05,
1773
+ "loss": 0.0296,
1774
+ "step": 1255
1775
+ },
1776
+ {
1777
+ "epoch": 315.0,
1778
+ "grad_norm": 0.1250019669532776,
1779
+ "learning_rate": 3.098510882016037e-05,
1780
+ "loss": 0.0295,
1781
+ "step": 1260
1782
+ },
1783
+ {
1784
+ "epoch": 316.32,
1785
+ "grad_norm": 0.03083103522658348,
1786
+ "learning_rate": 3.06987399770905e-05,
1787
+ "loss": 0.0286,
1788
+ "step": 1265
1789
+ },
1790
+ {
1791
+ "epoch": 317.64,
1792
+ "grad_norm": 0.03254910558462143,
1793
+ "learning_rate": 3.0412371134020617e-05,
1794
+ "loss": 0.0277,
1795
+ "step": 1270
1796
+ },
1797
+ {
1798
+ "epoch": 318.96,
1799
+ "grad_norm": 0.028430206701159477,
1800
+ "learning_rate": 3.0126002290950743e-05,
1801
+ "loss": 0.0268,
1802
+ "step": 1275
1803
+ },
1804
+ {
1805
+ "epoch": 320.0,
1806
+ "grad_norm": 0.10449621081352234,
1807
+ "learning_rate": 2.983963344788087e-05,
1808
+ "loss": 0.0279,
1809
+ "step": 1280
1810
+ },
1811
+ {
1812
+ "epoch": 321.32,
1813
+ "grad_norm": 0.03180396929383278,
1814
+ "learning_rate": 2.9553264604811e-05,
1815
+ "loss": 0.0287,
1816
+ "step": 1285
1817
+ },
1818
+ {
1819
+ "epoch": 322.64,
1820
+ "grad_norm": 0.03462441638112068,
1821
+ "learning_rate": 2.9266895761741126e-05,
1822
+ "loss": 0.0267,
1823
+ "step": 1290
1824
+ },
1825
+ {
1826
+ "epoch": 323.96,
1827
+ "grad_norm": 0.032813649624586105,
1828
+ "learning_rate": 2.8980526918671253e-05,
1829
+ "loss": 0.0272,
1830
+ "step": 1295
1831
+ },
1832
+ {
1833
+ "epoch": 325.0,
1834
+ "grad_norm": 0.11716829985380173,
1835
+ "learning_rate": 2.8694158075601372e-05,
1836
+ "loss": 0.0301,
1837
+ "step": 1300
1838
+ },
1839
+ {
1840
+ "epoch": 326.32,
1841
+ "grad_norm": 0.0283154658973217,
1842
+ "learning_rate": 2.8407789232531502e-05,
1843
+ "loss": 0.0297,
1844
+ "step": 1305
1845
+ },
1846
+ {
1847
+ "epoch": 327.64,
1848
+ "grad_norm": 0.037692759186029434,
1849
+ "learning_rate": 2.812142038946163e-05,
1850
+ "loss": 0.0279,
1851
+ "step": 1310
1852
+ },
1853
+ {
1854
+ "epoch": 328.96,
1855
+ "grad_norm": 0.03138533979654312,
1856
+ "learning_rate": 2.7835051546391755e-05,
1857
+ "loss": 0.0272,
1858
+ "step": 1315
1859
+ },
1860
+ {
1861
+ "epoch": 330.0,
1862
+ "grad_norm": 0.07045339792966843,
1863
+ "learning_rate": 2.754868270332188e-05,
1864
+ "loss": 0.0268,
1865
+ "step": 1320
1866
+ },
1867
+ {
1868
+ "epoch": 331.32,
1869
+ "grad_norm": 0.029422452673316002,
1870
+ "learning_rate": 2.7262313860252005e-05,
1871
+ "loss": 0.0285,
1872
+ "step": 1325
1873
+ },
1874
+ {
1875
+ "epoch": 332.64,
1876
+ "grad_norm": 0.025272730737924576,
1877
+ "learning_rate": 2.697594501718213e-05,
1878
+ "loss": 0.027,
1879
+ "step": 1330
1880
+ },
1881
+ {
1882
+ "epoch": 333.96,
1883
+ "grad_norm": 0.03468950465321541,
1884
+ "learning_rate": 2.6689576174112258e-05,
1885
+ "loss": 0.0281,
1886
+ "step": 1335
1887
+ },
1888
+ {
1889
+ "epoch": 335.0,
1890
+ "grad_norm": 0.1138090044260025,
1891
+ "learning_rate": 2.6403207331042384e-05,
1892
+ "loss": 0.0283,
1893
+ "step": 1340
1894
+ },
1895
+ {
1896
+ "epoch": 336.32,
1897
+ "grad_norm": 0.0285523422062397,
1898
+ "learning_rate": 2.611683848797251e-05,
1899
+ "loss": 0.0292,
1900
+ "step": 1345
1901
+ },
1902
+ {
1903
+ "epoch": 337.64,
1904
+ "grad_norm": 0.034624941647052765,
1905
+ "learning_rate": 2.5830469644902637e-05,
1906
+ "loss": 0.0288,
1907
+ "step": 1350
1908
+ },
1909
+ {
1910
+ "epoch": 338.96,
1911
+ "grad_norm": 0.03252566233277321,
1912
+ "learning_rate": 2.554410080183276e-05,
1913
+ "loss": 0.0262,
1914
+ "step": 1355
1915
+ },
1916
+ {
1917
+ "epoch": 340.0,
1918
+ "grad_norm": 0.10238504409790039,
1919
+ "learning_rate": 2.5257731958762887e-05,
1920
+ "loss": 0.0278,
1921
+ "step": 1360
1922
+ },
1923
+ {
1924
+ "epoch": 341.32,
1925
+ "grad_norm": 0.028706086799502373,
1926
+ "learning_rate": 2.4971363115693013e-05,
1927
+ "loss": 0.0287,
1928
+ "step": 1365
1929
+ },
1930
+ {
1931
+ "epoch": 342.64,
1932
+ "grad_norm": 0.03616653010249138,
1933
+ "learning_rate": 2.468499427262314e-05,
1934
+ "loss": 0.0288,
1935
+ "step": 1370
1936
+ },
1937
+ {
1938
+ "epoch": 343.96,
1939
+ "grad_norm": 0.033927544951438904,
1940
+ "learning_rate": 2.4398625429553266e-05,
1941
+ "loss": 0.0282,
1942
+ "step": 1375
1943
+ },
1944
+ {
1945
+ "epoch": 345.0,
1946
+ "grad_norm": 0.12410403043031693,
1947
+ "learning_rate": 2.4112256586483393e-05,
1948
+ "loss": 0.0289,
1949
+ "step": 1380
1950
+ },
1951
+ {
1952
+ "epoch": 346.32,
1953
+ "grad_norm": 0.033267851918935776,
1954
+ "learning_rate": 2.3825887743413516e-05,
1955
+ "loss": 0.0285,
1956
+ "step": 1385
1957
+ },
1958
+ {
1959
+ "epoch": 347.64,
1960
+ "grad_norm": 0.028466830030083656,
1961
+ "learning_rate": 2.3539518900343642e-05,
1962
+ "loss": 0.0274,
1963
+ "step": 1390
1964
+ },
1965
+ {
1966
+ "epoch": 348.96,
1967
+ "grad_norm": 0.0284014530479908,
1968
+ "learning_rate": 2.3253150057273772e-05,
1969
+ "loss": 0.0289,
1970
+ "step": 1395
1971
+ },
1972
+ {
1973
+ "epoch": 350.0,
1974
+ "grad_norm": 0.10417843610048294,
1975
+ "learning_rate": 2.2966781214203895e-05,
1976
+ "loss": 0.0288,
1977
+ "step": 1400
1978
+ },
1979
+ {
1980
+ "epoch": 351.32,
1981
+ "grad_norm": 0.02494928613305092,
1982
+ "learning_rate": 2.268041237113402e-05,
1983
+ "loss": 0.028,
1984
+ "step": 1405
1985
+ },
1986
+ {
1987
+ "epoch": 352.64,
1988
+ "grad_norm": 0.027743646875023842,
1989
+ "learning_rate": 2.2394043528064148e-05,
1990
+ "loss": 0.0288,
1991
+ "step": 1410
1992
+ },
1993
+ {
1994
+ "epoch": 353.96,
1995
+ "grad_norm": 0.037426408380270004,
1996
+ "learning_rate": 2.210767468499427e-05,
1997
+ "loss": 0.0268,
1998
+ "step": 1415
1999
+ },
2000
+ {
2001
+ "epoch": 355.0,
2002
+ "grad_norm": 0.06390511989593506,
2003
+ "learning_rate": 2.18213058419244e-05,
2004
+ "loss": 0.0281,
2005
+ "step": 1420
2006
+ },
2007
+ {
2008
+ "epoch": 356.32,
2009
+ "grad_norm": 0.02651941403746605,
2010
+ "learning_rate": 2.1534936998854528e-05,
2011
+ "loss": 0.0267,
2012
+ "step": 1425
2013
+ },
2014
+ {
2015
+ "epoch": 357.64,
2016
+ "grad_norm": 0.027626991271972656,
2017
+ "learning_rate": 2.124856815578465e-05,
2018
+ "loss": 0.0278,
2019
+ "step": 1430
2020
+ },
2021
+ {
2022
+ "epoch": 358.96,
2023
+ "grad_norm": 0.0289900004863739,
2024
+ "learning_rate": 2.0962199312714777e-05,
2025
+ "loss": 0.0289,
2026
+ "step": 1435
2027
+ },
2028
+ {
2029
+ "epoch": 360.0,
2030
+ "grad_norm": 0.08335373550653458,
2031
+ "learning_rate": 2.0675830469644904e-05,
2032
+ "loss": 0.0254,
2033
+ "step": 1440
2034
+ },
2035
+ {
2036
+ "epoch": 361.32,
2037
+ "grad_norm": 0.02882411703467369,
2038
+ "learning_rate": 2.038946162657503e-05,
2039
+ "loss": 0.0276,
2040
+ "step": 1445
2041
+ },
2042
+ {
2043
+ "epoch": 362.64,
2044
+ "grad_norm": 0.029498135671019554,
2045
+ "learning_rate": 2.0103092783505157e-05,
2046
+ "loss": 0.0273,
2047
+ "step": 1450
2048
+ },
2049
+ {
2050
+ "epoch": 363.96,
2051
+ "grad_norm": 0.030006349086761475,
2052
+ "learning_rate": 1.981672394043528e-05,
2053
+ "loss": 0.0266,
2054
+ "step": 1455
2055
+ },
2056
+ {
2057
+ "epoch": 365.0,
2058
+ "grad_norm": 0.08131309598684311,
2059
+ "learning_rate": 1.9530355097365406e-05,
2060
+ "loss": 0.0272,
2061
+ "step": 1460
2062
+ },
2063
+ {
2064
+ "epoch": 366.32,
2065
+ "grad_norm": 0.028547124937176704,
2066
+ "learning_rate": 1.9243986254295536e-05,
2067
+ "loss": 0.0267,
2068
+ "step": 1465
2069
+ },
2070
+ {
2071
+ "epoch": 367.64,
2072
+ "grad_norm": 0.027747539803385735,
2073
+ "learning_rate": 1.895761741122566e-05,
2074
+ "loss": 0.0273,
2075
+ "step": 1470
2076
+ },
2077
+ {
2078
+ "epoch": 368.96,
2079
+ "grad_norm": 0.032853253185749054,
2080
+ "learning_rate": 1.8671248568155786e-05,
2081
+ "loss": 0.0293,
2082
+ "step": 1475
2083
+ },
2084
+ {
2085
+ "epoch": 370.0,
2086
+ "grad_norm": 0.10667946934700012,
2087
+ "learning_rate": 1.8384879725085912e-05,
2088
+ "loss": 0.027,
2089
+ "step": 1480
2090
+ },
2091
+ {
2092
+ "epoch": 371.32,
2093
+ "grad_norm": 0.027019130066037178,
2094
+ "learning_rate": 1.809851088201604e-05,
2095
+ "loss": 0.0268,
2096
+ "step": 1485
2097
+ },
2098
+ {
2099
+ "epoch": 372.64,
2100
+ "grad_norm": 0.02968420460820198,
2101
+ "learning_rate": 1.7812142038946165e-05,
2102
+ "loss": 0.0303,
2103
+ "step": 1490
2104
+ },
2105
+ {
2106
+ "epoch": 373.96,
2107
+ "grad_norm": 0.03141555190086365,
2108
+ "learning_rate": 1.7525773195876288e-05,
2109
+ "loss": 0.0285,
2110
+ "step": 1495
2111
+ },
2112
+ {
2113
+ "epoch": 375.0,
2114
+ "grad_norm": 0.1068948432803154,
2115
+ "learning_rate": 1.7239404352806415e-05,
2116
+ "loss": 0.0293,
2117
+ "step": 1500
2118
+ }
2119
+ ],
2120
+ "logging_steps": 5,
2121
+ "max_steps": 1800,
2122
+ "num_input_tokens_seen": 0,
2123
+ "num_train_epochs": 450,
2124
+ "save_steps": 300,
2125
+ "stateful_callbacks": {
2126
+ "TrainerControl": {
2127
+ "args": {
2128
+ "should_epoch_stop": false,
2129
+ "should_evaluate": false,
2130
+ "should_log": false,
2131
+ "should_save": true,
2132
+ "should_training_stop": false
2133
+ },
2134
+ "attributes": {}
2135
+ }
2136
+ },
2137
+ "total_flos": 1.275678425088e+18,
2138
+ "train_batch_size": 2,
2139
+ "trial_name": null,
2140
+ "trial_params": null
2141
+ }
Mu-Math/group_01/checkpoints/checkpoint-1500/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Mu-Math/group_01/checkpoints/checkpoint-1800/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/不冻结Qwen训练/models/Qwen2.5-1.5B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
Mu-Math/group_01/checkpoints/checkpoint-1800/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/\u4e0d\u51bb\u7ed3Qwen\u8bad\u7ec3/models/Qwen2.5-1.5B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 128,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "down_proj",
24
+ "up_proj",
25
+ "gate_proj",
26
+ "q_proj",
27
+ "o_proj",
28
+ "v_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
Mu-Math/group_01/checkpoints/checkpoint-1800/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
Mu-Math/group_01/checkpoints/checkpoint-1800/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
Mu-Math/group_01/checkpoints/checkpoint-1800/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
Mu-Math/group_01/checkpoints/checkpoint-1800/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
Mu-Math/group_01/checkpoints/checkpoint-1800/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
Mu-Math/group_01/checkpoints/checkpoint-1800/trainer_state.json ADDED
@@ -0,0 +1,2561 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 450.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1800,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.32,
14
+ "grad_norm": 11.867908477783203,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.9204,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 1.32,
21
+ "grad_norm": 7.492858409881592,
22
+ "learning_rate": 7.4074074074074075e-06,
23
+ "loss": 1.8831,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 2.64,
28
+ "grad_norm": 3.9426615238189697,
29
+ "learning_rate": 1.6666666666666667e-05,
30
+ "loss": 1.6453,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 3.96,
35
+ "grad_norm": 1.769984483718872,
36
+ "learning_rate": 2.5925925925925925e-05,
37
+ "loss": 1.2506,
38
+ "step": 15
39
+ },
40
+ {
41
+ "epoch": 5.0,
42
+ "grad_norm": 1.108256220817566,
43
+ "learning_rate": 3.518518518518519e-05,
44
+ "loss": 1.0012,
45
+ "step": 20
46
+ },
47
+ {
48
+ "epoch": 6.32,
49
+ "grad_norm": 0.5219796299934387,
50
+ "learning_rate": 4.4444444444444447e-05,
51
+ "loss": 0.8034,
52
+ "step": 25
53
+ },
54
+ {
55
+ "epoch": 7.64,
56
+ "grad_norm": 0.6449305415153503,
57
+ "learning_rate": 5.370370370370371e-05,
58
+ "loss": 0.6539,
59
+ "step": 30
60
+ },
61
+ {
62
+ "epoch": 8.96,
63
+ "grad_norm": 0.580233633518219,
64
+ "learning_rate": 6.296296296296296e-05,
65
+ "loss": 0.5474,
66
+ "step": 35
67
+ },
68
+ {
69
+ "epoch": 10.0,
70
+ "grad_norm": 1.5570186376571655,
71
+ "learning_rate": 7.222222222222222e-05,
72
+ "loss": 0.4811,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 11.32,
77
+ "grad_norm": 0.5841688513755798,
78
+ "learning_rate": 8.148148148148148e-05,
79
+ "loss": 0.3477,
80
+ "step": 45
81
+ },
82
+ {
83
+ "epoch": 12.64,
84
+ "grad_norm": 0.7968279719352722,
85
+ "learning_rate": 9.074074074074075e-05,
86
+ "loss": 0.2089,
87
+ "step": 50
88
+ },
89
+ {
90
+ "epoch": 13.96,
91
+ "grad_norm": 0.8396451473236084,
92
+ "learning_rate": 0.0001,
93
+ "loss": 0.1357,
94
+ "step": 55
95
+ },
96
+ {
97
+ "epoch": 15.0,
98
+ "grad_norm": 2.7755286693573,
99
+ "learning_rate": 9.971363115693013e-05,
100
+ "loss": 0.1235,
101
+ "step": 60
102
+ },
103
+ {
104
+ "epoch": 16.32,
105
+ "grad_norm": 0.6953228116035461,
106
+ "learning_rate": 9.942726231386026e-05,
107
+ "loss": 0.0755,
108
+ "step": 65
109
+ },
110
+ {
111
+ "epoch": 17.64,
112
+ "grad_norm": 1.1248857975006104,
113
+ "learning_rate": 9.914089347079038e-05,
114
+ "loss": 0.0546,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 18.96,
119
+ "grad_norm": 0.5247378945350647,
120
+ "learning_rate": 9.885452462772051e-05,
121
+ "loss": 0.0602,
122
+ "step": 75
123
+ },
124
+ {
125
+ "epoch": 20.0,
126
+ "grad_norm": 1.827890157699585,
127
+ "learning_rate": 9.856815578465064e-05,
128
+ "loss": 0.0501,
129
+ "step": 80
130
+ },
131
+ {
132
+ "epoch": 21.32,
133
+ "grad_norm": 0.23602119088172913,
134
+ "learning_rate": 9.828178694158075e-05,
135
+ "loss": 0.0443,
136
+ "step": 85
137
+ },
138
+ {
139
+ "epoch": 22.64,
140
+ "grad_norm": 0.2811133861541748,
141
+ "learning_rate": 9.799541809851088e-05,
142
+ "loss": 0.0448,
143
+ "step": 90
144
+ },
145
+ {
146
+ "epoch": 23.96,
147
+ "grad_norm": 0.29150959849357605,
148
+ "learning_rate": 9.7709049255441e-05,
149
+ "loss": 0.0426,
150
+ "step": 95
151
+ },
152
+ {
153
+ "epoch": 25.0,
154
+ "grad_norm": 1.4590457677841187,
155
+ "learning_rate": 9.742268041237114e-05,
156
+ "loss": 0.04,
157
+ "step": 100
158
+ },
159
+ {
160
+ "epoch": 26.32,
161
+ "grad_norm": 0.15335209667682648,
162
+ "learning_rate": 9.713631156930127e-05,
163
+ "loss": 0.0374,
164
+ "step": 105
165
+ },
166
+ {
167
+ "epoch": 27.64,
168
+ "grad_norm": 0.3241201639175415,
169
+ "learning_rate": 9.68499427262314e-05,
170
+ "loss": 0.0378,
171
+ "step": 110
172
+ },
173
+ {
174
+ "epoch": 28.96,
175
+ "grad_norm": 0.18619631230831146,
176
+ "learning_rate": 9.656357388316152e-05,
177
+ "loss": 0.0374,
178
+ "step": 115
179
+ },
180
+ {
181
+ "epoch": 30.0,
182
+ "grad_norm": 0.4512801170349121,
183
+ "learning_rate": 9.627720504009165e-05,
184
+ "loss": 0.0342,
185
+ "step": 120
186
+ },
187
+ {
188
+ "epoch": 31.32,
189
+ "grad_norm": 0.21706914901733398,
190
+ "learning_rate": 9.599083619702178e-05,
191
+ "loss": 0.0369,
192
+ "step": 125
193
+ },
194
+ {
195
+ "epoch": 32.64,
196
+ "grad_norm": 0.42762166261672974,
197
+ "learning_rate": 9.57044673539519e-05,
198
+ "loss": 0.0355,
199
+ "step": 130
200
+ },
201
+ {
202
+ "epoch": 33.96,
203
+ "grad_norm": 0.1793977916240692,
204
+ "learning_rate": 9.541809851088203e-05,
205
+ "loss": 0.0347,
206
+ "step": 135
207
+ },
208
+ {
209
+ "epoch": 35.0,
210
+ "grad_norm": 1.866305947303772,
211
+ "learning_rate": 9.513172966781214e-05,
212
+ "loss": 0.0368,
213
+ "step": 140
214
+ },
215
+ {
216
+ "epoch": 36.32,
217
+ "grad_norm": 0.09879657626152039,
218
+ "learning_rate": 9.484536082474227e-05,
219
+ "loss": 0.0347,
220
+ "step": 145
221
+ },
222
+ {
223
+ "epoch": 37.64,
224
+ "grad_norm": 0.09229481220245361,
225
+ "learning_rate": 9.45589919816724e-05,
226
+ "loss": 0.0338,
227
+ "step": 150
228
+ },
229
+ {
230
+ "epoch": 38.96,
231
+ "grad_norm": 0.11409584432840347,
232
+ "learning_rate": 9.427262313860252e-05,
233
+ "loss": 0.0339,
234
+ "step": 155
235
+ },
236
+ {
237
+ "epoch": 40.0,
238
+ "grad_norm": 0.35678204894065857,
239
+ "learning_rate": 9.398625429553265e-05,
240
+ "loss": 0.0347,
241
+ "step": 160
242
+ },
243
+ {
244
+ "epoch": 41.32,
245
+ "grad_norm": 0.08212767541408539,
246
+ "learning_rate": 9.369988545246277e-05,
247
+ "loss": 0.0345,
248
+ "step": 165
249
+ },
250
+ {
251
+ "epoch": 42.64,
252
+ "grad_norm": 0.07271627336740494,
253
+ "learning_rate": 9.34135166093929e-05,
254
+ "loss": 0.0305,
255
+ "step": 170
256
+ },
257
+ {
258
+ "epoch": 43.96,
259
+ "grad_norm": 0.23211534321308136,
260
+ "learning_rate": 9.312714776632303e-05,
261
+ "loss": 0.0321,
262
+ "step": 175
263
+ },
264
+ {
265
+ "epoch": 45.0,
266
+ "grad_norm": 0.23425568640232086,
267
+ "learning_rate": 9.284077892325315e-05,
268
+ "loss": 0.0334,
269
+ "step": 180
270
+ },
271
+ {
272
+ "epoch": 46.32,
273
+ "grad_norm": 0.07825004309415817,
274
+ "learning_rate": 9.255441008018328e-05,
275
+ "loss": 0.0349,
276
+ "step": 185
277
+ },
278
+ {
279
+ "epoch": 47.64,
280
+ "grad_norm": 0.06621824949979782,
281
+ "learning_rate": 9.22680412371134e-05,
282
+ "loss": 0.0302,
283
+ "step": 190
284
+ },
285
+ {
286
+ "epoch": 48.96,
287
+ "grad_norm": 0.0967830941081047,
288
+ "learning_rate": 9.198167239404353e-05,
289
+ "loss": 0.0316,
290
+ "step": 195
291
+ },
292
+ {
293
+ "epoch": 50.0,
294
+ "grad_norm": 0.39718347787857056,
295
+ "learning_rate": 9.169530355097366e-05,
296
+ "loss": 0.0307,
297
+ "step": 200
298
+ },
299
+ {
300
+ "epoch": 51.32,
301
+ "grad_norm": 0.06881817430257797,
302
+ "learning_rate": 9.140893470790379e-05,
303
+ "loss": 0.0291,
304
+ "step": 205
305
+ },
306
+ {
307
+ "epoch": 52.64,
308
+ "grad_norm": 0.07241260260343552,
309
+ "learning_rate": 9.112256586483391e-05,
310
+ "loss": 0.032,
311
+ "step": 210
312
+ },
313
+ {
314
+ "epoch": 53.96,
315
+ "grad_norm": 0.08191649615764618,
316
+ "learning_rate": 9.083619702176404e-05,
317
+ "loss": 0.0293,
318
+ "step": 215
319
+ },
320
+ {
321
+ "epoch": 55.0,
322
+ "grad_norm": 0.20381148159503937,
323
+ "learning_rate": 9.054982817869416e-05,
324
+ "loss": 0.033,
325
+ "step": 220
326
+ },
327
+ {
328
+ "epoch": 56.32,
329
+ "grad_norm": 0.0765785425901413,
330
+ "learning_rate": 9.026345933562429e-05,
331
+ "loss": 0.0323,
332
+ "step": 225
333
+ },
334
+ {
335
+ "epoch": 57.64,
336
+ "grad_norm": 0.0698801577091217,
337
+ "learning_rate": 8.997709049255442e-05,
338
+ "loss": 0.0324,
339
+ "step": 230
340
+ },
341
+ {
342
+ "epoch": 58.96,
343
+ "grad_norm": 0.08089473098516464,
344
+ "learning_rate": 8.969072164948454e-05,
345
+ "loss": 0.0314,
346
+ "step": 235
347
+ },
348
+ {
349
+ "epoch": 60.0,
350
+ "grad_norm": 0.22270062565803528,
351
+ "learning_rate": 8.940435280641467e-05,
352
+ "loss": 0.0303,
353
+ "step": 240
354
+ },
355
+ {
356
+ "epoch": 61.32,
357
+ "grad_norm": 0.07712433487176895,
358
+ "learning_rate": 8.91179839633448e-05,
359
+ "loss": 0.0324,
360
+ "step": 245
361
+ },
362
+ {
363
+ "epoch": 62.64,
364
+ "grad_norm": 0.05860769376158714,
365
+ "learning_rate": 8.883161512027491e-05,
366
+ "loss": 0.0321,
367
+ "step": 250
368
+ },
369
+ {
370
+ "epoch": 63.96,
371
+ "grad_norm": 0.05999445170164108,
372
+ "learning_rate": 8.854524627720504e-05,
373
+ "loss": 0.0315,
374
+ "step": 255
375
+ },
376
+ {
377
+ "epoch": 65.0,
378
+ "grad_norm": 0.20564565062522888,
379
+ "learning_rate": 8.825887743413516e-05,
380
+ "loss": 0.0364,
381
+ "step": 260
382
+ },
383
+ {
384
+ "epoch": 66.32,
385
+ "grad_norm": 0.0610821433365345,
386
+ "learning_rate": 8.797250859106529e-05,
387
+ "loss": 0.0311,
388
+ "step": 265
389
+ },
390
+ {
391
+ "epoch": 67.64,
392
+ "grad_norm": 0.05693706497550011,
393
+ "learning_rate": 8.768613974799542e-05,
394
+ "loss": 0.0294,
395
+ "step": 270
396
+ },
397
+ {
398
+ "epoch": 68.96,
399
+ "grad_norm": 0.06817185133695602,
400
+ "learning_rate": 8.739977090492554e-05,
401
+ "loss": 0.0284,
402
+ "step": 275
403
+ },
404
+ {
405
+ "epoch": 70.0,
406
+ "grad_norm": 0.17458151280879974,
407
+ "learning_rate": 8.711340206185567e-05,
408
+ "loss": 0.0291,
409
+ "step": 280
410
+ },
411
+ {
412
+ "epoch": 71.32,
413
+ "grad_norm": 0.07353579252958298,
414
+ "learning_rate": 8.682703321878581e-05,
415
+ "loss": 0.0324,
416
+ "step": 285
417
+ },
418
+ {
419
+ "epoch": 72.64,
420
+ "grad_norm": 0.061573781073093414,
421
+ "learning_rate": 8.654066437571594e-05,
422
+ "loss": 0.0305,
423
+ "step": 290
424
+ },
425
+ {
426
+ "epoch": 73.96,
427
+ "grad_norm": 0.07544506341218948,
428
+ "learning_rate": 8.625429553264606e-05,
429
+ "loss": 0.0294,
430
+ "step": 295
431
+ },
432
+ {
433
+ "epoch": 75.0,
434
+ "grad_norm": 0.11100324243307114,
435
+ "learning_rate": 8.596792668957619e-05,
436
+ "loss": 0.03,
437
+ "step": 300
438
+ },
439
+ {
440
+ "epoch": 76.32,
441
+ "grad_norm": 0.0491141714155674,
442
+ "learning_rate": 8.56815578465063e-05,
443
+ "loss": 0.0296,
444
+ "step": 305
445
+ },
446
+ {
447
+ "epoch": 77.64,
448
+ "grad_norm": 0.07668624073266983,
449
+ "learning_rate": 8.539518900343643e-05,
450
+ "loss": 0.0345,
451
+ "step": 310
452
+ },
453
+ {
454
+ "epoch": 78.96,
455
+ "grad_norm": 0.07898231595754623,
456
+ "learning_rate": 8.510882016036655e-05,
457
+ "loss": 0.0306,
458
+ "step": 315
459
+ },
460
+ {
461
+ "epoch": 80.0,
462
+ "grad_norm": 0.3391458988189697,
463
+ "learning_rate": 8.482245131729668e-05,
464
+ "loss": 0.0334,
465
+ "step": 320
466
+ },
467
+ {
468
+ "epoch": 81.32,
469
+ "grad_norm": 0.05544694885611534,
470
+ "learning_rate": 8.453608247422681e-05,
471
+ "loss": 0.0282,
472
+ "step": 325
473
+ },
474
+ {
475
+ "epoch": 82.64,
476
+ "grad_norm": 0.05032579228281975,
477
+ "learning_rate": 8.424971363115693e-05,
478
+ "loss": 0.0323,
479
+ "step": 330
480
+ },
481
+ {
482
+ "epoch": 83.96,
483
+ "grad_norm": 0.05664476007223129,
484
+ "learning_rate": 8.396334478808706e-05,
485
+ "loss": 0.0295,
486
+ "step": 335
487
+ },
488
+ {
489
+ "epoch": 85.0,
490
+ "grad_norm": 0.24190960824489594,
491
+ "learning_rate": 8.367697594501719e-05,
492
+ "loss": 0.033,
493
+ "step": 340
494
+ },
495
+ {
496
+ "epoch": 86.32,
497
+ "grad_norm": 0.05068003758788109,
498
+ "learning_rate": 8.339060710194731e-05,
499
+ "loss": 0.0294,
500
+ "step": 345
501
+ },
502
+ {
503
+ "epoch": 87.64,
504
+ "grad_norm": 0.06719321757555008,
505
+ "learning_rate": 8.310423825887744e-05,
506
+ "loss": 0.0297,
507
+ "step": 350
508
+ },
509
+ {
510
+ "epoch": 88.96,
511
+ "grad_norm": 0.05750493332743645,
512
+ "learning_rate": 8.281786941580757e-05,
513
+ "loss": 0.0276,
514
+ "step": 355
515
+ },
516
+ {
517
+ "epoch": 90.0,
518
+ "grad_norm": 0.17318210005760193,
519
+ "learning_rate": 8.253150057273768e-05,
520
+ "loss": 0.0309,
521
+ "step": 360
522
+ },
523
+ {
524
+ "epoch": 91.32,
525
+ "grad_norm": 0.05168261379003525,
526
+ "learning_rate": 8.224513172966782e-05,
527
+ "loss": 0.0284,
528
+ "step": 365
529
+ },
530
+ {
531
+ "epoch": 92.64,
532
+ "grad_norm": 0.053040292114019394,
533
+ "learning_rate": 8.195876288659795e-05,
534
+ "loss": 0.0314,
535
+ "step": 370
536
+ },
537
+ {
538
+ "epoch": 93.96,
539
+ "grad_norm": 0.06162334978580475,
540
+ "learning_rate": 8.167239404352807e-05,
541
+ "loss": 0.0297,
542
+ "step": 375
543
+ },
544
+ {
545
+ "epoch": 95.0,
546
+ "grad_norm": 0.13474801182746887,
547
+ "learning_rate": 8.13860252004582e-05,
548
+ "loss": 0.0271,
549
+ "step": 380
550
+ },
551
+ {
552
+ "epoch": 96.32,
553
+ "grad_norm": 0.05177682265639305,
554
+ "learning_rate": 8.109965635738833e-05,
555
+ "loss": 0.0301,
556
+ "step": 385
557
+ },
558
+ {
559
+ "epoch": 97.64,
560
+ "grad_norm": 0.04276576265692711,
561
+ "learning_rate": 8.081328751431845e-05,
562
+ "loss": 0.0286,
563
+ "step": 390
564
+ },
565
+ {
566
+ "epoch": 98.96,
567
+ "grad_norm": 0.04698758199810982,
568
+ "learning_rate": 8.052691867124858e-05,
569
+ "loss": 0.0284,
570
+ "step": 395
571
+ },
572
+ {
573
+ "epoch": 100.0,
574
+ "grad_norm": 0.14094208180904388,
575
+ "learning_rate": 8.02405498281787e-05,
576
+ "loss": 0.0302,
577
+ "step": 400
578
+ },
579
+ {
580
+ "epoch": 101.32,
581
+ "grad_norm": 0.0528222993016243,
582
+ "learning_rate": 7.995418098510883e-05,
583
+ "loss": 0.0304,
584
+ "step": 405
585
+ },
586
+ {
587
+ "epoch": 102.64,
588
+ "grad_norm": 0.053034182637929916,
589
+ "learning_rate": 7.966781214203894e-05,
590
+ "loss": 0.0316,
591
+ "step": 410
592
+ },
593
+ {
594
+ "epoch": 103.96,
595
+ "grad_norm": 0.05732697248458862,
596
+ "learning_rate": 7.938144329896907e-05,
597
+ "loss": 0.0295,
598
+ "step": 415
599
+ },
600
+ {
601
+ "epoch": 105.0,
602
+ "grad_norm": 0.17511749267578125,
603
+ "learning_rate": 7.90950744558992e-05,
604
+ "loss": 0.0317,
605
+ "step": 420
606
+ },
607
+ {
608
+ "epoch": 106.32,
609
+ "grad_norm": 0.04588017240166664,
610
+ "learning_rate": 7.880870561282932e-05,
611
+ "loss": 0.0305,
612
+ "step": 425
613
+ },
614
+ {
615
+ "epoch": 107.64,
616
+ "grad_norm": 0.049282800406217575,
617
+ "learning_rate": 7.852233676975945e-05,
618
+ "loss": 0.031,
619
+ "step": 430
620
+ },
621
+ {
622
+ "epoch": 108.96,
623
+ "grad_norm": 0.04937691241502762,
624
+ "learning_rate": 7.823596792668958e-05,
625
+ "loss": 0.0278,
626
+ "step": 435
627
+ },
628
+ {
629
+ "epoch": 110.0,
630
+ "grad_norm": 0.11863432824611664,
631
+ "learning_rate": 7.79495990836197e-05,
632
+ "loss": 0.0316,
633
+ "step": 440
634
+ },
635
+ {
636
+ "epoch": 111.32,
637
+ "grad_norm": 0.04387475177645683,
638
+ "learning_rate": 7.766323024054983e-05,
639
+ "loss": 0.0283,
640
+ "step": 445
641
+ },
642
+ {
643
+ "epoch": 112.64,
644
+ "grad_norm": 0.04409867897629738,
645
+ "learning_rate": 7.737686139747996e-05,
646
+ "loss": 0.0306,
647
+ "step": 450
648
+ },
649
+ {
650
+ "epoch": 113.96,
651
+ "grad_norm": 0.04834749549627304,
652
+ "learning_rate": 7.709049255441008e-05,
653
+ "loss": 0.0302,
654
+ "step": 455
655
+ },
656
+ {
657
+ "epoch": 115.0,
658
+ "grad_norm": 0.1553424447774887,
659
+ "learning_rate": 7.680412371134021e-05,
660
+ "loss": 0.0326,
661
+ "step": 460
662
+ },
663
+ {
664
+ "epoch": 116.32,
665
+ "grad_norm": 0.05963806435465813,
666
+ "learning_rate": 7.651775486827034e-05,
667
+ "loss": 0.0291,
668
+ "step": 465
669
+ },
670
+ {
671
+ "epoch": 117.64,
672
+ "grad_norm": 0.04697559028863907,
673
+ "learning_rate": 7.623138602520046e-05,
674
+ "loss": 0.027,
675
+ "step": 470
676
+ },
677
+ {
678
+ "epoch": 118.96,
679
+ "grad_norm": 0.04225379601120949,
680
+ "learning_rate": 7.594501718213059e-05,
681
+ "loss": 0.0343,
682
+ "step": 475
683
+ },
684
+ {
685
+ "epoch": 120.0,
686
+ "grad_norm": 0.1076933965086937,
687
+ "learning_rate": 7.565864833906071e-05,
688
+ "loss": 0.0288,
689
+ "step": 480
690
+ },
691
+ {
692
+ "epoch": 121.32,
693
+ "grad_norm": 0.04540383443236351,
694
+ "learning_rate": 7.537227949599084e-05,
695
+ "loss": 0.0291,
696
+ "step": 485
697
+ },
698
+ {
699
+ "epoch": 122.64,
700
+ "grad_norm": 0.05459335818886757,
701
+ "learning_rate": 7.508591065292097e-05,
702
+ "loss": 0.0289,
703
+ "step": 490
704
+ },
705
+ {
706
+ "epoch": 123.96,
707
+ "grad_norm": 0.05171333625912666,
708
+ "learning_rate": 7.47995418098511e-05,
709
+ "loss": 0.0284,
710
+ "step": 495
711
+ },
712
+ {
713
+ "epoch": 125.0,
714
+ "grad_norm": 0.08606769144535065,
715
+ "learning_rate": 7.451317296678122e-05,
716
+ "loss": 0.0314,
717
+ "step": 500
718
+ },
719
+ {
720
+ "epoch": 126.32,
721
+ "grad_norm": 0.040535662323236465,
722
+ "learning_rate": 7.422680412371135e-05,
723
+ "loss": 0.028,
724
+ "step": 505
725
+ },
726
+ {
727
+ "epoch": 127.64,
728
+ "grad_norm": 0.04621696099638939,
729
+ "learning_rate": 7.394043528064147e-05,
730
+ "loss": 0.0281,
731
+ "step": 510
732
+ },
733
+ {
734
+ "epoch": 128.96,
735
+ "grad_norm": 0.04407593980431557,
736
+ "learning_rate": 7.36540664375716e-05,
737
+ "loss": 0.0309,
738
+ "step": 515
739
+ },
740
+ {
741
+ "epoch": 130.0,
742
+ "grad_norm": 0.24090737104415894,
743
+ "learning_rate": 7.336769759450171e-05,
744
+ "loss": 0.0302,
745
+ "step": 520
746
+ },
747
+ {
748
+ "epoch": 131.32,
749
+ "grad_norm": 0.051712971180677414,
750
+ "learning_rate": 7.308132875143184e-05,
751
+ "loss": 0.0305,
752
+ "step": 525
753
+ },
754
+ {
755
+ "epoch": 132.64,
756
+ "grad_norm": 0.0373610258102417,
757
+ "learning_rate": 7.279495990836197e-05,
758
+ "loss": 0.0283,
759
+ "step": 530
760
+ },
761
+ {
762
+ "epoch": 133.96,
763
+ "grad_norm": 0.04424213245511055,
764
+ "learning_rate": 7.250859106529209e-05,
765
+ "loss": 0.0317,
766
+ "step": 535
767
+ },
768
+ {
769
+ "epoch": 135.0,
770
+ "grad_norm": 0.09113436192274094,
771
+ "learning_rate": 7.222222222222222e-05,
772
+ "loss": 0.0302,
773
+ "step": 540
774
+ },
775
+ {
776
+ "epoch": 136.32,
777
+ "grad_norm": 0.03745009005069733,
778
+ "learning_rate": 7.193585337915235e-05,
779
+ "loss": 0.0315,
780
+ "step": 545
781
+ },
782
+ {
783
+ "epoch": 137.64,
784
+ "grad_norm": 0.04058730602264404,
785
+ "learning_rate": 7.164948453608247e-05,
786
+ "loss": 0.0312,
787
+ "step": 550
788
+ },
789
+ {
790
+ "epoch": 138.96,
791
+ "grad_norm": 0.046279069036245346,
792
+ "learning_rate": 7.136311569301261e-05,
793
+ "loss": 0.0295,
794
+ "step": 555
795
+ },
796
+ {
797
+ "epoch": 140.0,
798
+ "grad_norm": 0.17239141464233398,
799
+ "learning_rate": 7.107674684994274e-05,
800
+ "loss": 0.0307,
801
+ "step": 560
802
+ },
803
+ {
804
+ "epoch": 141.32,
805
+ "grad_norm": 0.036460030823946,
806
+ "learning_rate": 7.079037800687286e-05,
807
+ "loss": 0.0284,
808
+ "step": 565
809
+ },
810
+ {
811
+ "epoch": 142.64,
812
+ "grad_norm": 0.03434258699417114,
813
+ "learning_rate": 7.050400916380299e-05,
814
+ "loss": 0.0283,
815
+ "step": 570
816
+ },
817
+ {
818
+ "epoch": 143.96,
819
+ "grad_norm": 0.0470467284321785,
820
+ "learning_rate": 7.02176403207331e-05,
821
+ "loss": 0.0296,
822
+ "step": 575
823
+ },
824
+ {
825
+ "epoch": 145.0,
826
+ "grad_norm": 0.07163394242525101,
827
+ "learning_rate": 6.993127147766323e-05,
828
+ "loss": 0.0256,
829
+ "step": 580
830
+ },
831
+ {
832
+ "epoch": 146.32,
833
+ "grad_norm": 0.042208388447761536,
834
+ "learning_rate": 6.964490263459336e-05,
835
+ "loss": 0.0298,
836
+ "step": 585
837
+ },
838
+ {
839
+ "epoch": 147.64,
840
+ "grad_norm": 0.04421050846576691,
841
+ "learning_rate": 6.935853379152348e-05,
842
+ "loss": 0.0274,
843
+ "step": 590
844
+ },
845
+ {
846
+ "epoch": 148.96,
847
+ "grad_norm": 0.047223106026649475,
848
+ "learning_rate": 6.907216494845361e-05,
849
+ "loss": 0.0311,
850
+ "step": 595
851
+ },
852
+ {
853
+ "epoch": 150.0,
854
+ "grad_norm": 0.1724609136581421,
855
+ "learning_rate": 6.878579610538374e-05,
856
+ "loss": 0.0302,
857
+ "step": 600
858
+ },
859
+ {
860
+ "epoch": 151.32,
861
+ "grad_norm": 0.042247697710990906,
862
+ "learning_rate": 6.849942726231386e-05,
863
+ "loss": 0.0287,
864
+ "step": 605
865
+ },
866
+ {
867
+ "epoch": 152.64,
868
+ "grad_norm": 0.05167734622955322,
869
+ "learning_rate": 6.821305841924399e-05,
870
+ "loss": 0.0279,
871
+ "step": 610
872
+ },
873
+ {
874
+ "epoch": 153.96,
875
+ "grad_norm": 0.03621920198202133,
876
+ "learning_rate": 6.792668957617412e-05,
877
+ "loss": 0.0291,
878
+ "step": 615
879
+ },
880
+ {
881
+ "epoch": 155.0,
882
+ "grad_norm": 0.22533060610294342,
883
+ "learning_rate": 6.764032073310424e-05,
884
+ "loss": 0.0354,
885
+ "step": 620
886
+ },
887
+ {
888
+ "epoch": 156.32,
889
+ "grad_norm": 0.03441638499498367,
890
+ "learning_rate": 6.735395189003437e-05,
891
+ "loss": 0.0285,
892
+ "step": 625
893
+ },
894
+ {
895
+ "epoch": 157.64,
896
+ "grad_norm": 0.03728373721241951,
897
+ "learning_rate": 6.706758304696448e-05,
898
+ "loss": 0.0299,
899
+ "step": 630
900
+ },
901
+ {
902
+ "epoch": 158.96,
903
+ "grad_norm": 0.043604422360658646,
904
+ "learning_rate": 6.678121420389462e-05,
905
+ "loss": 0.0292,
906
+ "step": 635
907
+ },
908
+ {
909
+ "epoch": 160.0,
910
+ "grad_norm": 0.17142102122306824,
911
+ "learning_rate": 6.649484536082475e-05,
912
+ "loss": 0.0331,
913
+ "step": 640
914
+ },
915
+ {
916
+ "epoch": 161.32,
917
+ "grad_norm": 0.03554172441363335,
918
+ "learning_rate": 6.620847651775487e-05,
919
+ "loss": 0.0305,
920
+ "step": 645
921
+ },
922
+ {
923
+ "epoch": 162.64,
924
+ "grad_norm": 0.043817318975925446,
925
+ "learning_rate": 6.5922107674685e-05,
926
+ "loss": 0.0305,
927
+ "step": 650
928
+ },
929
+ {
930
+ "epoch": 163.96,
931
+ "grad_norm": 0.04247381538152695,
932
+ "learning_rate": 6.563573883161513e-05,
933
+ "loss": 0.0302,
934
+ "step": 655
935
+ },
936
+ {
937
+ "epoch": 165.0,
938
+ "grad_norm": 0.09436971694231033,
939
+ "learning_rate": 6.534936998854525e-05,
940
+ "loss": 0.0292,
941
+ "step": 660
942
+ },
943
+ {
944
+ "epoch": 166.32,
945
+ "grad_norm": 0.04177865758538246,
946
+ "learning_rate": 6.506300114547538e-05,
947
+ "loss": 0.0281,
948
+ "step": 665
949
+ },
950
+ {
951
+ "epoch": 167.64,
952
+ "grad_norm": 0.04079804942011833,
953
+ "learning_rate": 6.477663230240551e-05,
954
+ "loss": 0.0288,
955
+ "step": 670
956
+ },
957
+ {
958
+ "epoch": 168.96,
959
+ "grad_norm": 0.039114974439144135,
960
+ "learning_rate": 6.449026345933563e-05,
961
+ "loss": 0.0275,
962
+ "step": 675
963
+ },
964
+ {
965
+ "epoch": 170.0,
966
+ "grad_norm": 0.11661799997091293,
967
+ "learning_rate": 6.420389461626576e-05,
968
+ "loss": 0.0288,
969
+ "step": 680
970
+ },
971
+ {
972
+ "epoch": 171.32,
973
+ "grad_norm": 0.03423461318016052,
974
+ "learning_rate": 6.391752577319587e-05,
975
+ "loss": 0.0306,
976
+ "step": 685
977
+ },
978
+ {
979
+ "epoch": 172.64,
980
+ "grad_norm": 0.03615871071815491,
981
+ "learning_rate": 6.3631156930126e-05,
982
+ "loss": 0.031,
983
+ "step": 690
984
+ },
985
+ {
986
+ "epoch": 173.96,
987
+ "grad_norm": 0.04067518189549446,
988
+ "learning_rate": 6.334478808705613e-05,
989
+ "loss": 0.0312,
990
+ "step": 695
991
+ },
992
+ {
993
+ "epoch": 175.0,
994
+ "grad_norm": 0.11094173789024353,
995
+ "learning_rate": 6.305841924398625e-05,
996
+ "loss": 0.0287,
997
+ "step": 700
998
+ },
999
+ {
1000
+ "epoch": 176.32,
1001
+ "grad_norm": 0.03173477575182915,
1002
+ "learning_rate": 6.277205040091638e-05,
1003
+ "loss": 0.0296,
1004
+ "step": 705
1005
+ },
1006
+ {
1007
+ "epoch": 177.64,
1008
+ "grad_norm": 0.0374116450548172,
1009
+ "learning_rate": 6.24856815578465e-05,
1010
+ "loss": 0.0292,
1011
+ "step": 710
1012
+ },
1013
+ {
1014
+ "epoch": 178.96,
1015
+ "grad_norm": 0.03814936801791191,
1016
+ "learning_rate": 6.219931271477663e-05,
1017
+ "loss": 0.0273,
1018
+ "step": 715
1019
+ },
1020
+ {
1021
+ "epoch": 180.0,
1022
+ "grad_norm": 0.14255362749099731,
1023
+ "learning_rate": 6.191294387170676e-05,
1024
+ "loss": 0.0335,
1025
+ "step": 720
1026
+ },
1027
+ {
1028
+ "epoch": 181.32,
1029
+ "grad_norm": 0.04104507714509964,
1030
+ "learning_rate": 6.162657502863689e-05,
1031
+ "loss": 0.0303,
1032
+ "step": 725
1033
+ },
1034
+ {
1035
+ "epoch": 182.64,
1036
+ "grad_norm": 0.037353888154029846,
1037
+ "learning_rate": 6.134020618556701e-05,
1038
+ "loss": 0.0271,
1039
+ "step": 730
1040
+ },
1041
+ {
1042
+ "epoch": 183.96,
1043
+ "grad_norm": 0.03552788123488426,
1044
+ "learning_rate": 6.105383734249714e-05,
1045
+ "loss": 0.0288,
1046
+ "step": 735
1047
+ },
1048
+ {
1049
+ "epoch": 185.0,
1050
+ "grad_norm": 0.09345243126153946,
1051
+ "learning_rate": 6.076746849942726e-05,
1052
+ "loss": 0.0271,
1053
+ "step": 740
1054
+ },
1055
+ {
1056
+ "epoch": 186.32,
1057
+ "grad_norm": 0.031304650008678436,
1058
+ "learning_rate": 6.0481099656357384e-05,
1059
+ "loss": 0.0287,
1060
+ "step": 745
1061
+ },
1062
+ {
1063
+ "epoch": 187.64,
1064
+ "grad_norm": 0.03588686138391495,
1065
+ "learning_rate": 6.019473081328752e-05,
1066
+ "loss": 0.0292,
1067
+ "step": 750
1068
+ },
1069
+ {
1070
+ "epoch": 188.96,
1071
+ "grad_norm": 0.03166257590055466,
1072
+ "learning_rate": 5.9908361970217644e-05,
1073
+ "loss": 0.0277,
1074
+ "step": 755
1075
+ },
1076
+ {
1077
+ "epoch": 190.0,
1078
+ "grad_norm": 0.09115266799926758,
1079
+ "learning_rate": 5.962199312714777e-05,
1080
+ "loss": 0.0286,
1081
+ "step": 760
1082
+ },
1083
+ {
1084
+ "epoch": 191.32,
1085
+ "grad_norm": 0.028432967141270638,
1086
+ "learning_rate": 5.93356242840779e-05,
1087
+ "loss": 0.0277,
1088
+ "step": 765
1089
+ },
1090
+ {
1091
+ "epoch": 192.64,
1092
+ "grad_norm": 0.04126034304499626,
1093
+ "learning_rate": 5.904925544100802e-05,
1094
+ "loss": 0.0315,
1095
+ "step": 770
1096
+ },
1097
+ {
1098
+ "epoch": 193.96,
1099
+ "grad_norm": 0.04166596010327339,
1100
+ "learning_rate": 5.876288659793815e-05,
1101
+ "loss": 0.0281,
1102
+ "step": 775
1103
+ },
1104
+ {
1105
+ "epoch": 195.0,
1106
+ "grad_norm": 0.11017812788486481,
1107
+ "learning_rate": 5.8476517754868276e-05,
1108
+ "loss": 0.0285,
1109
+ "step": 780
1110
+ },
1111
+ {
1112
+ "epoch": 196.32,
1113
+ "grad_norm": 0.04071119427680969,
1114
+ "learning_rate": 5.81901489117984e-05,
1115
+ "loss": 0.0289,
1116
+ "step": 785
1117
+ },
1118
+ {
1119
+ "epoch": 197.64,
1120
+ "grad_norm": 0.03756481036543846,
1121
+ "learning_rate": 5.790378006872853e-05,
1122
+ "loss": 0.0276,
1123
+ "step": 790
1124
+ },
1125
+ {
1126
+ "epoch": 198.96,
1127
+ "grad_norm": 0.039780210703611374,
1128
+ "learning_rate": 5.761741122565865e-05,
1129
+ "loss": 0.0282,
1130
+ "step": 795
1131
+ },
1132
+ {
1133
+ "epoch": 200.0,
1134
+ "grad_norm": 0.12418342381715775,
1135
+ "learning_rate": 5.7331042382588775e-05,
1136
+ "loss": 0.0296,
1137
+ "step": 800
1138
+ },
1139
+ {
1140
+ "epoch": 201.32,
1141
+ "grad_norm": 0.0338447242975235,
1142
+ "learning_rate": 5.70446735395189e-05,
1143
+ "loss": 0.0286,
1144
+ "step": 805
1145
+ },
1146
+ {
1147
+ "epoch": 202.64,
1148
+ "grad_norm": 0.03490043804049492,
1149
+ "learning_rate": 5.675830469644903e-05,
1150
+ "loss": 0.0306,
1151
+ "step": 810
1152
+ },
1153
+ {
1154
+ "epoch": 203.96,
1155
+ "grad_norm": 0.03847096487879753,
1156
+ "learning_rate": 5.6471935853379155e-05,
1157
+ "loss": 0.0283,
1158
+ "step": 815
1159
+ },
1160
+ {
1161
+ "epoch": 205.0,
1162
+ "grad_norm": 0.10988269001245499,
1163
+ "learning_rate": 5.618556701030928e-05,
1164
+ "loss": 0.0278,
1165
+ "step": 820
1166
+ },
1167
+ {
1168
+ "epoch": 206.32,
1169
+ "grad_norm": 0.034018851816654205,
1170
+ "learning_rate": 5.589919816723941e-05,
1171
+ "loss": 0.03,
1172
+ "step": 825
1173
+ },
1174
+ {
1175
+ "epoch": 207.64,
1176
+ "grad_norm": 0.032927289605140686,
1177
+ "learning_rate": 5.5612829324169534e-05,
1178
+ "loss": 0.0293,
1179
+ "step": 830
1180
+ },
1181
+ {
1182
+ "epoch": 208.96,
1183
+ "grad_norm": 0.03604916110634804,
1184
+ "learning_rate": 5.532646048109966e-05,
1185
+ "loss": 0.0285,
1186
+ "step": 835
1187
+ },
1188
+ {
1189
+ "epoch": 210.0,
1190
+ "grad_norm": 0.09708557277917862,
1191
+ "learning_rate": 5.504009163802979e-05,
1192
+ "loss": 0.0305,
1193
+ "step": 840
1194
+ },
1195
+ {
1196
+ "epoch": 211.32,
1197
+ "grad_norm": 0.03745417296886444,
1198
+ "learning_rate": 5.4753722794959914e-05,
1199
+ "loss": 0.0296,
1200
+ "step": 845
1201
+ },
1202
+ {
1203
+ "epoch": 212.64,
1204
+ "grad_norm": 0.028906095772981644,
1205
+ "learning_rate": 5.4467353951890033e-05,
1206
+ "loss": 0.0277,
1207
+ "step": 850
1208
+ },
1209
+ {
1210
+ "epoch": 213.96,
1211
+ "grad_norm": 0.03228568285703659,
1212
+ "learning_rate": 5.418098510882016e-05,
1213
+ "loss": 0.0295,
1214
+ "step": 855
1215
+ },
1216
+ {
1217
+ "epoch": 215.0,
1218
+ "grad_norm": 0.1302802711725235,
1219
+ "learning_rate": 5.3894616265750286e-05,
1220
+ "loss": 0.031,
1221
+ "step": 860
1222
+ },
1223
+ {
1224
+ "epoch": 216.32,
1225
+ "grad_norm": 0.031472526490688324,
1226
+ "learning_rate": 5.360824742268041e-05,
1227
+ "loss": 0.0286,
1228
+ "step": 865
1229
+ },
1230
+ {
1231
+ "epoch": 217.64,
1232
+ "grad_norm": 0.03589686006307602,
1233
+ "learning_rate": 5.332187857961054e-05,
1234
+ "loss": 0.0308,
1235
+ "step": 870
1236
+ },
1237
+ {
1238
+ "epoch": 218.96,
1239
+ "grad_norm": 0.04117952659726143,
1240
+ "learning_rate": 5.3035509736540666e-05,
1241
+ "loss": 0.0298,
1242
+ "step": 875
1243
+ },
1244
+ {
1245
+ "epoch": 220.0,
1246
+ "grad_norm": 0.16901935636997223,
1247
+ "learning_rate": 5.274914089347079e-05,
1248
+ "loss": 0.0316,
1249
+ "step": 880
1250
+ },
1251
+ {
1252
+ "epoch": 221.32,
1253
+ "grad_norm": 0.03608705848455429,
1254
+ "learning_rate": 5.246277205040092e-05,
1255
+ "loss": 0.0297,
1256
+ "step": 885
1257
+ },
1258
+ {
1259
+ "epoch": 222.64,
1260
+ "grad_norm": 0.028423065319657326,
1261
+ "learning_rate": 5.2176403207331045e-05,
1262
+ "loss": 0.029,
1263
+ "step": 890
1264
+ },
1265
+ {
1266
+ "epoch": 223.96,
1267
+ "grad_norm": 0.03328604996204376,
1268
+ "learning_rate": 5.189003436426118e-05,
1269
+ "loss": 0.0304,
1270
+ "step": 895
1271
+ },
1272
+ {
1273
+ "epoch": 225.0,
1274
+ "grad_norm": 0.1140102967619896,
1275
+ "learning_rate": 5.1603665521191305e-05,
1276
+ "loss": 0.0295,
1277
+ "step": 900
1278
+ },
1279
+ {
1280
+ "epoch": 226.32,
1281
+ "grad_norm": 0.03379100188612938,
1282
+ "learning_rate": 5.131729667812142e-05,
1283
+ "loss": 0.0289,
1284
+ "step": 905
1285
+ },
1286
+ {
1287
+ "epoch": 227.64,
1288
+ "grad_norm": 0.03175675496459007,
1289
+ "learning_rate": 5.1030927835051544e-05,
1290
+ "loss": 0.0272,
1291
+ "step": 910
1292
+ },
1293
+ {
1294
+ "epoch": 228.96,
1295
+ "grad_norm": 0.0344826877117157,
1296
+ "learning_rate": 5.074455899198167e-05,
1297
+ "loss": 0.0308,
1298
+ "step": 915
1299
+ },
1300
+ {
1301
+ "epoch": 230.0,
1302
+ "grad_norm": 0.1841171830892563,
1303
+ "learning_rate": 5.04581901489118e-05,
1304
+ "loss": 0.0308,
1305
+ "step": 920
1306
+ },
1307
+ {
1308
+ "epoch": 231.32,
1309
+ "grad_norm": 0.03660387173295021,
1310
+ "learning_rate": 5.0171821305841924e-05,
1311
+ "loss": 0.0326,
1312
+ "step": 925
1313
+ },
1314
+ {
1315
+ "epoch": 232.64,
1316
+ "grad_norm": 0.03065328672528267,
1317
+ "learning_rate": 4.988545246277205e-05,
1318
+ "loss": 0.0295,
1319
+ "step": 930
1320
+ },
1321
+ {
1322
+ "epoch": 233.96,
1323
+ "grad_norm": 0.03993593156337738,
1324
+ "learning_rate": 4.9599083619702184e-05,
1325
+ "loss": 0.0292,
1326
+ "step": 935
1327
+ },
1328
+ {
1329
+ "epoch": 235.0,
1330
+ "grad_norm": 0.10738981515169144,
1331
+ "learning_rate": 4.931271477663231e-05,
1332
+ "loss": 0.0302,
1333
+ "step": 940
1334
+ },
1335
+ {
1336
+ "epoch": 236.32,
1337
+ "grad_norm": 0.03143048286437988,
1338
+ "learning_rate": 4.902634593356243e-05,
1339
+ "loss": 0.0278,
1340
+ "step": 945
1341
+ },
1342
+ {
1343
+ "epoch": 237.64,
1344
+ "grad_norm": 0.028968214988708496,
1345
+ "learning_rate": 4.8739977090492556e-05,
1346
+ "loss": 0.0271,
1347
+ "step": 950
1348
+ },
1349
+ {
1350
+ "epoch": 238.96,
1351
+ "grad_norm": 0.038674987852573395,
1352
+ "learning_rate": 4.845360824742268e-05,
1353
+ "loss": 0.0297,
1354
+ "step": 955
1355
+ },
1356
+ {
1357
+ "epoch": 240.0,
1358
+ "grad_norm": 0.10797161608934402,
1359
+ "learning_rate": 4.816723940435281e-05,
1360
+ "loss": 0.0278,
1361
+ "step": 960
1362
+ },
1363
+ {
1364
+ "epoch": 241.32,
1365
+ "grad_norm": 0.03592285141348839,
1366
+ "learning_rate": 4.7880870561282936e-05,
1367
+ "loss": 0.0281,
1368
+ "step": 965
1369
+ },
1370
+ {
1371
+ "epoch": 242.64,
1372
+ "grad_norm": 0.031206540763378143,
1373
+ "learning_rate": 4.7594501718213055e-05,
1374
+ "loss": 0.031,
1375
+ "step": 970
1376
+ },
1377
+ {
1378
+ "epoch": 243.96,
1379
+ "grad_norm": 0.03692101314663887,
1380
+ "learning_rate": 4.730813287514318e-05,
1381
+ "loss": 0.0276,
1382
+ "step": 975
1383
+ },
1384
+ {
1385
+ "epoch": 245.0,
1386
+ "grad_norm": 0.1415632963180542,
1387
+ "learning_rate": 4.7021764032073315e-05,
1388
+ "loss": 0.0325,
1389
+ "step": 980
1390
+ },
1391
+ {
1392
+ "epoch": 246.32,
1393
+ "grad_norm": 0.0346578024327755,
1394
+ "learning_rate": 4.673539518900344e-05,
1395
+ "loss": 0.0291,
1396
+ "step": 985
1397
+ },
1398
+ {
1399
+ "epoch": 247.64,
1400
+ "grad_norm": 0.036887165158987045,
1401
+ "learning_rate": 4.644902634593357e-05,
1402
+ "loss": 0.0279,
1403
+ "step": 990
1404
+ },
1405
+ {
1406
+ "epoch": 248.96,
1407
+ "grad_norm": 0.03107571043074131,
1408
+ "learning_rate": 4.6162657502863694e-05,
1409
+ "loss": 0.0277,
1410
+ "step": 995
1411
+ },
1412
+ {
1413
+ "epoch": 250.0,
1414
+ "grad_norm": 0.13857436180114746,
1415
+ "learning_rate": 4.5876288659793814e-05,
1416
+ "loss": 0.0287,
1417
+ "step": 1000
1418
+ },
1419
+ {
1420
+ "epoch": 251.32,
1421
+ "grad_norm": 0.03328908607363701,
1422
+ "learning_rate": 4.558991981672394e-05,
1423
+ "loss": 0.0275,
1424
+ "step": 1005
1425
+ },
1426
+ {
1427
+ "epoch": 252.64,
1428
+ "grad_norm": 0.03218206763267517,
1429
+ "learning_rate": 4.530355097365407e-05,
1430
+ "loss": 0.0295,
1431
+ "step": 1010
1432
+ },
1433
+ {
1434
+ "epoch": 253.96,
1435
+ "grad_norm": 0.030677294358611107,
1436
+ "learning_rate": 4.5017182130584194e-05,
1437
+ "loss": 0.0288,
1438
+ "step": 1015
1439
+ },
1440
+ {
1441
+ "epoch": 255.0,
1442
+ "grad_norm": 0.08906098455190659,
1443
+ "learning_rate": 4.473081328751432e-05,
1444
+ "loss": 0.0283,
1445
+ "step": 1020
1446
+ },
1447
+ {
1448
+ "epoch": 256.32,
1449
+ "grad_norm": 0.0315646268427372,
1450
+ "learning_rate": 4.4444444444444447e-05,
1451
+ "loss": 0.0292,
1452
+ "step": 1025
1453
+ },
1454
+ {
1455
+ "epoch": 257.64,
1456
+ "grad_norm": 0.0322076752781868,
1457
+ "learning_rate": 4.415807560137457e-05,
1458
+ "loss": 0.0286,
1459
+ "step": 1030
1460
+ },
1461
+ {
1462
+ "epoch": 258.96,
1463
+ "grad_norm": 0.03561684116721153,
1464
+ "learning_rate": 4.38717067583047e-05,
1465
+ "loss": 0.0266,
1466
+ "step": 1035
1467
+ },
1468
+ {
1469
+ "epoch": 260.0,
1470
+ "grad_norm": 0.1383010447025299,
1471
+ "learning_rate": 4.3585337915234826e-05,
1472
+ "loss": 0.0291,
1473
+ "step": 1040
1474
+ },
1475
+ {
1476
+ "epoch": 261.32,
1477
+ "grad_norm": 0.02982248179614544,
1478
+ "learning_rate": 4.329896907216495e-05,
1479
+ "loss": 0.0281,
1480
+ "step": 1045
1481
+ },
1482
+ {
1483
+ "epoch": 262.64,
1484
+ "grad_norm": 0.03563191369175911,
1485
+ "learning_rate": 4.301260022909508e-05,
1486
+ "loss": 0.0285,
1487
+ "step": 1050
1488
+ },
1489
+ {
1490
+ "epoch": 263.96,
1491
+ "grad_norm": 0.03730940818786621,
1492
+ "learning_rate": 4.27262313860252e-05,
1493
+ "loss": 0.0298,
1494
+ "step": 1055
1495
+ },
1496
+ {
1497
+ "epoch": 265.0,
1498
+ "grad_norm": 0.12043489515781403,
1499
+ "learning_rate": 4.2439862542955325e-05,
1500
+ "loss": 0.029,
1501
+ "step": 1060
1502
+ },
1503
+ {
1504
+ "epoch": 266.32,
1505
+ "grad_norm": 0.03577538579702377,
1506
+ "learning_rate": 4.215349369988545e-05,
1507
+ "loss": 0.0304,
1508
+ "step": 1065
1509
+ },
1510
+ {
1511
+ "epoch": 267.64,
1512
+ "grad_norm": 0.035051047801971436,
1513
+ "learning_rate": 4.1867124856815585e-05,
1514
+ "loss": 0.0292,
1515
+ "step": 1070
1516
+ },
1517
+ {
1518
+ "epoch": 268.96,
1519
+ "grad_norm": 0.03524423763155937,
1520
+ "learning_rate": 4.158075601374571e-05,
1521
+ "loss": 0.029,
1522
+ "step": 1075
1523
+ },
1524
+ {
1525
+ "epoch": 270.0,
1526
+ "grad_norm": 0.11722230911254883,
1527
+ "learning_rate": 4.129438717067583e-05,
1528
+ "loss": 0.0286,
1529
+ "step": 1080
1530
+ },
1531
+ {
1532
+ "epoch": 271.32,
1533
+ "grad_norm": 0.0350823737680912,
1534
+ "learning_rate": 4.100801832760596e-05,
1535
+ "loss": 0.0295,
1536
+ "step": 1085
1537
+ },
1538
+ {
1539
+ "epoch": 272.64,
1540
+ "grad_norm": 0.03372941538691521,
1541
+ "learning_rate": 4.0721649484536084e-05,
1542
+ "loss": 0.0288,
1543
+ "step": 1090
1544
+ },
1545
+ {
1546
+ "epoch": 273.96,
1547
+ "grad_norm": 0.028644917532801628,
1548
+ "learning_rate": 4.043528064146621e-05,
1549
+ "loss": 0.0326,
1550
+ "step": 1095
1551
+ },
1552
+ {
1553
+ "epoch": 275.0,
1554
+ "grad_norm": 0.10958810150623322,
1555
+ "learning_rate": 4.014891179839634e-05,
1556
+ "loss": 0.0293,
1557
+ "step": 1100
1558
+ },
1559
+ {
1560
+ "epoch": 276.32,
1561
+ "grad_norm": 0.03524491935968399,
1562
+ "learning_rate": 3.9862542955326463e-05,
1563
+ "loss": 0.0289,
1564
+ "step": 1105
1565
+ },
1566
+ {
1567
+ "epoch": 277.64,
1568
+ "grad_norm": 0.028043361380696297,
1569
+ "learning_rate": 3.957617411225659e-05,
1570
+ "loss": 0.0292,
1571
+ "step": 1110
1572
+ },
1573
+ {
1574
+ "epoch": 278.96,
1575
+ "grad_norm": 0.03574656322598457,
1576
+ "learning_rate": 3.9289805269186716e-05,
1577
+ "loss": 0.028,
1578
+ "step": 1115
1579
+ },
1580
+ {
1581
+ "epoch": 280.0,
1582
+ "grad_norm": 0.12416456639766693,
1583
+ "learning_rate": 3.900343642611684e-05,
1584
+ "loss": 0.0278,
1585
+ "step": 1120
1586
+ },
1587
+ {
1588
+ "epoch": 281.32,
1589
+ "grad_norm": 0.02984347939491272,
1590
+ "learning_rate": 3.871706758304697e-05,
1591
+ "loss": 0.0324,
1592
+ "step": 1125
1593
+ },
1594
+ {
1595
+ "epoch": 282.64,
1596
+ "grad_norm": 0.03649289906024933,
1597
+ "learning_rate": 3.8430698739977096e-05,
1598
+ "loss": 0.0281,
1599
+ "step": 1130
1600
+ },
1601
+ {
1602
+ "epoch": 283.96,
1603
+ "grad_norm": 0.03943822532892227,
1604
+ "learning_rate": 3.8144329896907216e-05,
1605
+ "loss": 0.0268,
1606
+ "step": 1135
1607
+ },
1608
+ {
1609
+ "epoch": 285.0,
1610
+ "grad_norm": 0.14334431290626526,
1611
+ "learning_rate": 3.785796105383734e-05,
1612
+ "loss": 0.0305,
1613
+ "step": 1140
1614
+ },
1615
+ {
1616
+ "epoch": 286.32,
1617
+ "grad_norm": 0.030261779204010963,
1618
+ "learning_rate": 3.757159221076747e-05,
1619
+ "loss": 0.028,
1620
+ "step": 1145
1621
+ },
1622
+ {
1623
+ "epoch": 287.64,
1624
+ "grad_norm": 0.03134704381227493,
1625
+ "learning_rate": 3.7285223367697595e-05,
1626
+ "loss": 0.0267,
1627
+ "step": 1150
1628
+ },
1629
+ {
1630
+ "epoch": 288.96,
1631
+ "grad_norm": 0.031728796660900116,
1632
+ "learning_rate": 3.699885452462772e-05,
1633
+ "loss": 0.028,
1634
+ "step": 1155
1635
+ },
1636
+ {
1637
+ "epoch": 290.0,
1638
+ "grad_norm": 0.15487806499004364,
1639
+ "learning_rate": 3.671248568155785e-05,
1640
+ "loss": 0.031,
1641
+ "step": 1160
1642
+ },
1643
+ {
1644
+ "epoch": 291.32,
1645
+ "grad_norm": 0.033745523542165756,
1646
+ "learning_rate": 3.6426116838487974e-05,
1647
+ "loss": 0.0298,
1648
+ "step": 1165
1649
+ },
1650
+ {
1651
+ "epoch": 292.64,
1652
+ "grad_norm": 0.026857230812311172,
1653
+ "learning_rate": 3.61397479954181e-05,
1654
+ "loss": 0.0265,
1655
+ "step": 1170
1656
+ },
1657
+ {
1658
+ "epoch": 293.96,
1659
+ "grad_norm": 0.03467594459652901,
1660
+ "learning_rate": 3.585337915234823e-05,
1661
+ "loss": 0.0291,
1662
+ "step": 1175
1663
+ },
1664
+ {
1665
+ "epoch": 295.0,
1666
+ "grad_norm": 0.1255461573600769,
1667
+ "learning_rate": 3.5567010309278354e-05,
1668
+ "loss": 0.0304,
1669
+ "step": 1180
1670
+ },
1671
+ {
1672
+ "epoch": 296.32,
1673
+ "grad_norm": 0.03569836914539337,
1674
+ "learning_rate": 3.528064146620848e-05,
1675
+ "loss": 0.0275,
1676
+ "step": 1185
1677
+ },
1678
+ {
1679
+ "epoch": 297.64,
1680
+ "grad_norm": 0.03207559511065483,
1681
+ "learning_rate": 3.49942726231386e-05,
1682
+ "loss": 0.0288,
1683
+ "step": 1190
1684
+ },
1685
+ {
1686
+ "epoch": 298.96,
1687
+ "grad_norm": 0.03445427492260933,
1688
+ "learning_rate": 3.4707903780068726e-05,
1689
+ "loss": 0.0274,
1690
+ "step": 1195
1691
+ },
1692
+ {
1693
+ "epoch": 300.0,
1694
+ "grad_norm": 0.11089900881052017,
1695
+ "learning_rate": 3.442153493699885e-05,
1696
+ "loss": 0.0268,
1697
+ "step": 1200
1698
+ },
1699
+ {
1700
+ "epoch": 301.32,
1701
+ "grad_norm": 0.030901776626706123,
1702
+ "learning_rate": 3.4135166093928986e-05,
1703
+ "loss": 0.0285,
1704
+ "step": 1205
1705
+ },
1706
+ {
1707
+ "epoch": 302.64,
1708
+ "grad_norm": 0.03404972329735756,
1709
+ "learning_rate": 3.384879725085911e-05,
1710
+ "loss": 0.0282,
1711
+ "step": 1210
1712
+ },
1713
+ {
1714
+ "epoch": 303.96,
1715
+ "grad_norm": 0.03297970071434975,
1716
+ "learning_rate": 3.356242840778923e-05,
1717
+ "loss": 0.0289,
1718
+ "step": 1215
1719
+ },
1720
+ {
1721
+ "epoch": 305.0,
1722
+ "grad_norm": 0.08513491600751877,
1723
+ "learning_rate": 3.327605956471936e-05,
1724
+ "loss": 0.0271,
1725
+ "step": 1220
1726
+ },
1727
+ {
1728
+ "epoch": 306.32,
1729
+ "grad_norm": 0.02815438061952591,
1730
+ "learning_rate": 3.2989690721649485e-05,
1731
+ "loss": 0.028,
1732
+ "step": 1225
1733
+ },
1734
+ {
1735
+ "epoch": 307.64,
1736
+ "grad_norm": 0.031231220811605453,
1737
+ "learning_rate": 3.270332187857961e-05,
1738
+ "loss": 0.0308,
1739
+ "step": 1230
1740
+ },
1741
+ {
1742
+ "epoch": 308.96,
1743
+ "grad_norm": 0.03579903766512871,
1744
+ "learning_rate": 3.241695303550974e-05,
1745
+ "loss": 0.0256,
1746
+ "step": 1235
1747
+ },
1748
+ {
1749
+ "epoch": 310.0,
1750
+ "grad_norm": 0.1284906268119812,
1751
+ "learning_rate": 3.2130584192439865e-05,
1752
+ "loss": 0.029,
1753
+ "step": 1240
1754
+ },
1755
+ {
1756
+ "epoch": 311.32,
1757
+ "grad_norm": 0.02885010838508606,
1758
+ "learning_rate": 3.184421534936999e-05,
1759
+ "loss": 0.0267,
1760
+ "step": 1245
1761
+ },
1762
+ {
1763
+ "epoch": 312.64,
1764
+ "grad_norm": 0.040551669895648956,
1765
+ "learning_rate": 3.155784650630012e-05,
1766
+ "loss": 0.0277,
1767
+ "step": 1250
1768
+ },
1769
+ {
1770
+ "epoch": 313.96,
1771
+ "grad_norm": 0.024676747620105743,
1772
+ "learning_rate": 3.1271477663230244e-05,
1773
+ "loss": 0.0296,
1774
+ "step": 1255
1775
+ },
1776
+ {
1777
+ "epoch": 315.0,
1778
+ "grad_norm": 0.1250019669532776,
1779
+ "learning_rate": 3.098510882016037e-05,
1780
+ "loss": 0.0295,
1781
+ "step": 1260
1782
+ },
1783
+ {
1784
+ "epoch": 316.32,
1785
+ "grad_norm": 0.03083103522658348,
1786
+ "learning_rate": 3.06987399770905e-05,
1787
+ "loss": 0.0286,
1788
+ "step": 1265
1789
+ },
1790
+ {
1791
+ "epoch": 317.64,
1792
+ "grad_norm": 0.03254910558462143,
1793
+ "learning_rate": 3.0412371134020617e-05,
1794
+ "loss": 0.0277,
1795
+ "step": 1270
1796
+ },
1797
+ {
1798
+ "epoch": 318.96,
1799
+ "grad_norm": 0.028430206701159477,
1800
+ "learning_rate": 3.0126002290950743e-05,
1801
+ "loss": 0.0268,
1802
+ "step": 1275
1803
+ },
1804
+ {
1805
+ "epoch": 320.0,
1806
+ "grad_norm": 0.10449621081352234,
1807
+ "learning_rate": 2.983963344788087e-05,
1808
+ "loss": 0.0279,
1809
+ "step": 1280
1810
+ },
1811
+ {
1812
+ "epoch": 321.32,
1813
+ "grad_norm": 0.03180396929383278,
1814
+ "learning_rate": 2.9553264604811e-05,
1815
+ "loss": 0.0287,
1816
+ "step": 1285
1817
+ },
1818
+ {
1819
+ "epoch": 322.64,
1820
+ "grad_norm": 0.03462441638112068,
1821
+ "learning_rate": 2.9266895761741126e-05,
1822
+ "loss": 0.0267,
1823
+ "step": 1290
1824
+ },
1825
+ {
1826
+ "epoch": 323.96,
1827
+ "grad_norm": 0.032813649624586105,
1828
+ "learning_rate": 2.8980526918671253e-05,
1829
+ "loss": 0.0272,
1830
+ "step": 1295
1831
+ },
1832
+ {
1833
+ "epoch": 325.0,
1834
+ "grad_norm": 0.11716829985380173,
1835
+ "learning_rate": 2.8694158075601372e-05,
1836
+ "loss": 0.0301,
1837
+ "step": 1300
1838
+ },
1839
+ {
1840
+ "epoch": 326.32,
1841
+ "grad_norm": 0.0283154658973217,
1842
+ "learning_rate": 2.8407789232531502e-05,
1843
+ "loss": 0.0297,
1844
+ "step": 1305
1845
+ },
1846
+ {
1847
+ "epoch": 327.64,
1848
+ "grad_norm": 0.037692759186029434,
1849
+ "learning_rate": 2.812142038946163e-05,
1850
+ "loss": 0.0279,
1851
+ "step": 1310
1852
+ },
1853
+ {
1854
+ "epoch": 328.96,
1855
+ "grad_norm": 0.03138533979654312,
1856
+ "learning_rate": 2.7835051546391755e-05,
1857
+ "loss": 0.0272,
1858
+ "step": 1315
1859
+ },
1860
+ {
1861
+ "epoch": 330.0,
1862
+ "grad_norm": 0.07045339792966843,
1863
+ "learning_rate": 2.754868270332188e-05,
1864
+ "loss": 0.0268,
1865
+ "step": 1320
1866
+ },
1867
+ {
1868
+ "epoch": 331.32,
1869
+ "grad_norm": 0.029422452673316002,
1870
+ "learning_rate": 2.7262313860252005e-05,
1871
+ "loss": 0.0285,
1872
+ "step": 1325
1873
+ },
1874
+ {
1875
+ "epoch": 332.64,
1876
+ "grad_norm": 0.025272730737924576,
1877
+ "learning_rate": 2.697594501718213e-05,
1878
+ "loss": 0.027,
1879
+ "step": 1330
1880
+ },
1881
+ {
1882
+ "epoch": 333.96,
1883
+ "grad_norm": 0.03468950465321541,
1884
+ "learning_rate": 2.6689576174112258e-05,
1885
+ "loss": 0.0281,
1886
+ "step": 1335
1887
+ },
1888
+ {
1889
+ "epoch": 335.0,
1890
+ "grad_norm": 0.1138090044260025,
1891
+ "learning_rate": 2.6403207331042384e-05,
1892
+ "loss": 0.0283,
1893
+ "step": 1340
1894
+ },
1895
+ {
1896
+ "epoch": 336.32,
1897
+ "grad_norm": 0.0285523422062397,
1898
+ "learning_rate": 2.611683848797251e-05,
1899
+ "loss": 0.0292,
1900
+ "step": 1345
1901
+ },
1902
+ {
1903
+ "epoch": 337.64,
1904
+ "grad_norm": 0.034624941647052765,
1905
+ "learning_rate": 2.5830469644902637e-05,
1906
+ "loss": 0.0288,
1907
+ "step": 1350
1908
+ },
1909
+ {
1910
+ "epoch": 338.96,
1911
+ "grad_norm": 0.03252566233277321,
1912
+ "learning_rate": 2.554410080183276e-05,
1913
+ "loss": 0.0262,
1914
+ "step": 1355
1915
+ },
1916
+ {
1917
+ "epoch": 340.0,
1918
+ "grad_norm": 0.10238504409790039,
1919
+ "learning_rate": 2.5257731958762887e-05,
1920
+ "loss": 0.0278,
1921
+ "step": 1360
1922
+ },
1923
+ {
1924
+ "epoch": 341.32,
1925
+ "grad_norm": 0.028706086799502373,
1926
+ "learning_rate": 2.4971363115693013e-05,
1927
+ "loss": 0.0287,
1928
+ "step": 1365
1929
+ },
1930
+ {
1931
+ "epoch": 342.64,
1932
+ "grad_norm": 0.03616653010249138,
1933
+ "learning_rate": 2.468499427262314e-05,
1934
+ "loss": 0.0288,
1935
+ "step": 1370
1936
+ },
1937
+ {
1938
+ "epoch": 343.96,
1939
+ "grad_norm": 0.033927544951438904,
1940
+ "learning_rate": 2.4398625429553266e-05,
1941
+ "loss": 0.0282,
1942
+ "step": 1375
1943
+ },
1944
+ {
1945
+ "epoch": 345.0,
1946
+ "grad_norm": 0.12410403043031693,
1947
+ "learning_rate": 2.4112256586483393e-05,
1948
+ "loss": 0.0289,
1949
+ "step": 1380
1950
+ },
1951
+ {
1952
+ "epoch": 346.32,
1953
+ "grad_norm": 0.033267851918935776,
1954
+ "learning_rate": 2.3825887743413516e-05,
1955
+ "loss": 0.0285,
1956
+ "step": 1385
1957
+ },
1958
+ {
1959
+ "epoch": 347.64,
1960
+ "grad_norm": 0.028466830030083656,
1961
+ "learning_rate": 2.3539518900343642e-05,
1962
+ "loss": 0.0274,
1963
+ "step": 1390
1964
+ },
1965
+ {
1966
+ "epoch": 348.96,
1967
+ "grad_norm": 0.0284014530479908,
1968
+ "learning_rate": 2.3253150057273772e-05,
1969
+ "loss": 0.0289,
1970
+ "step": 1395
1971
+ },
1972
+ {
1973
+ "epoch": 350.0,
1974
+ "grad_norm": 0.10417843610048294,
1975
+ "learning_rate": 2.2966781214203895e-05,
1976
+ "loss": 0.0288,
1977
+ "step": 1400
1978
+ },
1979
+ {
1980
+ "epoch": 351.32,
1981
+ "grad_norm": 0.02494928613305092,
1982
+ "learning_rate": 2.268041237113402e-05,
1983
+ "loss": 0.028,
1984
+ "step": 1405
1985
+ },
1986
+ {
1987
+ "epoch": 352.64,
1988
+ "grad_norm": 0.027743646875023842,
1989
+ "learning_rate": 2.2394043528064148e-05,
1990
+ "loss": 0.0288,
1991
+ "step": 1410
1992
+ },
1993
+ {
1994
+ "epoch": 353.96,
1995
+ "grad_norm": 0.037426408380270004,
1996
+ "learning_rate": 2.210767468499427e-05,
1997
+ "loss": 0.0268,
1998
+ "step": 1415
1999
+ },
2000
+ {
2001
+ "epoch": 355.0,
2002
+ "grad_norm": 0.06390511989593506,
2003
+ "learning_rate": 2.18213058419244e-05,
2004
+ "loss": 0.0281,
2005
+ "step": 1420
2006
+ },
2007
+ {
2008
+ "epoch": 356.32,
2009
+ "grad_norm": 0.02651941403746605,
2010
+ "learning_rate": 2.1534936998854528e-05,
2011
+ "loss": 0.0267,
2012
+ "step": 1425
2013
+ },
2014
+ {
2015
+ "epoch": 357.64,
2016
+ "grad_norm": 0.027626991271972656,
2017
+ "learning_rate": 2.124856815578465e-05,
2018
+ "loss": 0.0278,
2019
+ "step": 1430
2020
+ },
2021
+ {
2022
+ "epoch": 358.96,
2023
+ "grad_norm": 0.0289900004863739,
2024
+ "learning_rate": 2.0962199312714777e-05,
2025
+ "loss": 0.0289,
2026
+ "step": 1435
2027
+ },
2028
+ {
2029
+ "epoch": 360.0,
2030
+ "grad_norm": 0.08335373550653458,
2031
+ "learning_rate": 2.0675830469644904e-05,
2032
+ "loss": 0.0254,
2033
+ "step": 1440
2034
+ },
2035
+ {
2036
+ "epoch": 361.32,
2037
+ "grad_norm": 0.02882411703467369,
2038
+ "learning_rate": 2.038946162657503e-05,
2039
+ "loss": 0.0276,
2040
+ "step": 1445
2041
+ },
2042
+ {
2043
+ "epoch": 362.64,
2044
+ "grad_norm": 0.029498135671019554,
2045
+ "learning_rate": 2.0103092783505157e-05,
2046
+ "loss": 0.0273,
2047
+ "step": 1450
2048
+ },
2049
+ {
2050
+ "epoch": 363.96,
2051
+ "grad_norm": 0.030006349086761475,
2052
+ "learning_rate": 1.981672394043528e-05,
2053
+ "loss": 0.0266,
2054
+ "step": 1455
2055
+ },
2056
+ {
2057
+ "epoch": 365.0,
2058
+ "grad_norm": 0.08131309598684311,
2059
+ "learning_rate": 1.9530355097365406e-05,
2060
+ "loss": 0.0272,
2061
+ "step": 1460
2062
+ },
2063
+ {
2064
+ "epoch": 366.32,
2065
+ "grad_norm": 0.028547124937176704,
2066
+ "learning_rate": 1.9243986254295536e-05,
2067
+ "loss": 0.0267,
2068
+ "step": 1465
2069
+ },
2070
+ {
2071
+ "epoch": 367.64,
2072
+ "grad_norm": 0.027747539803385735,
2073
+ "learning_rate": 1.895761741122566e-05,
2074
+ "loss": 0.0273,
2075
+ "step": 1470
2076
+ },
2077
+ {
2078
+ "epoch": 368.96,
2079
+ "grad_norm": 0.032853253185749054,
2080
+ "learning_rate": 1.8671248568155786e-05,
2081
+ "loss": 0.0293,
2082
+ "step": 1475
2083
+ },
2084
+ {
2085
+ "epoch": 370.0,
2086
+ "grad_norm": 0.10667946934700012,
2087
+ "learning_rate": 1.8384879725085912e-05,
2088
+ "loss": 0.027,
2089
+ "step": 1480
2090
+ },
2091
+ {
2092
+ "epoch": 371.32,
2093
+ "grad_norm": 0.027019130066037178,
2094
+ "learning_rate": 1.809851088201604e-05,
2095
+ "loss": 0.0268,
2096
+ "step": 1485
2097
+ },
2098
+ {
2099
+ "epoch": 372.64,
2100
+ "grad_norm": 0.02968420460820198,
2101
+ "learning_rate": 1.7812142038946165e-05,
2102
+ "loss": 0.0303,
2103
+ "step": 1490
2104
+ },
2105
+ {
2106
+ "epoch": 373.96,
2107
+ "grad_norm": 0.03141555190086365,
2108
+ "learning_rate": 1.7525773195876288e-05,
2109
+ "loss": 0.0285,
2110
+ "step": 1495
2111
+ },
2112
+ {
2113
+ "epoch": 375.0,
2114
+ "grad_norm": 0.1068948432803154,
2115
+ "learning_rate": 1.7239404352806415e-05,
2116
+ "loss": 0.0293,
2117
+ "step": 1500
2118
+ },
2119
+ {
2120
+ "epoch": 376.32,
2121
+ "grad_norm": 0.03410301357507706,
2122
+ "learning_rate": 1.695303550973654e-05,
2123
+ "loss": 0.0276,
2124
+ "step": 1505
2125
+ },
2126
+ {
2127
+ "epoch": 377.64,
2128
+ "grad_norm": 0.03133257105946541,
2129
+ "learning_rate": 1.6666666666666667e-05,
2130
+ "loss": 0.029,
2131
+ "step": 1510
2132
+ },
2133
+ {
2134
+ "epoch": 378.96,
2135
+ "grad_norm": 0.028733504936099052,
2136
+ "learning_rate": 1.6380297823596794e-05,
2137
+ "loss": 0.0278,
2138
+ "step": 1515
2139
+ },
2140
+ {
2141
+ "epoch": 380.0,
2142
+ "grad_norm": 0.10409895330667496,
2143
+ "learning_rate": 1.609392898052692e-05,
2144
+ "loss": 0.027,
2145
+ "step": 1520
2146
+ },
2147
+ {
2148
+ "epoch": 381.32,
2149
+ "grad_norm": 0.036405060440301895,
2150
+ "learning_rate": 1.5807560137457044e-05,
2151
+ "loss": 0.0302,
2152
+ "step": 1525
2153
+ },
2154
+ {
2155
+ "epoch": 382.64,
2156
+ "grad_norm": 0.027341334149241447,
2157
+ "learning_rate": 1.5521191294387173e-05,
2158
+ "loss": 0.0272,
2159
+ "step": 1530
2160
+ },
2161
+ {
2162
+ "epoch": 383.96,
2163
+ "grad_norm": 0.039175573736429214,
2164
+ "learning_rate": 1.5234822451317298e-05,
2165
+ "loss": 0.0279,
2166
+ "step": 1535
2167
+ },
2168
+ {
2169
+ "epoch": 385.0,
2170
+ "grad_norm": 0.12478016316890717,
2171
+ "learning_rate": 1.4948453608247423e-05,
2172
+ "loss": 0.0285,
2173
+ "step": 1540
2174
+ },
2175
+ {
2176
+ "epoch": 386.32,
2177
+ "grad_norm": 0.029726864770054817,
2178
+ "learning_rate": 1.466208476517755e-05,
2179
+ "loss": 0.0289,
2180
+ "step": 1545
2181
+ },
2182
+ {
2183
+ "epoch": 387.64,
2184
+ "grad_norm": 0.03192641958594322,
2185
+ "learning_rate": 1.4375715922107674e-05,
2186
+ "loss": 0.0269,
2187
+ "step": 1550
2188
+ },
2189
+ {
2190
+ "epoch": 388.96,
2191
+ "grad_norm": 0.03215065971016884,
2192
+ "learning_rate": 1.40893470790378e-05,
2193
+ "loss": 0.0281,
2194
+ "step": 1555
2195
+ },
2196
+ {
2197
+ "epoch": 390.0,
2198
+ "grad_norm": 0.11953844130039215,
2199
+ "learning_rate": 1.3802978235967929e-05,
2200
+ "loss": 0.0296,
2201
+ "step": 1560
2202
+ },
2203
+ {
2204
+ "epoch": 391.32,
2205
+ "grad_norm": 0.027522824704647064,
2206
+ "learning_rate": 1.3516609392898052e-05,
2207
+ "loss": 0.0278,
2208
+ "step": 1565
2209
+ },
2210
+ {
2211
+ "epoch": 392.64,
2212
+ "grad_norm": 0.037742115557193756,
2213
+ "learning_rate": 1.323024054982818e-05,
2214
+ "loss": 0.0292,
2215
+ "step": 1570
2216
+ },
2217
+ {
2218
+ "epoch": 393.96,
2219
+ "grad_norm": 0.02829778380692005,
2220
+ "learning_rate": 1.2943871706758307e-05,
2221
+ "loss": 0.0264,
2222
+ "step": 1575
2223
+ },
2224
+ {
2225
+ "epoch": 395.0,
2226
+ "grad_norm": 0.12115279585123062,
2227
+ "learning_rate": 1.2657502863688431e-05,
2228
+ "loss": 0.0305,
2229
+ "step": 1580
2230
+ },
2231
+ {
2232
+ "epoch": 396.32,
2233
+ "grad_norm": 0.026462797075510025,
2234
+ "learning_rate": 1.2371134020618558e-05,
2235
+ "loss": 0.0273,
2236
+ "step": 1585
2237
+ },
2238
+ {
2239
+ "epoch": 397.64,
2240
+ "grad_norm": 0.03455578163266182,
2241
+ "learning_rate": 1.2084765177548683e-05,
2242
+ "loss": 0.0291,
2243
+ "step": 1590
2244
+ },
2245
+ {
2246
+ "epoch": 398.96,
2247
+ "grad_norm": 0.03112473525106907,
2248
+ "learning_rate": 1.1798396334478809e-05,
2249
+ "loss": 0.0263,
2250
+ "step": 1595
2251
+ },
2252
+ {
2253
+ "epoch": 400.0,
2254
+ "grad_norm": 0.13244664669036865,
2255
+ "learning_rate": 1.1512027491408934e-05,
2256
+ "loss": 0.03,
2257
+ "step": 1600
2258
+ },
2259
+ {
2260
+ "epoch": 401.32,
2261
+ "grad_norm": 0.030842171981930733,
2262
+ "learning_rate": 1.1225658648339062e-05,
2263
+ "loss": 0.0293,
2264
+ "step": 1605
2265
+ },
2266
+ {
2267
+ "epoch": 402.64,
2268
+ "grad_norm": 0.02750714123249054,
2269
+ "learning_rate": 1.0939289805269187e-05,
2270
+ "loss": 0.0269,
2271
+ "step": 1610
2272
+ },
2273
+ {
2274
+ "epoch": 403.96,
2275
+ "grad_norm": 0.027868203818798065,
2276
+ "learning_rate": 1.0652920962199313e-05,
2277
+ "loss": 0.0274,
2278
+ "step": 1615
2279
+ },
2280
+ {
2281
+ "epoch": 405.0,
2282
+ "grad_norm": 0.08533693850040436,
2283
+ "learning_rate": 1.036655211912944e-05,
2284
+ "loss": 0.0287,
2285
+ "step": 1620
2286
+ },
2287
+ {
2288
+ "epoch": 406.32,
2289
+ "grad_norm": 0.036794379353523254,
2290
+ "learning_rate": 1.0080183276059566e-05,
2291
+ "loss": 0.0284,
2292
+ "step": 1625
2293
+ },
2294
+ {
2295
+ "epoch": 407.64,
2296
+ "grad_norm": 0.03149307146668434,
2297
+ "learning_rate": 9.793814432989691e-06,
2298
+ "loss": 0.0264,
2299
+ "step": 1630
2300
+ },
2301
+ {
2302
+ "epoch": 408.96,
2303
+ "grad_norm": 0.03569972142577171,
2304
+ "learning_rate": 9.507445589919818e-06,
2305
+ "loss": 0.0284,
2306
+ "step": 1635
2307
+ },
2308
+ {
2309
+ "epoch": 410.0,
2310
+ "grad_norm": 0.10384050011634827,
2311
+ "learning_rate": 9.221076746849944e-06,
2312
+ "loss": 0.0276,
2313
+ "step": 1640
2314
+ },
2315
+ {
2316
+ "epoch": 411.32,
2317
+ "grad_norm": 0.028333071619272232,
2318
+ "learning_rate": 8.934707903780069e-06,
2319
+ "loss": 0.0272,
2320
+ "step": 1645
2321
+ },
2322
+ {
2323
+ "epoch": 412.64,
2324
+ "grad_norm": 0.028478605672717094,
2325
+ "learning_rate": 8.648339060710195e-06,
2326
+ "loss": 0.0295,
2327
+ "step": 1650
2328
+ },
2329
+ {
2330
+ "epoch": 413.96,
2331
+ "grad_norm": 0.028093887493014336,
2332
+ "learning_rate": 8.36197021764032e-06,
2333
+ "loss": 0.0295,
2334
+ "step": 1655
2335
+ },
2336
+ {
2337
+ "epoch": 415.0,
2338
+ "grad_norm": 0.10948823392391205,
2339
+ "learning_rate": 8.075601374570448e-06,
2340
+ "loss": 0.0286,
2341
+ "step": 1660
2342
+ },
2343
+ {
2344
+ "epoch": 416.32,
2345
+ "grad_norm": 0.02955321967601776,
2346
+ "learning_rate": 7.789232531500573e-06,
2347
+ "loss": 0.0266,
2348
+ "step": 1665
2349
+ },
2350
+ {
2351
+ "epoch": 417.64,
2352
+ "grad_norm": 0.02912413887679577,
2353
+ "learning_rate": 7.502863688430699e-06,
2354
+ "loss": 0.0273,
2355
+ "step": 1670
2356
+ },
2357
+ {
2358
+ "epoch": 418.96,
2359
+ "grad_norm": 0.028283055871725082,
2360
+ "learning_rate": 7.216494845360824e-06,
2361
+ "loss": 0.0266,
2362
+ "step": 1675
2363
+ },
2364
+ {
2365
+ "epoch": 420.0,
2366
+ "grad_norm": 0.090940460562706,
2367
+ "learning_rate": 6.930126002290952e-06,
2368
+ "loss": 0.0282,
2369
+ "step": 1680
2370
+ },
2371
+ {
2372
+ "epoch": 421.32,
2373
+ "grad_norm": 0.03360769525170326,
2374
+ "learning_rate": 6.643757159221077e-06,
2375
+ "loss": 0.027,
2376
+ "step": 1685
2377
+ },
2378
+ {
2379
+ "epoch": 422.64,
2380
+ "grad_norm": 0.029777785763144493,
2381
+ "learning_rate": 6.357388316151203e-06,
2382
+ "loss": 0.0274,
2383
+ "step": 1690
2384
+ },
2385
+ {
2386
+ "epoch": 423.96,
2387
+ "grad_norm": 0.03204215317964554,
2388
+ "learning_rate": 6.071019473081329e-06,
2389
+ "loss": 0.0273,
2390
+ "step": 1695
2391
+ },
2392
+ {
2393
+ "epoch": 425.0,
2394
+ "grad_norm": 0.1337508112192154,
2395
+ "learning_rate": 5.784650630011455e-06,
2396
+ "loss": 0.0275,
2397
+ "step": 1700
2398
+ },
2399
+ {
2400
+ "epoch": 426.32,
2401
+ "grad_norm": 0.03454073518514633,
2402
+ "learning_rate": 5.498281786941581e-06,
2403
+ "loss": 0.0274,
2404
+ "step": 1705
2405
+ },
2406
+ {
2407
+ "epoch": 427.64,
2408
+ "grad_norm": 0.029586778953671455,
2409
+ "learning_rate": 5.211912943871707e-06,
2410
+ "loss": 0.0319,
2411
+ "step": 1710
2412
+ },
2413
+ {
2414
+ "epoch": 428.96,
2415
+ "grad_norm": 0.02780616097152233,
2416
+ "learning_rate": 4.925544100801833e-06,
2417
+ "loss": 0.0297,
2418
+ "step": 1715
2419
+ },
2420
+ {
2421
+ "epoch": 430.0,
2422
+ "grad_norm": 0.13715778291225433,
2423
+ "learning_rate": 4.639175257731959e-06,
2424
+ "loss": 0.0309,
2425
+ "step": 1720
2426
+ },
2427
+ {
2428
+ "epoch": 431.32,
2429
+ "grad_norm": 0.03031608648598194,
2430
+ "learning_rate": 4.352806414662085e-06,
2431
+ "loss": 0.0276,
2432
+ "step": 1725
2433
+ },
2434
+ {
2435
+ "epoch": 432.64,
2436
+ "grad_norm": 0.031075894832611084,
2437
+ "learning_rate": 4.066437571592211e-06,
2438
+ "loss": 0.0265,
2439
+ "step": 1730
2440
+ },
2441
+ {
2442
+ "epoch": 433.96,
2443
+ "grad_norm": 0.02886197902262211,
2444
+ "learning_rate": 3.7800687285223365e-06,
2445
+ "loss": 0.0271,
2446
+ "step": 1735
2447
+ },
2448
+ {
2449
+ "epoch": 435.0,
2450
+ "grad_norm": 0.09652125835418701,
2451
+ "learning_rate": 3.493699885452463e-06,
2452
+ "loss": 0.0277,
2453
+ "step": 1740
2454
+ },
2455
+ {
2456
+ "epoch": 436.32,
2457
+ "grad_norm": 0.027949687093496323,
2458
+ "learning_rate": 3.2073310423825886e-06,
2459
+ "loss": 0.0289,
2460
+ "step": 1745
2461
+ },
2462
+ {
2463
+ "epoch": 437.64,
2464
+ "grad_norm": 0.026798376813530922,
2465
+ "learning_rate": 2.920962199312715e-06,
2466
+ "loss": 0.0282,
2467
+ "step": 1750
2468
+ },
2469
+ {
2470
+ "epoch": 438.96,
2471
+ "grad_norm": 0.032906703650951385,
2472
+ "learning_rate": 2.6345933562428407e-06,
2473
+ "loss": 0.0303,
2474
+ "step": 1755
2475
+ },
2476
+ {
2477
+ "epoch": 440.0,
2478
+ "grad_norm": 0.08160939812660217,
2479
+ "learning_rate": 2.3482245131729668e-06,
2480
+ "loss": 0.0257,
2481
+ "step": 1760
2482
+ },
2483
+ {
2484
+ "epoch": 441.32,
2485
+ "grad_norm": 0.03216954320669174,
2486
+ "learning_rate": 2.061855670103093e-06,
2487
+ "loss": 0.0281,
2488
+ "step": 1765
2489
+ },
2490
+ {
2491
+ "epoch": 442.64,
2492
+ "grad_norm": 0.03425678610801697,
2493
+ "learning_rate": 1.7754868270332189e-06,
2494
+ "loss": 0.0291,
2495
+ "step": 1770
2496
+ },
2497
+ {
2498
+ "epoch": 443.96,
2499
+ "grad_norm": 0.02900947816669941,
2500
+ "learning_rate": 1.4891179839633447e-06,
2501
+ "loss": 0.0271,
2502
+ "step": 1775
2503
+ },
2504
+ {
2505
+ "epoch": 445.0,
2506
+ "grad_norm": 0.09323178231716156,
2507
+ "learning_rate": 1.202749140893471e-06,
2508
+ "loss": 0.0293,
2509
+ "step": 1780
2510
+ },
2511
+ {
2512
+ "epoch": 446.32,
2513
+ "grad_norm": 0.028956923633813858,
2514
+ "learning_rate": 9.163802978235968e-07,
2515
+ "loss": 0.0266,
2516
+ "step": 1785
2517
+ },
2518
+ {
2519
+ "epoch": 447.64,
2520
+ "grad_norm": 0.029016662389039993,
2521
+ "learning_rate": 6.300114547537229e-07,
2522
+ "loss": 0.0278,
2523
+ "step": 1790
2524
+ },
2525
+ {
2526
+ "epoch": 448.96,
2527
+ "grad_norm": 0.03088531456887722,
2528
+ "learning_rate": 3.436426116838488e-07,
2529
+ "loss": 0.0282,
2530
+ "step": 1795
2531
+ },
2532
+ {
2533
+ "epoch": 450.0,
2534
+ "grad_norm": 0.11458810418844223,
2535
+ "learning_rate": 5.72737686139748e-08,
2536
+ "loss": 0.028,
2537
+ "step": 1800
2538
+ }
2539
+ ],
2540
+ "logging_steps": 5,
2541
+ "max_steps": 1800,
2542
+ "num_input_tokens_seen": 0,
2543
+ "num_train_epochs": 450,
2544
+ "save_steps": 300,
2545
+ "stateful_callbacks": {
2546
+ "TrainerControl": {
2547
+ "args": {
2548
+ "should_epoch_stop": false,
2549
+ "should_evaluate": false,
2550
+ "should_log": false,
2551
+ "should_save": true,
2552
+ "should_training_stop": true
2553
+ },
2554
+ "attributes": {}
2555
+ }
2556
+ },
2557
+ "total_flos": 1.5308141101056e+18,
2558
+ "train_batch_size": 2,
2559
+ "trial_name": null,
2560
+ "trial_params": null
2561
+ }
Mu-Math/group_01/checkpoints/checkpoint-1800/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Mu-Math/group_01/checkpoints/checkpoint-300/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-1.5B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** Qwen/Qwen2.5-1.5B-Instruct
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
Mu-Math/group_01/checkpoints/checkpoint-300/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/\u4e0d\u51bb\u7ed3Qwen\u8bad\u7ec3/models/Qwen2.5-1.5B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 128,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "down_proj",
24
+ "up_proj",
25
+ "gate_proj",
26
+ "q_proj",
27
+ "o_proj",
28
+ "v_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
Mu-Math/group_01/checkpoints/checkpoint-300/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
Mu-Math/group_01/checkpoints/checkpoint-300/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
Mu-Math/group_01/checkpoints/checkpoint-300/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
Mu-Math/group_01/checkpoints/checkpoint-300/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
Mu-Math/group_01/checkpoints/checkpoint-300/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
Mu-Math/group_01/checkpoints/checkpoint-300/trainer_state.json ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 75.0,
6
+ "eval_steps": 500,
7
+ "global_step": 300,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.32,
14
+ "grad_norm": 11.867908477783203,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.9204,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 1.32,
21
+ "grad_norm": 7.492858409881592,
22
+ "learning_rate": 7.4074074074074075e-06,
23
+ "loss": 1.8831,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 2.64,
28
+ "grad_norm": 3.9426615238189697,
29
+ "learning_rate": 1.6666666666666667e-05,
30
+ "loss": 1.6453,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 3.96,
35
+ "grad_norm": 1.769984483718872,
36
+ "learning_rate": 2.5925925925925925e-05,
37
+ "loss": 1.2506,
38
+ "step": 15
39
+ },
40
+ {
41
+ "epoch": 5.0,
42
+ "grad_norm": 1.108256220817566,
43
+ "learning_rate": 3.518518518518519e-05,
44
+ "loss": 1.0012,
45
+ "step": 20
46
+ },
47
+ {
48
+ "epoch": 6.32,
49
+ "grad_norm": 0.5219796299934387,
50
+ "learning_rate": 4.4444444444444447e-05,
51
+ "loss": 0.8034,
52
+ "step": 25
53
+ },
54
+ {
55
+ "epoch": 7.64,
56
+ "grad_norm": 0.6449305415153503,
57
+ "learning_rate": 5.370370370370371e-05,
58
+ "loss": 0.6539,
59
+ "step": 30
60
+ },
61
+ {
62
+ "epoch": 8.96,
63
+ "grad_norm": 0.580233633518219,
64
+ "learning_rate": 6.296296296296296e-05,
65
+ "loss": 0.5474,
66
+ "step": 35
67
+ },
68
+ {
69
+ "epoch": 10.0,
70
+ "grad_norm": 1.5570186376571655,
71
+ "learning_rate": 7.222222222222222e-05,
72
+ "loss": 0.4811,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 11.32,
77
+ "grad_norm": 0.5841688513755798,
78
+ "learning_rate": 8.148148148148148e-05,
79
+ "loss": 0.3477,
80
+ "step": 45
81
+ },
82
+ {
83
+ "epoch": 12.64,
84
+ "grad_norm": 0.7968279719352722,
85
+ "learning_rate": 9.074074074074075e-05,
86
+ "loss": 0.2089,
87
+ "step": 50
88
+ },
89
+ {
90
+ "epoch": 13.96,
91
+ "grad_norm": 0.8396451473236084,
92
+ "learning_rate": 0.0001,
93
+ "loss": 0.1357,
94
+ "step": 55
95
+ },
96
+ {
97
+ "epoch": 15.0,
98
+ "grad_norm": 2.7755286693573,
99
+ "learning_rate": 9.971363115693013e-05,
100
+ "loss": 0.1235,
101
+ "step": 60
102
+ },
103
+ {
104
+ "epoch": 16.32,
105
+ "grad_norm": 0.6953228116035461,
106
+ "learning_rate": 9.942726231386026e-05,
107
+ "loss": 0.0755,
108
+ "step": 65
109
+ },
110
+ {
111
+ "epoch": 17.64,
112
+ "grad_norm": 1.1248857975006104,
113
+ "learning_rate": 9.914089347079038e-05,
114
+ "loss": 0.0546,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 18.96,
119
+ "grad_norm": 0.5247378945350647,
120
+ "learning_rate": 9.885452462772051e-05,
121
+ "loss": 0.0602,
122
+ "step": 75
123
+ },
124
+ {
125
+ "epoch": 20.0,
126
+ "grad_norm": 1.827890157699585,
127
+ "learning_rate": 9.856815578465064e-05,
128
+ "loss": 0.0501,
129
+ "step": 80
130
+ },
131
+ {
132
+ "epoch": 21.32,
133
+ "grad_norm": 0.23602119088172913,
134
+ "learning_rate": 9.828178694158075e-05,
135
+ "loss": 0.0443,
136
+ "step": 85
137
+ },
138
+ {
139
+ "epoch": 22.64,
140
+ "grad_norm": 0.2811133861541748,
141
+ "learning_rate": 9.799541809851088e-05,
142
+ "loss": 0.0448,
143
+ "step": 90
144
+ },
145
+ {
146
+ "epoch": 23.96,
147
+ "grad_norm": 0.29150959849357605,
148
+ "learning_rate": 9.7709049255441e-05,
149
+ "loss": 0.0426,
150
+ "step": 95
151
+ },
152
+ {
153
+ "epoch": 25.0,
154
+ "grad_norm": 1.4590457677841187,
155
+ "learning_rate": 9.742268041237114e-05,
156
+ "loss": 0.04,
157
+ "step": 100
158
+ },
159
+ {
160
+ "epoch": 26.32,
161
+ "grad_norm": 0.15335209667682648,
162
+ "learning_rate": 9.713631156930127e-05,
163
+ "loss": 0.0374,
164
+ "step": 105
165
+ },
166
+ {
167
+ "epoch": 27.64,
168
+ "grad_norm": 0.3241201639175415,
169
+ "learning_rate": 9.68499427262314e-05,
170
+ "loss": 0.0378,
171
+ "step": 110
172
+ },
173
+ {
174
+ "epoch": 28.96,
175
+ "grad_norm": 0.18619631230831146,
176
+ "learning_rate": 9.656357388316152e-05,
177
+ "loss": 0.0374,
178
+ "step": 115
179
+ },
180
+ {
181
+ "epoch": 30.0,
182
+ "grad_norm": 0.4512801170349121,
183
+ "learning_rate": 9.627720504009165e-05,
184
+ "loss": 0.0342,
185
+ "step": 120
186
+ },
187
+ {
188
+ "epoch": 31.32,
189
+ "grad_norm": 0.21706914901733398,
190
+ "learning_rate": 9.599083619702178e-05,
191
+ "loss": 0.0369,
192
+ "step": 125
193
+ },
194
+ {
195
+ "epoch": 32.64,
196
+ "grad_norm": 0.42762166261672974,
197
+ "learning_rate": 9.57044673539519e-05,
198
+ "loss": 0.0355,
199
+ "step": 130
200
+ },
201
+ {
202
+ "epoch": 33.96,
203
+ "grad_norm": 0.1793977916240692,
204
+ "learning_rate": 9.541809851088203e-05,
205
+ "loss": 0.0347,
206
+ "step": 135
207
+ },
208
+ {
209
+ "epoch": 35.0,
210
+ "grad_norm": 1.866305947303772,
211
+ "learning_rate": 9.513172966781214e-05,
212
+ "loss": 0.0368,
213
+ "step": 140
214
+ },
215
+ {
216
+ "epoch": 36.32,
217
+ "grad_norm": 0.09879657626152039,
218
+ "learning_rate": 9.484536082474227e-05,
219
+ "loss": 0.0347,
220
+ "step": 145
221
+ },
222
+ {
223
+ "epoch": 37.64,
224
+ "grad_norm": 0.09229481220245361,
225
+ "learning_rate": 9.45589919816724e-05,
226
+ "loss": 0.0338,
227
+ "step": 150
228
+ },
229
+ {
230
+ "epoch": 38.96,
231
+ "grad_norm": 0.11409584432840347,
232
+ "learning_rate": 9.427262313860252e-05,
233
+ "loss": 0.0339,
234
+ "step": 155
235
+ },
236
+ {
237
+ "epoch": 40.0,
238
+ "grad_norm": 0.35678204894065857,
239
+ "learning_rate": 9.398625429553265e-05,
240
+ "loss": 0.0347,
241
+ "step": 160
242
+ },
243
+ {
244
+ "epoch": 41.32,
245
+ "grad_norm": 0.08212767541408539,
246
+ "learning_rate": 9.369988545246277e-05,
247
+ "loss": 0.0345,
248
+ "step": 165
249
+ },
250
+ {
251
+ "epoch": 42.64,
252
+ "grad_norm": 0.07271627336740494,
253
+ "learning_rate": 9.34135166093929e-05,
254
+ "loss": 0.0305,
255
+ "step": 170
256
+ },
257
+ {
258
+ "epoch": 43.96,
259
+ "grad_norm": 0.23211534321308136,
260
+ "learning_rate": 9.312714776632303e-05,
261
+ "loss": 0.0321,
262
+ "step": 175
263
+ },
264
+ {
265
+ "epoch": 45.0,
266
+ "grad_norm": 0.23425568640232086,
267
+ "learning_rate": 9.284077892325315e-05,
268
+ "loss": 0.0334,
269
+ "step": 180
270
+ },
271
+ {
272
+ "epoch": 46.32,
273
+ "grad_norm": 0.07825004309415817,
274
+ "learning_rate": 9.255441008018328e-05,
275
+ "loss": 0.0349,
276
+ "step": 185
277
+ },
278
+ {
279
+ "epoch": 47.64,
280
+ "grad_norm": 0.06621824949979782,
281
+ "learning_rate": 9.22680412371134e-05,
282
+ "loss": 0.0302,
283
+ "step": 190
284
+ },
285
+ {
286
+ "epoch": 48.96,
287
+ "grad_norm": 0.0967830941081047,
288
+ "learning_rate": 9.198167239404353e-05,
289
+ "loss": 0.0316,
290
+ "step": 195
291
+ },
292
+ {
293
+ "epoch": 50.0,
294
+ "grad_norm": 0.39718347787857056,
295
+ "learning_rate": 9.169530355097366e-05,
296
+ "loss": 0.0307,
297
+ "step": 200
298
+ },
299
+ {
300
+ "epoch": 51.32,
301
+ "grad_norm": 0.06881817430257797,
302
+ "learning_rate": 9.140893470790379e-05,
303
+ "loss": 0.0291,
304
+ "step": 205
305
+ },
306
+ {
307
+ "epoch": 52.64,
308
+ "grad_norm": 0.07241260260343552,
309
+ "learning_rate": 9.112256586483391e-05,
310
+ "loss": 0.032,
311
+ "step": 210
312
+ },
313
+ {
314
+ "epoch": 53.96,
315
+ "grad_norm": 0.08191649615764618,
316
+ "learning_rate": 9.083619702176404e-05,
317
+ "loss": 0.0293,
318
+ "step": 215
319
+ },
320
+ {
321
+ "epoch": 55.0,
322
+ "grad_norm": 0.20381148159503937,
323
+ "learning_rate": 9.054982817869416e-05,
324
+ "loss": 0.033,
325
+ "step": 220
326
+ },
327
+ {
328
+ "epoch": 56.32,
329
+ "grad_norm": 0.0765785425901413,
330
+ "learning_rate": 9.026345933562429e-05,
331
+ "loss": 0.0323,
332
+ "step": 225
333
+ },
334
+ {
335
+ "epoch": 57.64,
336
+ "grad_norm": 0.0698801577091217,
337
+ "learning_rate": 8.997709049255442e-05,
338
+ "loss": 0.0324,
339
+ "step": 230
340
+ },
341
+ {
342
+ "epoch": 58.96,
343
+ "grad_norm": 0.08089473098516464,
344
+ "learning_rate": 8.969072164948454e-05,
345
+ "loss": 0.0314,
346
+ "step": 235
347
+ },
348
+ {
349
+ "epoch": 60.0,
350
+ "grad_norm": 0.22270062565803528,
351
+ "learning_rate": 8.940435280641467e-05,
352
+ "loss": 0.0303,
353
+ "step": 240
354
+ },
355
+ {
356
+ "epoch": 61.32,
357
+ "grad_norm": 0.07712433487176895,
358
+ "learning_rate": 8.91179839633448e-05,
359
+ "loss": 0.0324,
360
+ "step": 245
361
+ },
362
+ {
363
+ "epoch": 62.64,
364
+ "grad_norm": 0.05860769376158714,
365
+ "learning_rate": 8.883161512027491e-05,
366
+ "loss": 0.0321,
367
+ "step": 250
368
+ },
369
+ {
370
+ "epoch": 63.96,
371
+ "grad_norm": 0.05999445170164108,
372
+ "learning_rate": 8.854524627720504e-05,
373
+ "loss": 0.0315,
374
+ "step": 255
375
+ },
376
+ {
377
+ "epoch": 65.0,
378
+ "grad_norm": 0.20564565062522888,
379
+ "learning_rate": 8.825887743413516e-05,
380
+ "loss": 0.0364,
381
+ "step": 260
382
+ },
383
+ {
384
+ "epoch": 66.32,
385
+ "grad_norm": 0.0610821433365345,
386
+ "learning_rate": 8.797250859106529e-05,
387
+ "loss": 0.0311,
388
+ "step": 265
389
+ },
390
+ {
391
+ "epoch": 67.64,
392
+ "grad_norm": 0.05693706497550011,
393
+ "learning_rate": 8.768613974799542e-05,
394
+ "loss": 0.0294,
395
+ "step": 270
396
+ },
397
+ {
398
+ "epoch": 68.96,
399
+ "grad_norm": 0.06817185133695602,
400
+ "learning_rate": 8.739977090492554e-05,
401
+ "loss": 0.0284,
402
+ "step": 275
403
+ },
404
+ {
405
+ "epoch": 70.0,
406
+ "grad_norm": 0.17458151280879974,
407
+ "learning_rate": 8.711340206185567e-05,
408
+ "loss": 0.0291,
409
+ "step": 280
410
+ },
411
+ {
412
+ "epoch": 71.32,
413
+ "grad_norm": 0.07353579252958298,
414
+ "learning_rate": 8.682703321878581e-05,
415
+ "loss": 0.0324,
416
+ "step": 285
417
+ },
418
+ {
419
+ "epoch": 72.64,
420
+ "grad_norm": 0.061573781073093414,
421
+ "learning_rate": 8.654066437571594e-05,
422
+ "loss": 0.0305,
423
+ "step": 290
424
+ },
425
+ {
426
+ "epoch": 73.96,
427
+ "grad_norm": 0.07544506341218948,
428
+ "learning_rate": 8.625429553264606e-05,
429
+ "loss": 0.0294,
430
+ "step": 295
431
+ },
432
+ {
433
+ "epoch": 75.0,
434
+ "grad_norm": 0.11100324243307114,
435
+ "learning_rate": 8.596792668957619e-05,
436
+ "loss": 0.03,
437
+ "step": 300
438
+ }
439
+ ],
440
+ "logging_steps": 5,
441
+ "max_steps": 1800,
442
+ "num_input_tokens_seen": 0,
443
+ "num_train_epochs": 450,
444
+ "save_steps": 300,
445
+ "stateful_callbacks": {
446
+ "TrainerControl": {
447
+ "args": {
448
+ "should_epoch_stop": false,
449
+ "should_evaluate": false,
450
+ "should_log": false,
451
+ "should_save": true,
452
+ "should_training_stop": false
453
+ },
454
+ "attributes": {}
455
+ }
456
+ },
457
+ "total_flos": 2.551356850176e+17,
458
+ "train_batch_size": 2,
459
+ "trial_name": null,
460
+ "trial_params": null
461
+ }
Mu-Math/group_01/checkpoints/checkpoint-300/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Mu-Math/group_01/checkpoints/checkpoint-600/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-1.5B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** Qwen/Qwen2.5-1.5B-Instruct
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
Mu-Math/group_01/checkpoints/checkpoint-600/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/\u4e0d\u51bb\u7ed3Qwen\u8bad\u7ec3/models/Qwen2.5-1.5B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 128,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "down_proj",
24
+ "up_proj",
25
+ "gate_proj",
26
+ "q_proj",
27
+ "o_proj",
28
+ "v_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
Mu-Math/group_01/checkpoints/checkpoint-600/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
Mu-Math/group_01/checkpoints/checkpoint-600/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
Mu-Math/group_01/checkpoints/checkpoint-600/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
Mu-Math/group_01/checkpoints/checkpoint-600/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
Mu-Math/group_01/checkpoints/checkpoint-600/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
Mu-Math/group_01/checkpoints/checkpoint-600/trainer_state.json ADDED
@@ -0,0 +1,881 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 150.0,
6
+ "eval_steps": 500,
7
+ "global_step": 600,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.32,
14
+ "grad_norm": 11.867908477783203,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.9204,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 1.32,
21
+ "grad_norm": 7.492858409881592,
22
+ "learning_rate": 7.4074074074074075e-06,
23
+ "loss": 1.8831,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 2.64,
28
+ "grad_norm": 3.9426615238189697,
29
+ "learning_rate": 1.6666666666666667e-05,
30
+ "loss": 1.6453,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 3.96,
35
+ "grad_norm": 1.769984483718872,
36
+ "learning_rate": 2.5925925925925925e-05,
37
+ "loss": 1.2506,
38
+ "step": 15
39
+ },
40
+ {
41
+ "epoch": 5.0,
42
+ "grad_norm": 1.108256220817566,
43
+ "learning_rate": 3.518518518518519e-05,
44
+ "loss": 1.0012,
45
+ "step": 20
46
+ },
47
+ {
48
+ "epoch": 6.32,
49
+ "grad_norm": 0.5219796299934387,
50
+ "learning_rate": 4.4444444444444447e-05,
51
+ "loss": 0.8034,
52
+ "step": 25
53
+ },
54
+ {
55
+ "epoch": 7.64,
56
+ "grad_norm": 0.6449305415153503,
57
+ "learning_rate": 5.370370370370371e-05,
58
+ "loss": 0.6539,
59
+ "step": 30
60
+ },
61
+ {
62
+ "epoch": 8.96,
63
+ "grad_norm": 0.580233633518219,
64
+ "learning_rate": 6.296296296296296e-05,
65
+ "loss": 0.5474,
66
+ "step": 35
67
+ },
68
+ {
69
+ "epoch": 10.0,
70
+ "grad_norm": 1.5570186376571655,
71
+ "learning_rate": 7.222222222222222e-05,
72
+ "loss": 0.4811,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 11.32,
77
+ "grad_norm": 0.5841688513755798,
78
+ "learning_rate": 8.148148148148148e-05,
79
+ "loss": 0.3477,
80
+ "step": 45
81
+ },
82
+ {
83
+ "epoch": 12.64,
84
+ "grad_norm": 0.7968279719352722,
85
+ "learning_rate": 9.074074074074075e-05,
86
+ "loss": 0.2089,
87
+ "step": 50
88
+ },
89
+ {
90
+ "epoch": 13.96,
91
+ "grad_norm": 0.8396451473236084,
92
+ "learning_rate": 0.0001,
93
+ "loss": 0.1357,
94
+ "step": 55
95
+ },
96
+ {
97
+ "epoch": 15.0,
98
+ "grad_norm": 2.7755286693573,
99
+ "learning_rate": 9.971363115693013e-05,
100
+ "loss": 0.1235,
101
+ "step": 60
102
+ },
103
+ {
104
+ "epoch": 16.32,
105
+ "grad_norm": 0.6953228116035461,
106
+ "learning_rate": 9.942726231386026e-05,
107
+ "loss": 0.0755,
108
+ "step": 65
109
+ },
110
+ {
111
+ "epoch": 17.64,
112
+ "grad_norm": 1.1248857975006104,
113
+ "learning_rate": 9.914089347079038e-05,
114
+ "loss": 0.0546,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 18.96,
119
+ "grad_norm": 0.5247378945350647,
120
+ "learning_rate": 9.885452462772051e-05,
121
+ "loss": 0.0602,
122
+ "step": 75
123
+ },
124
+ {
125
+ "epoch": 20.0,
126
+ "grad_norm": 1.827890157699585,
127
+ "learning_rate": 9.856815578465064e-05,
128
+ "loss": 0.0501,
129
+ "step": 80
130
+ },
131
+ {
132
+ "epoch": 21.32,
133
+ "grad_norm": 0.23602119088172913,
134
+ "learning_rate": 9.828178694158075e-05,
135
+ "loss": 0.0443,
136
+ "step": 85
137
+ },
138
+ {
139
+ "epoch": 22.64,
140
+ "grad_norm": 0.2811133861541748,
141
+ "learning_rate": 9.799541809851088e-05,
142
+ "loss": 0.0448,
143
+ "step": 90
144
+ },
145
+ {
146
+ "epoch": 23.96,
147
+ "grad_norm": 0.29150959849357605,
148
+ "learning_rate": 9.7709049255441e-05,
149
+ "loss": 0.0426,
150
+ "step": 95
151
+ },
152
+ {
153
+ "epoch": 25.0,
154
+ "grad_norm": 1.4590457677841187,
155
+ "learning_rate": 9.742268041237114e-05,
156
+ "loss": 0.04,
157
+ "step": 100
158
+ },
159
+ {
160
+ "epoch": 26.32,
161
+ "grad_norm": 0.15335209667682648,
162
+ "learning_rate": 9.713631156930127e-05,
163
+ "loss": 0.0374,
164
+ "step": 105
165
+ },
166
+ {
167
+ "epoch": 27.64,
168
+ "grad_norm": 0.3241201639175415,
169
+ "learning_rate": 9.68499427262314e-05,
170
+ "loss": 0.0378,
171
+ "step": 110
172
+ },
173
+ {
174
+ "epoch": 28.96,
175
+ "grad_norm": 0.18619631230831146,
176
+ "learning_rate": 9.656357388316152e-05,
177
+ "loss": 0.0374,
178
+ "step": 115
179
+ },
180
+ {
181
+ "epoch": 30.0,
182
+ "grad_norm": 0.4512801170349121,
183
+ "learning_rate": 9.627720504009165e-05,
184
+ "loss": 0.0342,
185
+ "step": 120
186
+ },
187
+ {
188
+ "epoch": 31.32,
189
+ "grad_norm": 0.21706914901733398,
190
+ "learning_rate": 9.599083619702178e-05,
191
+ "loss": 0.0369,
192
+ "step": 125
193
+ },
194
+ {
195
+ "epoch": 32.64,
196
+ "grad_norm": 0.42762166261672974,
197
+ "learning_rate": 9.57044673539519e-05,
198
+ "loss": 0.0355,
199
+ "step": 130
200
+ },
201
+ {
202
+ "epoch": 33.96,
203
+ "grad_norm": 0.1793977916240692,
204
+ "learning_rate": 9.541809851088203e-05,
205
+ "loss": 0.0347,
206
+ "step": 135
207
+ },
208
+ {
209
+ "epoch": 35.0,
210
+ "grad_norm": 1.866305947303772,
211
+ "learning_rate": 9.513172966781214e-05,
212
+ "loss": 0.0368,
213
+ "step": 140
214
+ },
215
+ {
216
+ "epoch": 36.32,
217
+ "grad_norm": 0.09879657626152039,
218
+ "learning_rate": 9.484536082474227e-05,
219
+ "loss": 0.0347,
220
+ "step": 145
221
+ },
222
+ {
223
+ "epoch": 37.64,
224
+ "grad_norm": 0.09229481220245361,
225
+ "learning_rate": 9.45589919816724e-05,
226
+ "loss": 0.0338,
227
+ "step": 150
228
+ },
229
+ {
230
+ "epoch": 38.96,
231
+ "grad_norm": 0.11409584432840347,
232
+ "learning_rate": 9.427262313860252e-05,
233
+ "loss": 0.0339,
234
+ "step": 155
235
+ },
236
+ {
237
+ "epoch": 40.0,
238
+ "grad_norm": 0.35678204894065857,
239
+ "learning_rate": 9.398625429553265e-05,
240
+ "loss": 0.0347,
241
+ "step": 160
242
+ },
243
+ {
244
+ "epoch": 41.32,
245
+ "grad_norm": 0.08212767541408539,
246
+ "learning_rate": 9.369988545246277e-05,
247
+ "loss": 0.0345,
248
+ "step": 165
249
+ },
250
+ {
251
+ "epoch": 42.64,
252
+ "grad_norm": 0.07271627336740494,
253
+ "learning_rate": 9.34135166093929e-05,
254
+ "loss": 0.0305,
255
+ "step": 170
256
+ },
257
+ {
258
+ "epoch": 43.96,
259
+ "grad_norm": 0.23211534321308136,
260
+ "learning_rate": 9.312714776632303e-05,
261
+ "loss": 0.0321,
262
+ "step": 175
263
+ },
264
+ {
265
+ "epoch": 45.0,
266
+ "grad_norm": 0.23425568640232086,
267
+ "learning_rate": 9.284077892325315e-05,
268
+ "loss": 0.0334,
269
+ "step": 180
270
+ },
271
+ {
272
+ "epoch": 46.32,
273
+ "grad_norm": 0.07825004309415817,
274
+ "learning_rate": 9.255441008018328e-05,
275
+ "loss": 0.0349,
276
+ "step": 185
277
+ },
278
+ {
279
+ "epoch": 47.64,
280
+ "grad_norm": 0.06621824949979782,
281
+ "learning_rate": 9.22680412371134e-05,
282
+ "loss": 0.0302,
283
+ "step": 190
284
+ },
285
+ {
286
+ "epoch": 48.96,
287
+ "grad_norm": 0.0967830941081047,
288
+ "learning_rate": 9.198167239404353e-05,
289
+ "loss": 0.0316,
290
+ "step": 195
291
+ },
292
+ {
293
+ "epoch": 50.0,
294
+ "grad_norm": 0.39718347787857056,
295
+ "learning_rate": 9.169530355097366e-05,
296
+ "loss": 0.0307,
297
+ "step": 200
298
+ },
299
+ {
300
+ "epoch": 51.32,
301
+ "grad_norm": 0.06881817430257797,
302
+ "learning_rate": 9.140893470790379e-05,
303
+ "loss": 0.0291,
304
+ "step": 205
305
+ },
306
+ {
307
+ "epoch": 52.64,
308
+ "grad_norm": 0.07241260260343552,
309
+ "learning_rate": 9.112256586483391e-05,
310
+ "loss": 0.032,
311
+ "step": 210
312
+ },
313
+ {
314
+ "epoch": 53.96,
315
+ "grad_norm": 0.08191649615764618,
316
+ "learning_rate": 9.083619702176404e-05,
317
+ "loss": 0.0293,
318
+ "step": 215
319
+ },
320
+ {
321
+ "epoch": 55.0,
322
+ "grad_norm": 0.20381148159503937,
323
+ "learning_rate": 9.054982817869416e-05,
324
+ "loss": 0.033,
325
+ "step": 220
326
+ },
327
+ {
328
+ "epoch": 56.32,
329
+ "grad_norm": 0.0765785425901413,
330
+ "learning_rate": 9.026345933562429e-05,
331
+ "loss": 0.0323,
332
+ "step": 225
333
+ },
334
+ {
335
+ "epoch": 57.64,
336
+ "grad_norm": 0.0698801577091217,
337
+ "learning_rate": 8.997709049255442e-05,
338
+ "loss": 0.0324,
339
+ "step": 230
340
+ },
341
+ {
342
+ "epoch": 58.96,
343
+ "grad_norm": 0.08089473098516464,
344
+ "learning_rate": 8.969072164948454e-05,
345
+ "loss": 0.0314,
346
+ "step": 235
347
+ },
348
+ {
349
+ "epoch": 60.0,
350
+ "grad_norm": 0.22270062565803528,
351
+ "learning_rate": 8.940435280641467e-05,
352
+ "loss": 0.0303,
353
+ "step": 240
354
+ },
355
+ {
356
+ "epoch": 61.32,
357
+ "grad_norm": 0.07712433487176895,
358
+ "learning_rate": 8.91179839633448e-05,
359
+ "loss": 0.0324,
360
+ "step": 245
361
+ },
362
+ {
363
+ "epoch": 62.64,
364
+ "grad_norm": 0.05860769376158714,
365
+ "learning_rate": 8.883161512027491e-05,
366
+ "loss": 0.0321,
367
+ "step": 250
368
+ },
369
+ {
370
+ "epoch": 63.96,
371
+ "grad_norm": 0.05999445170164108,
372
+ "learning_rate": 8.854524627720504e-05,
373
+ "loss": 0.0315,
374
+ "step": 255
375
+ },
376
+ {
377
+ "epoch": 65.0,
378
+ "grad_norm": 0.20564565062522888,
379
+ "learning_rate": 8.825887743413516e-05,
380
+ "loss": 0.0364,
381
+ "step": 260
382
+ },
383
+ {
384
+ "epoch": 66.32,
385
+ "grad_norm": 0.0610821433365345,
386
+ "learning_rate": 8.797250859106529e-05,
387
+ "loss": 0.0311,
388
+ "step": 265
389
+ },
390
+ {
391
+ "epoch": 67.64,
392
+ "grad_norm": 0.05693706497550011,
393
+ "learning_rate": 8.768613974799542e-05,
394
+ "loss": 0.0294,
395
+ "step": 270
396
+ },
397
+ {
398
+ "epoch": 68.96,
399
+ "grad_norm": 0.06817185133695602,
400
+ "learning_rate": 8.739977090492554e-05,
401
+ "loss": 0.0284,
402
+ "step": 275
403
+ },
404
+ {
405
+ "epoch": 70.0,
406
+ "grad_norm": 0.17458151280879974,
407
+ "learning_rate": 8.711340206185567e-05,
408
+ "loss": 0.0291,
409
+ "step": 280
410
+ },
411
+ {
412
+ "epoch": 71.32,
413
+ "grad_norm": 0.07353579252958298,
414
+ "learning_rate": 8.682703321878581e-05,
415
+ "loss": 0.0324,
416
+ "step": 285
417
+ },
418
+ {
419
+ "epoch": 72.64,
420
+ "grad_norm": 0.061573781073093414,
421
+ "learning_rate": 8.654066437571594e-05,
422
+ "loss": 0.0305,
423
+ "step": 290
424
+ },
425
+ {
426
+ "epoch": 73.96,
427
+ "grad_norm": 0.07544506341218948,
428
+ "learning_rate": 8.625429553264606e-05,
429
+ "loss": 0.0294,
430
+ "step": 295
431
+ },
432
+ {
433
+ "epoch": 75.0,
434
+ "grad_norm": 0.11100324243307114,
435
+ "learning_rate": 8.596792668957619e-05,
436
+ "loss": 0.03,
437
+ "step": 300
438
+ },
439
+ {
440
+ "epoch": 76.32,
441
+ "grad_norm": 0.0491141714155674,
442
+ "learning_rate": 8.56815578465063e-05,
443
+ "loss": 0.0296,
444
+ "step": 305
445
+ },
446
+ {
447
+ "epoch": 77.64,
448
+ "grad_norm": 0.07668624073266983,
449
+ "learning_rate": 8.539518900343643e-05,
450
+ "loss": 0.0345,
451
+ "step": 310
452
+ },
453
+ {
454
+ "epoch": 78.96,
455
+ "grad_norm": 0.07898231595754623,
456
+ "learning_rate": 8.510882016036655e-05,
457
+ "loss": 0.0306,
458
+ "step": 315
459
+ },
460
+ {
461
+ "epoch": 80.0,
462
+ "grad_norm": 0.3391458988189697,
463
+ "learning_rate": 8.482245131729668e-05,
464
+ "loss": 0.0334,
465
+ "step": 320
466
+ },
467
+ {
468
+ "epoch": 81.32,
469
+ "grad_norm": 0.05544694885611534,
470
+ "learning_rate": 8.453608247422681e-05,
471
+ "loss": 0.0282,
472
+ "step": 325
473
+ },
474
+ {
475
+ "epoch": 82.64,
476
+ "grad_norm": 0.05032579228281975,
477
+ "learning_rate": 8.424971363115693e-05,
478
+ "loss": 0.0323,
479
+ "step": 330
480
+ },
481
+ {
482
+ "epoch": 83.96,
483
+ "grad_norm": 0.05664476007223129,
484
+ "learning_rate": 8.396334478808706e-05,
485
+ "loss": 0.0295,
486
+ "step": 335
487
+ },
488
+ {
489
+ "epoch": 85.0,
490
+ "grad_norm": 0.24190960824489594,
491
+ "learning_rate": 8.367697594501719e-05,
492
+ "loss": 0.033,
493
+ "step": 340
494
+ },
495
+ {
496
+ "epoch": 86.32,
497
+ "grad_norm": 0.05068003758788109,
498
+ "learning_rate": 8.339060710194731e-05,
499
+ "loss": 0.0294,
500
+ "step": 345
501
+ },
502
+ {
503
+ "epoch": 87.64,
504
+ "grad_norm": 0.06719321757555008,
505
+ "learning_rate": 8.310423825887744e-05,
506
+ "loss": 0.0297,
507
+ "step": 350
508
+ },
509
+ {
510
+ "epoch": 88.96,
511
+ "grad_norm": 0.05750493332743645,
512
+ "learning_rate": 8.281786941580757e-05,
513
+ "loss": 0.0276,
514
+ "step": 355
515
+ },
516
+ {
517
+ "epoch": 90.0,
518
+ "grad_norm": 0.17318210005760193,
519
+ "learning_rate": 8.253150057273768e-05,
520
+ "loss": 0.0309,
521
+ "step": 360
522
+ },
523
+ {
524
+ "epoch": 91.32,
525
+ "grad_norm": 0.05168261379003525,
526
+ "learning_rate": 8.224513172966782e-05,
527
+ "loss": 0.0284,
528
+ "step": 365
529
+ },
530
+ {
531
+ "epoch": 92.64,
532
+ "grad_norm": 0.053040292114019394,
533
+ "learning_rate": 8.195876288659795e-05,
534
+ "loss": 0.0314,
535
+ "step": 370
536
+ },
537
+ {
538
+ "epoch": 93.96,
539
+ "grad_norm": 0.06162334978580475,
540
+ "learning_rate": 8.167239404352807e-05,
541
+ "loss": 0.0297,
542
+ "step": 375
543
+ },
544
+ {
545
+ "epoch": 95.0,
546
+ "grad_norm": 0.13474801182746887,
547
+ "learning_rate": 8.13860252004582e-05,
548
+ "loss": 0.0271,
549
+ "step": 380
550
+ },
551
+ {
552
+ "epoch": 96.32,
553
+ "grad_norm": 0.05177682265639305,
554
+ "learning_rate": 8.109965635738833e-05,
555
+ "loss": 0.0301,
556
+ "step": 385
557
+ },
558
+ {
559
+ "epoch": 97.64,
560
+ "grad_norm": 0.04276576265692711,
561
+ "learning_rate": 8.081328751431845e-05,
562
+ "loss": 0.0286,
563
+ "step": 390
564
+ },
565
+ {
566
+ "epoch": 98.96,
567
+ "grad_norm": 0.04698758199810982,
568
+ "learning_rate": 8.052691867124858e-05,
569
+ "loss": 0.0284,
570
+ "step": 395
571
+ },
572
+ {
573
+ "epoch": 100.0,
574
+ "grad_norm": 0.14094208180904388,
575
+ "learning_rate": 8.02405498281787e-05,
576
+ "loss": 0.0302,
577
+ "step": 400
578
+ },
579
+ {
580
+ "epoch": 101.32,
581
+ "grad_norm": 0.0528222993016243,
582
+ "learning_rate": 7.995418098510883e-05,
583
+ "loss": 0.0304,
584
+ "step": 405
585
+ },
586
+ {
587
+ "epoch": 102.64,
588
+ "grad_norm": 0.053034182637929916,
589
+ "learning_rate": 7.966781214203894e-05,
590
+ "loss": 0.0316,
591
+ "step": 410
592
+ },
593
+ {
594
+ "epoch": 103.96,
595
+ "grad_norm": 0.05732697248458862,
596
+ "learning_rate": 7.938144329896907e-05,
597
+ "loss": 0.0295,
598
+ "step": 415
599
+ },
600
+ {
601
+ "epoch": 105.0,
602
+ "grad_norm": 0.17511749267578125,
603
+ "learning_rate": 7.90950744558992e-05,
604
+ "loss": 0.0317,
605
+ "step": 420
606
+ },
607
+ {
608
+ "epoch": 106.32,
609
+ "grad_norm": 0.04588017240166664,
610
+ "learning_rate": 7.880870561282932e-05,
611
+ "loss": 0.0305,
612
+ "step": 425
613
+ },
614
+ {
615
+ "epoch": 107.64,
616
+ "grad_norm": 0.049282800406217575,
617
+ "learning_rate": 7.852233676975945e-05,
618
+ "loss": 0.031,
619
+ "step": 430
620
+ },
621
+ {
622
+ "epoch": 108.96,
623
+ "grad_norm": 0.04937691241502762,
624
+ "learning_rate": 7.823596792668958e-05,
625
+ "loss": 0.0278,
626
+ "step": 435
627
+ },
628
+ {
629
+ "epoch": 110.0,
630
+ "grad_norm": 0.11863432824611664,
631
+ "learning_rate": 7.79495990836197e-05,
632
+ "loss": 0.0316,
633
+ "step": 440
634
+ },
635
+ {
636
+ "epoch": 111.32,
637
+ "grad_norm": 0.04387475177645683,
638
+ "learning_rate": 7.766323024054983e-05,
639
+ "loss": 0.0283,
640
+ "step": 445
641
+ },
642
+ {
643
+ "epoch": 112.64,
644
+ "grad_norm": 0.04409867897629738,
645
+ "learning_rate": 7.737686139747996e-05,
646
+ "loss": 0.0306,
647
+ "step": 450
648
+ },
649
+ {
650
+ "epoch": 113.96,
651
+ "grad_norm": 0.04834749549627304,
652
+ "learning_rate": 7.709049255441008e-05,
653
+ "loss": 0.0302,
654
+ "step": 455
655
+ },
656
+ {
657
+ "epoch": 115.0,
658
+ "grad_norm": 0.1553424447774887,
659
+ "learning_rate": 7.680412371134021e-05,
660
+ "loss": 0.0326,
661
+ "step": 460
662
+ },
663
+ {
664
+ "epoch": 116.32,
665
+ "grad_norm": 0.05963806435465813,
666
+ "learning_rate": 7.651775486827034e-05,
667
+ "loss": 0.0291,
668
+ "step": 465
669
+ },
670
+ {
671
+ "epoch": 117.64,
672
+ "grad_norm": 0.04697559028863907,
673
+ "learning_rate": 7.623138602520046e-05,
674
+ "loss": 0.027,
675
+ "step": 470
676
+ },
677
+ {
678
+ "epoch": 118.96,
679
+ "grad_norm": 0.04225379601120949,
680
+ "learning_rate": 7.594501718213059e-05,
681
+ "loss": 0.0343,
682
+ "step": 475
683
+ },
684
+ {
685
+ "epoch": 120.0,
686
+ "grad_norm": 0.1076933965086937,
687
+ "learning_rate": 7.565864833906071e-05,
688
+ "loss": 0.0288,
689
+ "step": 480
690
+ },
691
+ {
692
+ "epoch": 121.32,
693
+ "grad_norm": 0.04540383443236351,
694
+ "learning_rate": 7.537227949599084e-05,
695
+ "loss": 0.0291,
696
+ "step": 485
697
+ },
698
+ {
699
+ "epoch": 122.64,
700
+ "grad_norm": 0.05459335818886757,
701
+ "learning_rate": 7.508591065292097e-05,
702
+ "loss": 0.0289,
703
+ "step": 490
704
+ },
705
+ {
706
+ "epoch": 123.96,
707
+ "grad_norm": 0.05171333625912666,
708
+ "learning_rate": 7.47995418098511e-05,
709
+ "loss": 0.0284,
710
+ "step": 495
711
+ },
712
+ {
713
+ "epoch": 125.0,
714
+ "grad_norm": 0.08606769144535065,
715
+ "learning_rate": 7.451317296678122e-05,
716
+ "loss": 0.0314,
717
+ "step": 500
718
+ },
719
+ {
720
+ "epoch": 126.32,
721
+ "grad_norm": 0.040535662323236465,
722
+ "learning_rate": 7.422680412371135e-05,
723
+ "loss": 0.028,
724
+ "step": 505
725
+ },
726
+ {
727
+ "epoch": 127.64,
728
+ "grad_norm": 0.04621696099638939,
729
+ "learning_rate": 7.394043528064147e-05,
730
+ "loss": 0.0281,
731
+ "step": 510
732
+ },
733
+ {
734
+ "epoch": 128.96,
735
+ "grad_norm": 0.04407593980431557,
736
+ "learning_rate": 7.36540664375716e-05,
737
+ "loss": 0.0309,
738
+ "step": 515
739
+ },
740
+ {
741
+ "epoch": 130.0,
742
+ "grad_norm": 0.24090737104415894,
743
+ "learning_rate": 7.336769759450171e-05,
744
+ "loss": 0.0302,
745
+ "step": 520
746
+ },
747
+ {
748
+ "epoch": 131.32,
749
+ "grad_norm": 0.051712971180677414,
750
+ "learning_rate": 7.308132875143184e-05,
751
+ "loss": 0.0305,
752
+ "step": 525
753
+ },
754
+ {
755
+ "epoch": 132.64,
756
+ "grad_norm": 0.0373610258102417,
757
+ "learning_rate": 7.279495990836197e-05,
758
+ "loss": 0.0283,
759
+ "step": 530
760
+ },
761
+ {
762
+ "epoch": 133.96,
763
+ "grad_norm": 0.04424213245511055,
764
+ "learning_rate": 7.250859106529209e-05,
765
+ "loss": 0.0317,
766
+ "step": 535
767
+ },
768
+ {
769
+ "epoch": 135.0,
770
+ "grad_norm": 0.09113436192274094,
771
+ "learning_rate": 7.222222222222222e-05,
772
+ "loss": 0.0302,
773
+ "step": 540
774
+ },
775
+ {
776
+ "epoch": 136.32,
777
+ "grad_norm": 0.03745009005069733,
778
+ "learning_rate": 7.193585337915235e-05,
779
+ "loss": 0.0315,
780
+ "step": 545
781
+ },
782
+ {
783
+ "epoch": 137.64,
784
+ "grad_norm": 0.04058730602264404,
785
+ "learning_rate": 7.164948453608247e-05,
786
+ "loss": 0.0312,
787
+ "step": 550
788
+ },
789
+ {
790
+ "epoch": 138.96,
791
+ "grad_norm": 0.046279069036245346,
792
+ "learning_rate": 7.136311569301261e-05,
793
+ "loss": 0.0295,
794
+ "step": 555
795
+ },
796
+ {
797
+ "epoch": 140.0,
798
+ "grad_norm": 0.17239141464233398,
799
+ "learning_rate": 7.107674684994274e-05,
800
+ "loss": 0.0307,
801
+ "step": 560
802
+ },
803
+ {
804
+ "epoch": 141.32,
805
+ "grad_norm": 0.036460030823946,
806
+ "learning_rate": 7.079037800687286e-05,
807
+ "loss": 0.0284,
808
+ "step": 565
809
+ },
810
+ {
811
+ "epoch": 142.64,
812
+ "grad_norm": 0.03434258699417114,
813
+ "learning_rate": 7.050400916380299e-05,
814
+ "loss": 0.0283,
815
+ "step": 570
816
+ },
817
+ {
818
+ "epoch": 143.96,
819
+ "grad_norm": 0.0470467284321785,
820
+ "learning_rate": 7.02176403207331e-05,
821
+ "loss": 0.0296,
822
+ "step": 575
823
+ },
824
+ {
825
+ "epoch": 145.0,
826
+ "grad_norm": 0.07163394242525101,
827
+ "learning_rate": 6.993127147766323e-05,
828
+ "loss": 0.0256,
829
+ "step": 580
830
+ },
831
+ {
832
+ "epoch": 146.32,
833
+ "grad_norm": 0.042208388447761536,
834
+ "learning_rate": 6.964490263459336e-05,
835
+ "loss": 0.0298,
836
+ "step": 585
837
+ },
838
+ {
839
+ "epoch": 147.64,
840
+ "grad_norm": 0.04421050846576691,
841
+ "learning_rate": 6.935853379152348e-05,
842
+ "loss": 0.0274,
843
+ "step": 590
844
+ },
845
+ {
846
+ "epoch": 148.96,
847
+ "grad_norm": 0.047223106026649475,
848
+ "learning_rate": 6.907216494845361e-05,
849
+ "loss": 0.0311,
850
+ "step": 595
851
+ },
852
+ {
853
+ "epoch": 150.0,
854
+ "grad_norm": 0.1724609136581421,
855
+ "learning_rate": 6.878579610538374e-05,
856
+ "loss": 0.0302,
857
+ "step": 600
858
+ }
859
+ ],
860
+ "logging_steps": 5,
861
+ "max_steps": 1800,
862
+ "num_input_tokens_seen": 0,
863
+ "num_train_epochs": 450,
864
+ "save_steps": 300,
865
+ "stateful_callbacks": {
866
+ "TrainerControl": {
867
+ "args": {
868
+ "should_epoch_stop": false,
869
+ "should_evaluate": false,
870
+ "should_log": false,
871
+ "should_save": true,
872
+ "should_training_stop": false
873
+ },
874
+ "attributes": {}
875
+ }
876
+ },
877
+ "total_flos": 5.102713700352e+17,
878
+ "train_batch_size": 2,
879
+ "trial_name": null,
880
+ "trial_params": null
881
+ }
Mu-Math/group_01/checkpoints/checkpoint-600/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Mu-Math/group_01/metadata.json ADDED
@@ -0,0 +1,2718 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_name": "Mu-Math",
3
+ "group_index": 1,
4
+ "prompt_group_file": "/hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/自己训练lora/train_lora/prompt_groups/Mu-Math/group_01.json",
5
+ "output_dir": "/hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/自己训练lora/train_lora/outputs/Mu-Math/group_01",
6
+ "checkpoint_root": "/hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/自己训练lora/train_lora/outputs/Mu-Math/group_01/checkpoints",
7
+ "generated_at": "2025-11-06T12:13:11Z",
8
+ "train_loss": 0.054099425789382725,
9
+ "metrics": {
10
+ "train_runtime": 31004.9647,
11
+ "train_samples_per_second": 1.858,
12
+ "train_steps_per_second": 0.058,
13
+ "total_flos": 1.5308141101056e+18,
14
+ "train_loss": 0.054099425789382725,
15
+ "epoch": 450.0
16
+ },
17
+ "trainer_state": [
18
+ {
19
+ "loss": 1.9204,
20
+ "grad_norm": 11.867908477783203,
21
+ "learning_rate": 0.0,
22
+ "epoch": 0.32,
23
+ "step": 1
24
+ },
25
+ {
26
+ "loss": 1.8831,
27
+ "grad_norm": 7.492858409881592,
28
+ "learning_rate": 7.4074074074074075e-06,
29
+ "epoch": 1.32,
30
+ "step": 5
31
+ },
32
+ {
33
+ "loss": 1.6453,
34
+ "grad_norm": 3.9426615238189697,
35
+ "learning_rate": 1.6666666666666667e-05,
36
+ "epoch": 2.64,
37
+ "step": 10
38
+ },
39
+ {
40
+ "loss": 1.2506,
41
+ "grad_norm": 1.769984483718872,
42
+ "learning_rate": 2.5925925925925925e-05,
43
+ "epoch": 3.96,
44
+ "step": 15
45
+ },
46
+ {
47
+ "loss": 1.0012,
48
+ "grad_norm": 1.108256220817566,
49
+ "learning_rate": 3.518518518518519e-05,
50
+ "epoch": 5.0,
51
+ "step": 20
52
+ },
53
+ {
54
+ "loss": 0.8034,
55
+ "grad_norm": 0.5219796299934387,
56
+ "learning_rate": 4.4444444444444447e-05,
57
+ "epoch": 6.32,
58
+ "step": 25
59
+ },
60
+ {
61
+ "loss": 0.6539,
62
+ "grad_norm": 0.6449305415153503,
63
+ "learning_rate": 5.370370370370371e-05,
64
+ "epoch": 7.64,
65
+ "step": 30
66
+ },
67
+ {
68
+ "loss": 0.5474,
69
+ "grad_norm": 0.580233633518219,
70
+ "learning_rate": 6.296296296296296e-05,
71
+ "epoch": 8.96,
72
+ "step": 35
73
+ },
74
+ {
75
+ "loss": 0.4811,
76
+ "grad_norm": 1.5570186376571655,
77
+ "learning_rate": 7.222222222222222e-05,
78
+ "epoch": 10.0,
79
+ "step": 40
80
+ },
81
+ {
82
+ "loss": 0.3477,
83
+ "grad_norm": 0.5841688513755798,
84
+ "learning_rate": 8.148148148148148e-05,
85
+ "epoch": 11.32,
86
+ "step": 45
87
+ },
88
+ {
89
+ "loss": 0.2089,
90
+ "grad_norm": 0.7968279719352722,
91
+ "learning_rate": 9.074074074074075e-05,
92
+ "epoch": 12.64,
93
+ "step": 50
94
+ },
95
+ {
96
+ "loss": 0.1357,
97
+ "grad_norm": 0.8396451473236084,
98
+ "learning_rate": 0.0001,
99
+ "epoch": 13.96,
100
+ "step": 55
101
+ },
102
+ {
103
+ "loss": 0.1235,
104
+ "grad_norm": 2.7755286693573,
105
+ "learning_rate": 9.971363115693013e-05,
106
+ "epoch": 15.0,
107
+ "step": 60
108
+ },
109
+ {
110
+ "loss": 0.0755,
111
+ "grad_norm": 0.6953228116035461,
112
+ "learning_rate": 9.942726231386026e-05,
113
+ "epoch": 16.32,
114
+ "step": 65
115
+ },
116
+ {
117
+ "loss": 0.0546,
118
+ "grad_norm": 1.1248857975006104,
119
+ "learning_rate": 9.914089347079038e-05,
120
+ "epoch": 17.64,
121
+ "step": 70
122
+ },
123
+ {
124
+ "loss": 0.0602,
125
+ "grad_norm": 0.5247378945350647,
126
+ "learning_rate": 9.885452462772051e-05,
127
+ "epoch": 18.96,
128
+ "step": 75
129
+ },
130
+ {
131
+ "loss": 0.0501,
132
+ "grad_norm": 1.827890157699585,
133
+ "learning_rate": 9.856815578465064e-05,
134
+ "epoch": 20.0,
135
+ "step": 80
136
+ },
137
+ {
138
+ "loss": 0.0443,
139
+ "grad_norm": 0.23602119088172913,
140
+ "learning_rate": 9.828178694158075e-05,
141
+ "epoch": 21.32,
142
+ "step": 85
143
+ },
144
+ {
145
+ "loss": 0.0448,
146
+ "grad_norm": 0.2811133861541748,
147
+ "learning_rate": 9.799541809851088e-05,
148
+ "epoch": 22.64,
149
+ "step": 90
150
+ },
151
+ {
152
+ "loss": 0.0426,
153
+ "grad_norm": 0.29150959849357605,
154
+ "learning_rate": 9.7709049255441e-05,
155
+ "epoch": 23.96,
156
+ "step": 95
157
+ },
158
+ {
159
+ "loss": 0.04,
160
+ "grad_norm": 1.4590457677841187,
161
+ "learning_rate": 9.742268041237114e-05,
162
+ "epoch": 25.0,
163
+ "step": 100
164
+ },
165
+ {
166
+ "loss": 0.0374,
167
+ "grad_norm": 0.15335209667682648,
168
+ "learning_rate": 9.713631156930127e-05,
169
+ "epoch": 26.32,
170
+ "step": 105
171
+ },
172
+ {
173
+ "loss": 0.0378,
174
+ "grad_norm": 0.3241201639175415,
175
+ "learning_rate": 9.68499427262314e-05,
176
+ "epoch": 27.64,
177
+ "step": 110
178
+ },
179
+ {
180
+ "loss": 0.0374,
181
+ "grad_norm": 0.18619631230831146,
182
+ "learning_rate": 9.656357388316152e-05,
183
+ "epoch": 28.96,
184
+ "step": 115
185
+ },
186
+ {
187
+ "loss": 0.0342,
188
+ "grad_norm": 0.4512801170349121,
189
+ "learning_rate": 9.627720504009165e-05,
190
+ "epoch": 30.0,
191
+ "step": 120
192
+ },
193
+ {
194
+ "loss": 0.0369,
195
+ "grad_norm": 0.21706914901733398,
196
+ "learning_rate": 9.599083619702178e-05,
197
+ "epoch": 31.32,
198
+ "step": 125
199
+ },
200
+ {
201
+ "loss": 0.0355,
202
+ "grad_norm": 0.42762166261672974,
203
+ "learning_rate": 9.57044673539519e-05,
204
+ "epoch": 32.64,
205
+ "step": 130
206
+ },
207
+ {
208
+ "loss": 0.0347,
209
+ "grad_norm": 0.1793977916240692,
210
+ "learning_rate": 9.541809851088203e-05,
211
+ "epoch": 33.96,
212
+ "step": 135
213
+ },
214
+ {
215
+ "loss": 0.0368,
216
+ "grad_norm": 1.866305947303772,
217
+ "learning_rate": 9.513172966781214e-05,
218
+ "epoch": 35.0,
219
+ "step": 140
220
+ },
221
+ {
222
+ "loss": 0.0347,
223
+ "grad_norm": 0.09879657626152039,
224
+ "learning_rate": 9.484536082474227e-05,
225
+ "epoch": 36.32,
226
+ "step": 145
227
+ },
228
+ {
229
+ "loss": 0.0338,
230
+ "grad_norm": 0.09229481220245361,
231
+ "learning_rate": 9.45589919816724e-05,
232
+ "epoch": 37.64,
233
+ "step": 150
234
+ },
235
+ {
236
+ "loss": 0.0339,
237
+ "grad_norm": 0.11409584432840347,
238
+ "learning_rate": 9.427262313860252e-05,
239
+ "epoch": 38.96,
240
+ "step": 155
241
+ },
242
+ {
243
+ "loss": 0.0347,
244
+ "grad_norm": 0.35678204894065857,
245
+ "learning_rate": 9.398625429553265e-05,
246
+ "epoch": 40.0,
247
+ "step": 160
248
+ },
249
+ {
250
+ "loss": 0.0345,
251
+ "grad_norm": 0.08212767541408539,
252
+ "learning_rate": 9.369988545246277e-05,
253
+ "epoch": 41.32,
254
+ "step": 165
255
+ },
256
+ {
257
+ "loss": 0.0305,
258
+ "grad_norm": 0.07271627336740494,
259
+ "learning_rate": 9.34135166093929e-05,
260
+ "epoch": 42.64,
261
+ "step": 170
262
+ },
263
+ {
264
+ "loss": 0.0321,
265
+ "grad_norm": 0.23211534321308136,
266
+ "learning_rate": 9.312714776632303e-05,
267
+ "epoch": 43.96,
268
+ "step": 175
269
+ },
270
+ {
271
+ "loss": 0.0334,
272
+ "grad_norm": 0.23425568640232086,
273
+ "learning_rate": 9.284077892325315e-05,
274
+ "epoch": 45.0,
275
+ "step": 180
276
+ },
277
+ {
278
+ "loss": 0.0349,
279
+ "grad_norm": 0.07825004309415817,
280
+ "learning_rate": 9.255441008018328e-05,
281
+ "epoch": 46.32,
282
+ "step": 185
283
+ },
284
+ {
285
+ "loss": 0.0302,
286
+ "grad_norm": 0.06621824949979782,
287
+ "learning_rate": 9.22680412371134e-05,
288
+ "epoch": 47.64,
289
+ "step": 190
290
+ },
291
+ {
292
+ "loss": 0.0316,
293
+ "grad_norm": 0.0967830941081047,
294
+ "learning_rate": 9.198167239404353e-05,
295
+ "epoch": 48.96,
296
+ "step": 195
297
+ },
298
+ {
299
+ "loss": 0.0307,
300
+ "grad_norm": 0.39718347787857056,
301
+ "learning_rate": 9.169530355097366e-05,
302
+ "epoch": 50.0,
303
+ "step": 200
304
+ },
305
+ {
306
+ "loss": 0.0291,
307
+ "grad_norm": 0.06881817430257797,
308
+ "learning_rate": 9.140893470790379e-05,
309
+ "epoch": 51.32,
310
+ "step": 205
311
+ },
312
+ {
313
+ "loss": 0.032,
314
+ "grad_norm": 0.07241260260343552,
315
+ "learning_rate": 9.112256586483391e-05,
316
+ "epoch": 52.64,
317
+ "step": 210
318
+ },
319
+ {
320
+ "loss": 0.0293,
321
+ "grad_norm": 0.08191649615764618,
322
+ "learning_rate": 9.083619702176404e-05,
323
+ "epoch": 53.96,
324
+ "step": 215
325
+ },
326
+ {
327
+ "loss": 0.033,
328
+ "grad_norm": 0.20381148159503937,
329
+ "learning_rate": 9.054982817869416e-05,
330
+ "epoch": 55.0,
331
+ "step": 220
332
+ },
333
+ {
334
+ "loss": 0.0323,
335
+ "grad_norm": 0.0765785425901413,
336
+ "learning_rate": 9.026345933562429e-05,
337
+ "epoch": 56.32,
338
+ "step": 225
339
+ },
340
+ {
341
+ "loss": 0.0324,
342
+ "grad_norm": 0.0698801577091217,
343
+ "learning_rate": 8.997709049255442e-05,
344
+ "epoch": 57.64,
345
+ "step": 230
346
+ },
347
+ {
348
+ "loss": 0.0314,
349
+ "grad_norm": 0.08089473098516464,
350
+ "learning_rate": 8.969072164948454e-05,
351
+ "epoch": 58.96,
352
+ "step": 235
353
+ },
354
+ {
355
+ "loss": 0.0303,
356
+ "grad_norm": 0.22270062565803528,
357
+ "learning_rate": 8.940435280641467e-05,
358
+ "epoch": 60.0,
359
+ "step": 240
360
+ },
361
+ {
362
+ "loss": 0.0324,
363
+ "grad_norm": 0.07712433487176895,
364
+ "learning_rate": 8.91179839633448e-05,
365
+ "epoch": 61.32,
366
+ "step": 245
367
+ },
368
+ {
369
+ "loss": 0.0321,
370
+ "grad_norm": 0.05860769376158714,
371
+ "learning_rate": 8.883161512027491e-05,
372
+ "epoch": 62.64,
373
+ "step": 250
374
+ },
375
+ {
376
+ "loss": 0.0315,
377
+ "grad_norm": 0.05999445170164108,
378
+ "learning_rate": 8.854524627720504e-05,
379
+ "epoch": 63.96,
380
+ "step": 255
381
+ },
382
+ {
383
+ "loss": 0.0364,
384
+ "grad_norm": 0.20564565062522888,
385
+ "learning_rate": 8.825887743413516e-05,
386
+ "epoch": 65.0,
387
+ "step": 260
388
+ },
389
+ {
390
+ "loss": 0.0311,
391
+ "grad_norm": 0.0610821433365345,
392
+ "learning_rate": 8.797250859106529e-05,
393
+ "epoch": 66.32,
394
+ "step": 265
395
+ },
396
+ {
397
+ "loss": 0.0294,
398
+ "grad_norm": 0.05693706497550011,
399
+ "learning_rate": 8.768613974799542e-05,
400
+ "epoch": 67.64,
401
+ "step": 270
402
+ },
403
+ {
404
+ "loss": 0.0284,
405
+ "grad_norm": 0.06817185133695602,
406
+ "learning_rate": 8.739977090492554e-05,
407
+ "epoch": 68.96,
408
+ "step": 275
409
+ },
410
+ {
411
+ "loss": 0.0291,
412
+ "grad_norm": 0.17458151280879974,
413
+ "learning_rate": 8.711340206185567e-05,
414
+ "epoch": 70.0,
415
+ "step": 280
416
+ },
417
+ {
418
+ "loss": 0.0324,
419
+ "grad_norm": 0.07353579252958298,
420
+ "learning_rate": 8.682703321878581e-05,
421
+ "epoch": 71.32,
422
+ "step": 285
423
+ },
424
+ {
425
+ "loss": 0.0305,
426
+ "grad_norm": 0.061573781073093414,
427
+ "learning_rate": 8.654066437571594e-05,
428
+ "epoch": 72.64,
429
+ "step": 290
430
+ },
431
+ {
432
+ "loss": 0.0294,
433
+ "grad_norm": 0.07544506341218948,
434
+ "learning_rate": 8.625429553264606e-05,
435
+ "epoch": 73.96,
436
+ "step": 295
437
+ },
438
+ {
439
+ "loss": 0.03,
440
+ "grad_norm": 0.11100324243307114,
441
+ "learning_rate": 8.596792668957619e-05,
442
+ "epoch": 75.0,
443
+ "step": 300
444
+ },
445
+ {
446
+ "loss": 0.0296,
447
+ "grad_norm": 0.0491141714155674,
448
+ "learning_rate": 8.56815578465063e-05,
449
+ "epoch": 76.32,
450
+ "step": 305
451
+ },
452
+ {
453
+ "loss": 0.0345,
454
+ "grad_norm": 0.07668624073266983,
455
+ "learning_rate": 8.539518900343643e-05,
456
+ "epoch": 77.64,
457
+ "step": 310
458
+ },
459
+ {
460
+ "loss": 0.0306,
461
+ "grad_norm": 0.07898231595754623,
462
+ "learning_rate": 8.510882016036655e-05,
463
+ "epoch": 78.96,
464
+ "step": 315
465
+ },
466
+ {
467
+ "loss": 0.0334,
468
+ "grad_norm": 0.3391458988189697,
469
+ "learning_rate": 8.482245131729668e-05,
470
+ "epoch": 80.0,
471
+ "step": 320
472
+ },
473
+ {
474
+ "loss": 0.0282,
475
+ "grad_norm": 0.05544694885611534,
476
+ "learning_rate": 8.453608247422681e-05,
477
+ "epoch": 81.32,
478
+ "step": 325
479
+ },
480
+ {
481
+ "loss": 0.0323,
482
+ "grad_norm": 0.05032579228281975,
483
+ "learning_rate": 8.424971363115693e-05,
484
+ "epoch": 82.64,
485
+ "step": 330
486
+ },
487
+ {
488
+ "loss": 0.0295,
489
+ "grad_norm": 0.05664476007223129,
490
+ "learning_rate": 8.396334478808706e-05,
491
+ "epoch": 83.96,
492
+ "step": 335
493
+ },
494
+ {
495
+ "loss": 0.033,
496
+ "grad_norm": 0.24190960824489594,
497
+ "learning_rate": 8.367697594501719e-05,
498
+ "epoch": 85.0,
499
+ "step": 340
500
+ },
501
+ {
502
+ "loss": 0.0294,
503
+ "grad_norm": 0.05068003758788109,
504
+ "learning_rate": 8.339060710194731e-05,
505
+ "epoch": 86.32,
506
+ "step": 345
507
+ },
508
+ {
509
+ "loss": 0.0297,
510
+ "grad_norm": 0.06719321757555008,
511
+ "learning_rate": 8.310423825887744e-05,
512
+ "epoch": 87.64,
513
+ "step": 350
514
+ },
515
+ {
516
+ "loss": 0.0276,
517
+ "grad_norm": 0.05750493332743645,
518
+ "learning_rate": 8.281786941580757e-05,
519
+ "epoch": 88.96,
520
+ "step": 355
521
+ },
522
+ {
523
+ "loss": 0.0309,
524
+ "grad_norm": 0.17318210005760193,
525
+ "learning_rate": 8.253150057273768e-05,
526
+ "epoch": 90.0,
527
+ "step": 360
528
+ },
529
+ {
530
+ "loss": 0.0284,
531
+ "grad_norm": 0.05168261379003525,
532
+ "learning_rate": 8.224513172966782e-05,
533
+ "epoch": 91.32,
534
+ "step": 365
535
+ },
536
+ {
537
+ "loss": 0.0314,
538
+ "grad_norm": 0.053040292114019394,
539
+ "learning_rate": 8.195876288659795e-05,
540
+ "epoch": 92.64,
541
+ "step": 370
542
+ },
543
+ {
544
+ "loss": 0.0297,
545
+ "grad_norm": 0.06162334978580475,
546
+ "learning_rate": 8.167239404352807e-05,
547
+ "epoch": 93.96,
548
+ "step": 375
549
+ },
550
+ {
551
+ "loss": 0.0271,
552
+ "grad_norm": 0.13474801182746887,
553
+ "learning_rate": 8.13860252004582e-05,
554
+ "epoch": 95.0,
555
+ "step": 380
556
+ },
557
+ {
558
+ "loss": 0.0301,
559
+ "grad_norm": 0.05177682265639305,
560
+ "learning_rate": 8.109965635738833e-05,
561
+ "epoch": 96.32,
562
+ "step": 385
563
+ },
564
+ {
565
+ "loss": 0.0286,
566
+ "grad_norm": 0.04276576265692711,
567
+ "learning_rate": 8.081328751431845e-05,
568
+ "epoch": 97.64,
569
+ "step": 390
570
+ },
571
+ {
572
+ "loss": 0.0284,
573
+ "grad_norm": 0.04698758199810982,
574
+ "learning_rate": 8.052691867124858e-05,
575
+ "epoch": 98.96,
576
+ "step": 395
577
+ },
578
+ {
579
+ "loss": 0.0302,
580
+ "grad_norm": 0.14094208180904388,
581
+ "learning_rate": 8.02405498281787e-05,
582
+ "epoch": 100.0,
583
+ "step": 400
584
+ },
585
+ {
586
+ "loss": 0.0304,
587
+ "grad_norm": 0.0528222993016243,
588
+ "learning_rate": 7.995418098510883e-05,
589
+ "epoch": 101.32,
590
+ "step": 405
591
+ },
592
+ {
593
+ "loss": 0.0316,
594
+ "grad_norm": 0.053034182637929916,
595
+ "learning_rate": 7.966781214203894e-05,
596
+ "epoch": 102.64,
597
+ "step": 410
598
+ },
599
+ {
600
+ "loss": 0.0295,
601
+ "grad_norm": 0.05732697248458862,
602
+ "learning_rate": 7.938144329896907e-05,
603
+ "epoch": 103.96,
604
+ "step": 415
605
+ },
606
+ {
607
+ "loss": 0.0317,
608
+ "grad_norm": 0.17511749267578125,
609
+ "learning_rate": 7.90950744558992e-05,
610
+ "epoch": 105.0,
611
+ "step": 420
612
+ },
613
+ {
614
+ "loss": 0.0305,
615
+ "grad_norm": 0.04588017240166664,
616
+ "learning_rate": 7.880870561282932e-05,
617
+ "epoch": 106.32,
618
+ "step": 425
619
+ },
620
+ {
621
+ "loss": 0.031,
622
+ "grad_norm": 0.049282800406217575,
623
+ "learning_rate": 7.852233676975945e-05,
624
+ "epoch": 107.64,
625
+ "step": 430
626
+ },
627
+ {
628
+ "loss": 0.0278,
629
+ "grad_norm": 0.04937691241502762,
630
+ "learning_rate": 7.823596792668958e-05,
631
+ "epoch": 108.96,
632
+ "step": 435
633
+ },
634
+ {
635
+ "loss": 0.0316,
636
+ "grad_norm": 0.11863432824611664,
637
+ "learning_rate": 7.79495990836197e-05,
638
+ "epoch": 110.0,
639
+ "step": 440
640
+ },
641
+ {
642
+ "loss": 0.0283,
643
+ "grad_norm": 0.04387475177645683,
644
+ "learning_rate": 7.766323024054983e-05,
645
+ "epoch": 111.32,
646
+ "step": 445
647
+ },
648
+ {
649
+ "loss": 0.0306,
650
+ "grad_norm": 0.04409867897629738,
651
+ "learning_rate": 7.737686139747996e-05,
652
+ "epoch": 112.64,
653
+ "step": 450
654
+ },
655
+ {
656
+ "loss": 0.0302,
657
+ "grad_norm": 0.04834749549627304,
658
+ "learning_rate": 7.709049255441008e-05,
659
+ "epoch": 113.96,
660
+ "step": 455
661
+ },
662
+ {
663
+ "loss": 0.0326,
664
+ "grad_norm": 0.1553424447774887,
665
+ "learning_rate": 7.680412371134021e-05,
666
+ "epoch": 115.0,
667
+ "step": 460
668
+ },
669
+ {
670
+ "loss": 0.0291,
671
+ "grad_norm": 0.05963806435465813,
672
+ "learning_rate": 7.651775486827034e-05,
673
+ "epoch": 116.32,
674
+ "step": 465
675
+ },
676
+ {
677
+ "loss": 0.027,
678
+ "grad_norm": 0.04697559028863907,
679
+ "learning_rate": 7.623138602520046e-05,
680
+ "epoch": 117.64,
681
+ "step": 470
682
+ },
683
+ {
684
+ "loss": 0.0343,
685
+ "grad_norm": 0.04225379601120949,
686
+ "learning_rate": 7.594501718213059e-05,
687
+ "epoch": 118.96,
688
+ "step": 475
689
+ },
690
+ {
691
+ "loss": 0.0288,
692
+ "grad_norm": 0.1076933965086937,
693
+ "learning_rate": 7.565864833906071e-05,
694
+ "epoch": 120.0,
695
+ "step": 480
696
+ },
697
+ {
698
+ "loss": 0.0291,
699
+ "grad_norm": 0.04540383443236351,
700
+ "learning_rate": 7.537227949599084e-05,
701
+ "epoch": 121.32,
702
+ "step": 485
703
+ },
704
+ {
705
+ "loss": 0.0289,
706
+ "grad_norm": 0.05459335818886757,
707
+ "learning_rate": 7.508591065292097e-05,
708
+ "epoch": 122.64,
709
+ "step": 490
710
+ },
711
+ {
712
+ "loss": 0.0284,
713
+ "grad_norm": 0.05171333625912666,
714
+ "learning_rate": 7.47995418098511e-05,
715
+ "epoch": 123.96,
716
+ "step": 495
717
+ },
718
+ {
719
+ "loss": 0.0314,
720
+ "grad_norm": 0.08606769144535065,
721
+ "learning_rate": 7.451317296678122e-05,
722
+ "epoch": 125.0,
723
+ "step": 500
724
+ },
725
+ {
726
+ "loss": 0.028,
727
+ "grad_norm": 0.040535662323236465,
728
+ "learning_rate": 7.422680412371135e-05,
729
+ "epoch": 126.32,
730
+ "step": 505
731
+ },
732
+ {
733
+ "loss": 0.0281,
734
+ "grad_norm": 0.04621696099638939,
735
+ "learning_rate": 7.394043528064147e-05,
736
+ "epoch": 127.64,
737
+ "step": 510
738
+ },
739
+ {
740
+ "loss": 0.0309,
741
+ "grad_norm": 0.04407593980431557,
742
+ "learning_rate": 7.36540664375716e-05,
743
+ "epoch": 128.96,
744
+ "step": 515
745
+ },
746
+ {
747
+ "loss": 0.0302,
748
+ "grad_norm": 0.24090737104415894,
749
+ "learning_rate": 7.336769759450171e-05,
750
+ "epoch": 130.0,
751
+ "step": 520
752
+ },
753
+ {
754
+ "loss": 0.0305,
755
+ "grad_norm": 0.051712971180677414,
756
+ "learning_rate": 7.308132875143184e-05,
757
+ "epoch": 131.32,
758
+ "step": 525
759
+ },
760
+ {
761
+ "loss": 0.0283,
762
+ "grad_norm": 0.0373610258102417,
763
+ "learning_rate": 7.279495990836197e-05,
764
+ "epoch": 132.64,
765
+ "step": 530
766
+ },
767
+ {
768
+ "loss": 0.0317,
769
+ "grad_norm": 0.04424213245511055,
770
+ "learning_rate": 7.250859106529209e-05,
771
+ "epoch": 133.96,
772
+ "step": 535
773
+ },
774
+ {
775
+ "loss": 0.0302,
776
+ "grad_norm": 0.09113436192274094,
777
+ "learning_rate": 7.222222222222222e-05,
778
+ "epoch": 135.0,
779
+ "step": 540
780
+ },
781
+ {
782
+ "loss": 0.0315,
783
+ "grad_norm": 0.03745009005069733,
784
+ "learning_rate": 7.193585337915235e-05,
785
+ "epoch": 136.32,
786
+ "step": 545
787
+ },
788
+ {
789
+ "loss": 0.0312,
790
+ "grad_norm": 0.04058730602264404,
791
+ "learning_rate": 7.164948453608247e-05,
792
+ "epoch": 137.64,
793
+ "step": 550
794
+ },
795
+ {
796
+ "loss": 0.0295,
797
+ "grad_norm": 0.046279069036245346,
798
+ "learning_rate": 7.136311569301261e-05,
799
+ "epoch": 138.96,
800
+ "step": 555
801
+ },
802
+ {
803
+ "loss": 0.0307,
804
+ "grad_norm": 0.17239141464233398,
805
+ "learning_rate": 7.107674684994274e-05,
806
+ "epoch": 140.0,
807
+ "step": 560
808
+ },
809
+ {
810
+ "loss": 0.0284,
811
+ "grad_norm": 0.036460030823946,
812
+ "learning_rate": 7.079037800687286e-05,
813
+ "epoch": 141.32,
814
+ "step": 565
815
+ },
816
+ {
817
+ "loss": 0.0283,
818
+ "grad_norm": 0.03434258699417114,
819
+ "learning_rate": 7.050400916380299e-05,
820
+ "epoch": 142.64,
821
+ "step": 570
822
+ },
823
+ {
824
+ "loss": 0.0296,
825
+ "grad_norm": 0.0470467284321785,
826
+ "learning_rate": 7.02176403207331e-05,
827
+ "epoch": 143.96,
828
+ "step": 575
829
+ },
830
+ {
831
+ "loss": 0.0256,
832
+ "grad_norm": 0.07163394242525101,
833
+ "learning_rate": 6.993127147766323e-05,
834
+ "epoch": 145.0,
835
+ "step": 580
836
+ },
837
+ {
838
+ "loss": 0.0298,
839
+ "grad_norm": 0.042208388447761536,
840
+ "learning_rate": 6.964490263459336e-05,
841
+ "epoch": 146.32,
842
+ "step": 585
843
+ },
844
+ {
845
+ "loss": 0.0274,
846
+ "grad_norm": 0.04421050846576691,
847
+ "learning_rate": 6.935853379152348e-05,
848
+ "epoch": 147.64,
849
+ "step": 590
850
+ },
851
+ {
852
+ "loss": 0.0311,
853
+ "grad_norm": 0.047223106026649475,
854
+ "learning_rate": 6.907216494845361e-05,
855
+ "epoch": 148.96,
856
+ "step": 595
857
+ },
858
+ {
859
+ "loss": 0.0302,
860
+ "grad_norm": 0.1724609136581421,
861
+ "learning_rate": 6.878579610538374e-05,
862
+ "epoch": 150.0,
863
+ "step": 600
864
+ },
865
+ {
866
+ "loss": 0.0287,
867
+ "grad_norm": 0.042247697710990906,
868
+ "learning_rate": 6.849942726231386e-05,
869
+ "epoch": 151.32,
870
+ "step": 605
871
+ },
872
+ {
873
+ "loss": 0.0279,
874
+ "grad_norm": 0.05167734622955322,
875
+ "learning_rate": 6.821305841924399e-05,
876
+ "epoch": 152.64,
877
+ "step": 610
878
+ },
879
+ {
880
+ "loss": 0.0291,
881
+ "grad_norm": 0.03621920198202133,
882
+ "learning_rate": 6.792668957617412e-05,
883
+ "epoch": 153.96,
884
+ "step": 615
885
+ },
886
+ {
887
+ "loss": 0.0354,
888
+ "grad_norm": 0.22533060610294342,
889
+ "learning_rate": 6.764032073310424e-05,
890
+ "epoch": 155.0,
891
+ "step": 620
892
+ },
893
+ {
894
+ "loss": 0.0285,
895
+ "grad_norm": 0.03441638499498367,
896
+ "learning_rate": 6.735395189003437e-05,
897
+ "epoch": 156.32,
898
+ "step": 625
899
+ },
900
+ {
901
+ "loss": 0.0299,
902
+ "grad_norm": 0.03728373721241951,
903
+ "learning_rate": 6.706758304696448e-05,
904
+ "epoch": 157.64,
905
+ "step": 630
906
+ },
907
+ {
908
+ "loss": 0.0292,
909
+ "grad_norm": 0.043604422360658646,
910
+ "learning_rate": 6.678121420389462e-05,
911
+ "epoch": 158.96,
912
+ "step": 635
913
+ },
914
+ {
915
+ "loss": 0.0331,
916
+ "grad_norm": 0.17142102122306824,
917
+ "learning_rate": 6.649484536082475e-05,
918
+ "epoch": 160.0,
919
+ "step": 640
920
+ },
921
+ {
922
+ "loss": 0.0305,
923
+ "grad_norm": 0.03554172441363335,
924
+ "learning_rate": 6.620847651775487e-05,
925
+ "epoch": 161.32,
926
+ "step": 645
927
+ },
928
+ {
929
+ "loss": 0.0305,
930
+ "grad_norm": 0.043817318975925446,
931
+ "learning_rate": 6.5922107674685e-05,
932
+ "epoch": 162.64,
933
+ "step": 650
934
+ },
935
+ {
936
+ "loss": 0.0302,
937
+ "grad_norm": 0.04247381538152695,
938
+ "learning_rate": 6.563573883161513e-05,
939
+ "epoch": 163.96,
940
+ "step": 655
941
+ },
942
+ {
943
+ "loss": 0.0292,
944
+ "grad_norm": 0.09436971694231033,
945
+ "learning_rate": 6.534936998854525e-05,
946
+ "epoch": 165.0,
947
+ "step": 660
948
+ },
949
+ {
950
+ "loss": 0.0281,
951
+ "grad_norm": 0.04177865758538246,
952
+ "learning_rate": 6.506300114547538e-05,
953
+ "epoch": 166.32,
954
+ "step": 665
955
+ },
956
+ {
957
+ "loss": 0.0288,
958
+ "grad_norm": 0.04079804942011833,
959
+ "learning_rate": 6.477663230240551e-05,
960
+ "epoch": 167.64,
961
+ "step": 670
962
+ },
963
+ {
964
+ "loss": 0.0275,
965
+ "grad_norm": 0.039114974439144135,
966
+ "learning_rate": 6.449026345933563e-05,
967
+ "epoch": 168.96,
968
+ "step": 675
969
+ },
970
+ {
971
+ "loss": 0.0288,
972
+ "grad_norm": 0.11661799997091293,
973
+ "learning_rate": 6.420389461626576e-05,
974
+ "epoch": 170.0,
975
+ "step": 680
976
+ },
977
+ {
978
+ "loss": 0.0306,
979
+ "grad_norm": 0.03423461318016052,
980
+ "learning_rate": 6.391752577319587e-05,
981
+ "epoch": 171.32,
982
+ "step": 685
983
+ },
984
+ {
985
+ "loss": 0.031,
986
+ "grad_norm": 0.03615871071815491,
987
+ "learning_rate": 6.3631156930126e-05,
988
+ "epoch": 172.64,
989
+ "step": 690
990
+ },
991
+ {
992
+ "loss": 0.0312,
993
+ "grad_norm": 0.04067518189549446,
994
+ "learning_rate": 6.334478808705613e-05,
995
+ "epoch": 173.96,
996
+ "step": 695
997
+ },
998
+ {
999
+ "loss": 0.0287,
1000
+ "grad_norm": 0.11094173789024353,
1001
+ "learning_rate": 6.305841924398625e-05,
1002
+ "epoch": 175.0,
1003
+ "step": 700
1004
+ },
1005
+ {
1006
+ "loss": 0.0296,
1007
+ "grad_norm": 0.03173477575182915,
1008
+ "learning_rate": 6.277205040091638e-05,
1009
+ "epoch": 176.32,
1010
+ "step": 705
1011
+ },
1012
+ {
1013
+ "loss": 0.0292,
1014
+ "grad_norm": 0.0374116450548172,
1015
+ "learning_rate": 6.24856815578465e-05,
1016
+ "epoch": 177.64,
1017
+ "step": 710
1018
+ },
1019
+ {
1020
+ "loss": 0.0273,
1021
+ "grad_norm": 0.03814936801791191,
1022
+ "learning_rate": 6.219931271477663e-05,
1023
+ "epoch": 178.96,
1024
+ "step": 715
1025
+ },
1026
+ {
1027
+ "loss": 0.0335,
1028
+ "grad_norm": 0.14255362749099731,
1029
+ "learning_rate": 6.191294387170676e-05,
1030
+ "epoch": 180.0,
1031
+ "step": 720
1032
+ },
1033
+ {
1034
+ "loss": 0.0303,
1035
+ "grad_norm": 0.04104507714509964,
1036
+ "learning_rate": 6.162657502863689e-05,
1037
+ "epoch": 181.32,
1038
+ "step": 725
1039
+ },
1040
+ {
1041
+ "loss": 0.0271,
1042
+ "grad_norm": 0.037353888154029846,
1043
+ "learning_rate": 6.134020618556701e-05,
1044
+ "epoch": 182.64,
1045
+ "step": 730
1046
+ },
1047
+ {
1048
+ "loss": 0.0288,
1049
+ "grad_norm": 0.03552788123488426,
1050
+ "learning_rate": 6.105383734249714e-05,
1051
+ "epoch": 183.96,
1052
+ "step": 735
1053
+ },
1054
+ {
1055
+ "loss": 0.0271,
1056
+ "grad_norm": 0.09345243126153946,
1057
+ "learning_rate": 6.076746849942726e-05,
1058
+ "epoch": 185.0,
1059
+ "step": 740
1060
+ },
1061
+ {
1062
+ "loss": 0.0287,
1063
+ "grad_norm": 0.031304650008678436,
1064
+ "learning_rate": 6.0481099656357384e-05,
1065
+ "epoch": 186.32,
1066
+ "step": 745
1067
+ },
1068
+ {
1069
+ "loss": 0.0292,
1070
+ "grad_norm": 0.03588686138391495,
1071
+ "learning_rate": 6.019473081328752e-05,
1072
+ "epoch": 187.64,
1073
+ "step": 750
1074
+ },
1075
+ {
1076
+ "loss": 0.0277,
1077
+ "grad_norm": 0.03166257590055466,
1078
+ "learning_rate": 5.9908361970217644e-05,
1079
+ "epoch": 188.96,
1080
+ "step": 755
1081
+ },
1082
+ {
1083
+ "loss": 0.0286,
1084
+ "grad_norm": 0.09115266799926758,
1085
+ "learning_rate": 5.962199312714777e-05,
1086
+ "epoch": 190.0,
1087
+ "step": 760
1088
+ },
1089
+ {
1090
+ "loss": 0.0277,
1091
+ "grad_norm": 0.028432967141270638,
1092
+ "learning_rate": 5.93356242840779e-05,
1093
+ "epoch": 191.32,
1094
+ "step": 765
1095
+ },
1096
+ {
1097
+ "loss": 0.0315,
1098
+ "grad_norm": 0.04126034304499626,
1099
+ "learning_rate": 5.904925544100802e-05,
1100
+ "epoch": 192.64,
1101
+ "step": 770
1102
+ },
1103
+ {
1104
+ "loss": 0.0281,
1105
+ "grad_norm": 0.04166596010327339,
1106
+ "learning_rate": 5.876288659793815e-05,
1107
+ "epoch": 193.96,
1108
+ "step": 775
1109
+ },
1110
+ {
1111
+ "loss": 0.0285,
1112
+ "grad_norm": 0.11017812788486481,
1113
+ "learning_rate": 5.8476517754868276e-05,
1114
+ "epoch": 195.0,
1115
+ "step": 780
1116
+ },
1117
+ {
1118
+ "loss": 0.0289,
1119
+ "grad_norm": 0.04071119427680969,
1120
+ "learning_rate": 5.81901489117984e-05,
1121
+ "epoch": 196.32,
1122
+ "step": 785
1123
+ },
1124
+ {
1125
+ "loss": 0.0276,
1126
+ "grad_norm": 0.03756481036543846,
1127
+ "learning_rate": 5.790378006872853e-05,
1128
+ "epoch": 197.64,
1129
+ "step": 790
1130
+ },
1131
+ {
1132
+ "loss": 0.0282,
1133
+ "grad_norm": 0.039780210703611374,
1134
+ "learning_rate": 5.761741122565865e-05,
1135
+ "epoch": 198.96,
1136
+ "step": 795
1137
+ },
1138
+ {
1139
+ "loss": 0.0296,
1140
+ "grad_norm": 0.12418342381715775,
1141
+ "learning_rate": 5.7331042382588775e-05,
1142
+ "epoch": 200.0,
1143
+ "step": 800
1144
+ },
1145
+ {
1146
+ "loss": 0.0286,
1147
+ "grad_norm": 0.0338447242975235,
1148
+ "learning_rate": 5.70446735395189e-05,
1149
+ "epoch": 201.32,
1150
+ "step": 805
1151
+ },
1152
+ {
1153
+ "loss": 0.0306,
1154
+ "grad_norm": 0.03490043804049492,
1155
+ "learning_rate": 5.675830469644903e-05,
1156
+ "epoch": 202.64,
1157
+ "step": 810
1158
+ },
1159
+ {
1160
+ "loss": 0.0283,
1161
+ "grad_norm": 0.03847096487879753,
1162
+ "learning_rate": 5.6471935853379155e-05,
1163
+ "epoch": 203.96,
1164
+ "step": 815
1165
+ },
1166
+ {
1167
+ "loss": 0.0278,
1168
+ "grad_norm": 0.10988269001245499,
1169
+ "learning_rate": 5.618556701030928e-05,
1170
+ "epoch": 205.0,
1171
+ "step": 820
1172
+ },
1173
+ {
1174
+ "loss": 0.03,
1175
+ "grad_norm": 0.034018851816654205,
1176
+ "learning_rate": 5.589919816723941e-05,
1177
+ "epoch": 206.32,
1178
+ "step": 825
1179
+ },
1180
+ {
1181
+ "loss": 0.0293,
1182
+ "grad_norm": 0.032927289605140686,
1183
+ "learning_rate": 5.5612829324169534e-05,
1184
+ "epoch": 207.64,
1185
+ "step": 830
1186
+ },
1187
+ {
1188
+ "loss": 0.0285,
1189
+ "grad_norm": 0.03604916110634804,
1190
+ "learning_rate": 5.532646048109966e-05,
1191
+ "epoch": 208.96,
1192
+ "step": 835
1193
+ },
1194
+ {
1195
+ "loss": 0.0305,
1196
+ "grad_norm": 0.09708557277917862,
1197
+ "learning_rate": 5.504009163802979e-05,
1198
+ "epoch": 210.0,
1199
+ "step": 840
1200
+ },
1201
+ {
1202
+ "loss": 0.0296,
1203
+ "grad_norm": 0.03745417296886444,
1204
+ "learning_rate": 5.4753722794959914e-05,
1205
+ "epoch": 211.32,
1206
+ "step": 845
1207
+ },
1208
+ {
1209
+ "loss": 0.0277,
1210
+ "grad_norm": 0.028906095772981644,
1211
+ "learning_rate": 5.4467353951890033e-05,
1212
+ "epoch": 212.64,
1213
+ "step": 850
1214
+ },
1215
+ {
1216
+ "loss": 0.0295,
1217
+ "grad_norm": 0.03228568285703659,
1218
+ "learning_rate": 5.418098510882016e-05,
1219
+ "epoch": 213.96,
1220
+ "step": 855
1221
+ },
1222
+ {
1223
+ "loss": 0.031,
1224
+ "grad_norm": 0.1302802711725235,
1225
+ "learning_rate": 5.3894616265750286e-05,
1226
+ "epoch": 215.0,
1227
+ "step": 860
1228
+ },
1229
+ {
1230
+ "loss": 0.0286,
1231
+ "grad_norm": 0.031472526490688324,
1232
+ "learning_rate": 5.360824742268041e-05,
1233
+ "epoch": 216.32,
1234
+ "step": 865
1235
+ },
1236
+ {
1237
+ "loss": 0.0308,
1238
+ "grad_norm": 0.03589686006307602,
1239
+ "learning_rate": 5.332187857961054e-05,
1240
+ "epoch": 217.64,
1241
+ "step": 870
1242
+ },
1243
+ {
1244
+ "loss": 0.0298,
1245
+ "grad_norm": 0.04117952659726143,
1246
+ "learning_rate": 5.3035509736540666e-05,
1247
+ "epoch": 218.96,
1248
+ "step": 875
1249
+ },
1250
+ {
1251
+ "loss": 0.0316,
1252
+ "grad_norm": 0.16901935636997223,
1253
+ "learning_rate": 5.274914089347079e-05,
1254
+ "epoch": 220.0,
1255
+ "step": 880
1256
+ },
1257
+ {
1258
+ "loss": 0.0297,
1259
+ "grad_norm": 0.03608705848455429,
1260
+ "learning_rate": 5.246277205040092e-05,
1261
+ "epoch": 221.32,
1262
+ "step": 885
1263
+ },
1264
+ {
1265
+ "loss": 0.029,
1266
+ "grad_norm": 0.028423065319657326,
1267
+ "learning_rate": 5.2176403207331045e-05,
1268
+ "epoch": 222.64,
1269
+ "step": 890
1270
+ },
1271
+ {
1272
+ "loss": 0.0304,
1273
+ "grad_norm": 0.03328604996204376,
1274
+ "learning_rate": 5.189003436426118e-05,
1275
+ "epoch": 223.96,
1276
+ "step": 895
1277
+ },
1278
+ {
1279
+ "loss": 0.0295,
1280
+ "grad_norm": 0.1140102967619896,
1281
+ "learning_rate": 5.1603665521191305e-05,
1282
+ "epoch": 225.0,
1283
+ "step": 900
1284
+ },
1285
+ {
1286
+ "loss": 0.0289,
1287
+ "grad_norm": 0.03379100188612938,
1288
+ "learning_rate": 5.131729667812142e-05,
1289
+ "epoch": 226.32,
1290
+ "step": 905
1291
+ },
1292
+ {
1293
+ "loss": 0.0272,
1294
+ "grad_norm": 0.03175675496459007,
1295
+ "learning_rate": 5.1030927835051544e-05,
1296
+ "epoch": 227.64,
1297
+ "step": 910
1298
+ },
1299
+ {
1300
+ "loss": 0.0308,
1301
+ "grad_norm": 0.0344826877117157,
1302
+ "learning_rate": 5.074455899198167e-05,
1303
+ "epoch": 228.96,
1304
+ "step": 915
1305
+ },
1306
+ {
1307
+ "loss": 0.0308,
1308
+ "grad_norm": 0.1841171830892563,
1309
+ "learning_rate": 5.04581901489118e-05,
1310
+ "epoch": 230.0,
1311
+ "step": 920
1312
+ },
1313
+ {
1314
+ "loss": 0.0326,
1315
+ "grad_norm": 0.03660387173295021,
1316
+ "learning_rate": 5.0171821305841924e-05,
1317
+ "epoch": 231.32,
1318
+ "step": 925
1319
+ },
1320
+ {
1321
+ "loss": 0.0295,
1322
+ "grad_norm": 0.03065328672528267,
1323
+ "learning_rate": 4.988545246277205e-05,
1324
+ "epoch": 232.64,
1325
+ "step": 930
1326
+ },
1327
+ {
1328
+ "loss": 0.0292,
1329
+ "grad_norm": 0.03993593156337738,
1330
+ "learning_rate": 4.9599083619702184e-05,
1331
+ "epoch": 233.96,
1332
+ "step": 935
1333
+ },
1334
+ {
1335
+ "loss": 0.0302,
1336
+ "grad_norm": 0.10738981515169144,
1337
+ "learning_rate": 4.931271477663231e-05,
1338
+ "epoch": 235.0,
1339
+ "step": 940
1340
+ },
1341
+ {
1342
+ "loss": 0.0278,
1343
+ "grad_norm": 0.03143048286437988,
1344
+ "learning_rate": 4.902634593356243e-05,
1345
+ "epoch": 236.32,
1346
+ "step": 945
1347
+ },
1348
+ {
1349
+ "loss": 0.0271,
1350
+ "grad_norm": 0.028968214988708496,
1351
+ "learning_rate": 4.8739977090492556e-05,
1352
+ "epoch": 237.64,
1353
+ "step": 950
1354
+ },
1355
+ {
1356
+ "loss": 0.0297,
1357
+ "grad_norm": 0.038674987852573395,
1358
+ "learning_rate": 4.845360824742268e-05,
1359
+ "epoch": 238.96,
1360
+ "step": 955
1361
+ },
1362
+ {
1363
+ "loss": 0.0278,
1364
+ "grad_norm": 0.10797161608934402,
1365
+ "learning_rate": 4.816723940435281e-05,
1366
+ "epoch": 240.0,
1367
+ "step": 960
1368
+ },
1369
+ {
1370
+ "loss": 0.0281,
1371
+ "grad_norm": 0.03592285141348839,
1372
+ "learning_rate": 4.7880870561282936e-05,
1373
+ "epoch": 241.32,
1374
+ "step": 965
1375
+ },
1376
+ {
1377
+ "loss": 0.031,
1378
+ "grad_norm": 0.031206540763378143,
1379
+ "learning_rate": 4.7594501718213055e-05,
1380
+ "epoch": 242.64,
1381
+ "step": 970
1382
+ },
1383
+ {
1384
+ "loss": 0.0276,
1385
+ "grad_norm": 0.03692101314663887,
1386
+ "learning_rate": 4.730813287514318e-05,
1387
+ "epoch": 243.96,
1388
+ "step": 975
1389
+ },
1390
+ {
1391
+ "loss": 0.0325,
1392
+ "grad_norm": 0.1415632963180542,
1393
+ "learning_rate": 4.7021764032073315e-05,
1394
+ "epoch": 245.0,
1395
+ "step": 980
1396
+ },
1397
+ {
1398
+ "loss": 0.0291,
1399
+ "grad_norm": 0.0346578024327755,
1400
+ "learning_rate": 4.673539518900344e-05,
1401
+ "epoch": 246.32,
1402
+ "step": 985
1403
+ },
1404
+ {
1405
+ "loss": 0.0279,
1406
+ "grad_norm": 0.036887165158987045,
1407
+ "learning_rate": 4.644902634593357e-05,
1408
+ "epoch": 247.64,
1409
+ "step": 990
1410
+ },
1411
+ {
1412
+ "loss": 0.0277,
1413
+ "grad_norm": 0.03107571043074131,
1414
+ "learning_rate": 4.6162657502863694e-05,
1415
+ "epoch": 248.96,
1416
+ "step": 995
1417
+ },
1418
+ {
1419
+ "loss": 0.0287,
1420
+ "grad_norm": 0.13857436180114746,
1421
+ "learning_rate": 4.5876288659793814e-05,
1422
+ "epoch": 250.0,
1423
+ "step": 1000
1424
+ },
1425
+ {
1426
+ "loss": 0.0275,
1427
+ "grad_norm": 0.03328908607363701,
1428
+ "learning_rate": 4.558991981672394e-05,
1429
+ "epoch": 251.32,
1430
+ "step": 1005
1431
+ },
1432
+ {
1433
+ "loss": 0.0295,
1434
+ "grad_norm": 0.03218206763267517,
1435
+ "learning_rate": 4.530355097365407e-05,
1436
+ "epoch": 252.64,
1437
+ "step": 1010
1438
+ },
1439
+ {
1440
+ "loss": 0.0288,
1441
+ "grad_norm": 0.030677294358611107,
1442
+ "learning_rate": 4.5017182130584194e-05,
1443
+ "epoch": 253.96,
1444
+ "step": 1015
1445
+ },
1446
+ {
1447
+ "loss": 0.0283,
1448
+ "grad_norm": 0.08906098455190659,
1449
+ "learning_rate": 4.473081328751432e-05,
1450
+ "epoch": 255.0,
1451
+ "step": 1020
1452
+ },
1453
+ {
1454
+ "loss": 0.0292,
1455
+ "grad_norm": 0.0315646268427372,
1456
+ "learning_rate": 4.4444444444444447e-05,
1457
+ "epoch": 256.32,
1458
+ "step": 1025
1459
+ },
1460
+ {
1461
+ "loss": 0.0286,
1462
+ "grad_norm": 0.0322076752781868,
1463
+ "learning_rate": 4.415807560137457e-05,
1464
+ "epoch": 257.64,
1465
+ "step": 1030
1466
+ },
1467
+ {
1468
+ "loss": 0.0266,
1469
+ "grad_norm": 0.03561684116721153,
1470
+ "learning_rate": 4.38717067583047e-05,
1471
+ "epoch": 258.96,
1472
+ "step": 1035
1473
+ },
1474
+ {
1475
+ "loss": 0.0291,
1476
+ "grad_norm": 0.1383010447025299,
1477
+ "learning_rate": 4.3585337915234826e-05,
1478
+ "epoch": 260.0,
1479
+ "step": 1040
1480
+ },
1481
+ {
1482
+ "loss": 0.0281,
1483
+ "grad_norm": 0.02982248179614544,
1484
+ "learning_rate": 4.329896907216495e-05,
1485
+ "epoch": 261.32,
1486
+ "step": 1045
1487
+ },
1488
+ {
1489
+ "loss": 0.0285,
1490
+ "grad_norm": 0.03563191369175911,
1491
+ "learning_rate": 4.301260022909508e-05,
1492
+ "epoch": 262.64,
1493
+ "step": 1050
1494
+ },
1495
+ {
1496
+ "loss": 0.0298,
1497
+ "grad_norm": 0.03730940818786621,
1498
+ "learning_rate": 4.27262313860252e-05,
1499
+ "epoch": 263.96,
1500
+ "step": 1055
1501
+ },
1502
+ {
1503
+ "loss": 0.029,
1504
+ "grad_norm": 0.12043489515781403,
1505
+ "learning_rate": 4.2439862542955325e-05,
1506
+ "epoch": 265.0,
1507
+ "step": 1060
1508
+ },
1509
+ {
1510
+ "loss": 0.0304,
1511
+ "grad_norm": 0.03577538579702377,
1512
+ "learning_rate": 4.215349369988545e-05,
1513
+ "epoch": 266.32,
1514
+ "step": 1065
1515
+ },
1516
+ {
1517
+ "loss": 0.0292,
1518
+ "grad_norm": 0.035051047801971436,
1519
+ "learning_rate": 4.1867124856815585e-05,
1520
+ "epoch": 267.64,
1521
+ "step": 1070
1522
+ },
1523
+ {
1524
+ "loss": 0.029,
1525
+ "grad_norm": 0.03524423763155937,
1526
+ "learning_rate": 4.158075601374571e-05,
1527
+ "epoch": 268.96,
1528
+ "step": 1075
1529
+ },
1530
+ {
1531
+ "loss": 0.0286,
1532
+ "grad_norm": 0.11722230911254883,
1533
+ "learning_rate": 4.129438717067583e-05,
1534
+ "epoch": 270.0,
1535
+ "step": 1080
1536
+ },
1537
+ {
1538
+ "loss": 0.0295,
1539
+ "grad_norm": 0.0350823737680912,
1540
+ "learning_rate": 4.100801832760596e-05,
1541
+ "epoch": 271.32,
1542
+ "step": 1085
1543
+ },
1544
+ {
1545
+ "loss": 0.0288,
1546
+ "grad_norm": 0.03372941538691521,
1547
+ "learning_rate": 4.0721649484536084e-05,
1548
+ "epoch": 272.64,
1549
+ "step": 1090
1550
+ },
1551
+ {
1552
+ "loss": 0.0326,
1553
+ "grad_norm": 0.028644917532801628,
1554
+ "learning_rate": 4.043528064146621e-05,
1555
+ "epoch": 273.96,
1556
+ "step": 1095
1557
+ },
1558
+ {
1559
+ "loss": 0.0293,
1560
+ "grad_norm": 0.10958810150623322,
1561
+ "learning_rate": 4.014891179839634e-05,
1562
+ "epoch": 275.0,
1563
+ "step": 1100
1564
+ },
1565
+ {
1566
+ "loss": 0.0289,
1567
+ "grad_norm": 0.03524491935968399,
1568
+ "learning_rate": 3.9862542955326463e-05,
1569
+ "epoch": 276.32,
1570
+ "step": 1105
1571
+ },
1572
+ {
1573
+ "loss": 0.0292,
1574
+ "grad_norm": 0.028043361380696297,
1575
+ "learning_rate": 3.957617411225659e-05,
1576
+ "epoch": 277.64,
1577
+ "step": 1110
1578
+ },
1579
+ {
1580
+ "loss": 0.028,
1581
+ "grad_norm": 0.03574656322598457,
1582
+ "learning_rate": 3.9289805269186716e-05,
1583
+ "epoch": 278.96,
1584
+ "step": 1115
1585
+ },
1586
+ {
1587
+ "loss": 0.0278,
1588
+ "grad_norm": 0.12416456639766693,
1589
+ "learning_rate": 3.900343642611684e-05,
1590
+ "epoch": 280.0,
1591
+ "step": 1120
1592
+ },
1593
+ {
1594
+ "loss": 0.0324,
1595
+ "grad_norm": 0.02984347939491272,
1596
+ "learning_rate": 3.871706758304697e-05,
1597
+ "epoch": 281.32,
1598
+ "step": 1125
1599
+ },
1600
+ {
1601
+ "loss": 0.0281,
1602
+ "grad_norm": 0.03649289906024933,
1603
+ "learning_rate": 3.8430698739977096e-05,
1604
+ "epoch": 282.64,
1605
+ "step": 1130
1606
+ },
1607
+ {
1608
+ "loss": 0.0268,
1609
+ "grad_norm": 0.03943822532892227,
1610
+ "learning_rate": 3.8144329896907216e-05,
1611
+ "epoch": 283.96,
1612
+ "step": 1135
1613
+ },
1614
+ {
1615
+ "loss": 0.0305,
1616
+ "grad_norm": 0.14334431290626526,
1617
+ "learning_rate": 3.785796105383734e-05,
1618
+ "epoch": 285.0,
1619
+ "step": 1140
1620
+ },
1621
+ {
1622
+ "loss": 0.028,
1623
+ "grad_norm": 0.030261779204010963,
1624
+ "learning_rate": 3.757159221076747e-05,
1625
+ "epoch": 286.32,
1626
+ "step": 1145
1627
+ },
1628
+ {
1629
+ "loss": 0.0267,
1630
+ "grad_norm": 0.03134704381227493,
1631
+ "learning_rate": 3.7285223367697595e-05,
1632
+ "epoch": 287.64,
1633
+ "step": 1150
1634
+ },
1635
+ {
1636
+ "loss": 0.028,
1637
+ "grad_norm": 0.031728796660900116,
1638
+ "learning_rate": 3.699885452462772e-05,
1639
+ "epoch": 288.96,
1640
+ "step": 1155
1641
+ },
1642
+ {
1643
+ "loss": 0.031,
1644
+ "grad_norm": 0.15487806499004364,
1645
+ "learning_rate": 3.671248568155785e-05,
1646
+ "epoch": 290.0,
1647
+ "step": 1160
1648
+ },
1649
+ {
1650
+ "loss": 0.0298,
1651
+ "grad_norm": 0.033745523542165756,
1652
+ "learning_rate": 3.6426116838487974e-05,
1653
+ "epoch": 291.32,
1654
+ "step": 1165
1655
+ },
1656
+ {
1657
+ "loss": 0.0265,
1658
+ "grad_norm": 0.026857230812311172,
1659
+ "learning_rate": 3.61397479954181e-05,
1660
+ "epoch": 292.64,
1661
+ "step": 1170
1662
+ },
1663
+ {
1664
+ "loss": 0.0291,
1665
+ "grad_norm": 0.03467594459652901,
1666
+ "learning_rate": 3.585337915234823e-05,
1667
+ "epoch": 293.96,
1668
+ "step": 1175
1669
+ },
1670
+ {
1671
+ "loss": 0.0304,
1672
+ "grad_norm": 0.1255461573600769,
1673
+ "learning_rate": 3.5567010309278354e-05,
1674
+ "epoch": 295.0,
1675
+ "step": 1180
1676
+ },
1677
+ {
1678
+ "loss": 0.0275,
1679
+ "grad_norm": 0.03569836914539337,
1680
+ "learning_rate": 3.528064146620848e-05,
1681
+ "epoch": 296.32,
1682
+ "step": 1185
1683
+ },
1684
+ {
1685
+ "loss": 0.0288,
1686
+ "grad_norm": 0.03207559511065483,
1687
+ "learning_rate": 3.49942726231386e-05,
1688
+ "epoch": 297.64,
1689
+ "step": 1190
1690
+ },
1691
+ {
1692
+ "loss": 0.0274,
1693
+ "grad_norm": 0.03445427492260933,
1694
+ "learning_rate": 3.4707903780068726e-05,
1695
+ "epoch": 298.96,
1696
+ "step": 1195
1697
+ },
1698
+ {
1699
+ "loss": 0.0268,
1700
+ "grad_norm": 0.11089900881052017,
1701
+ "learning_rate": 3.442153493699885e-05,
1702
+ "epoch": 300.0,
1703
+ "step": 1200
1704
+ },
1705
+ {
1706
+ "loss": 0.0285,
1707
+ "grad_norm": 0.030901776626706123,
1708
+ "learning_rate": 3.4135166093928986e-05,
1709
+ "epoch": 301.32,
1710
+ "step": 1205
1711
+ },
1712
+ {
1713
+ "loss": 0.0282,
1714
+ "grad_norm": 0.03404972329735756,
1715
+ "learning_rate": 3.384879725085911e-05,
1716
+ "epoch": 302.64,
1717
+ "step": 1210
1718
+ },
1719
+ {
1720
+ "loss": 0.0289,
1721
+ "grad_norm": 0.03297970071434975,
1722
+ "learning_rate": 3.356242840778923e-05,
1723
+ "epoch": 303.96,
1724
+ "step": 1215
1725
+ },
1726
+ {
1727
+ "loss": 0.0271,
1728
+ "grad_norm": 0.08513491600751877,
1729
+ "learning_rate": 3.327605956471936e-05,
1730
+ "epoch": 305.0,
1731
+ "step": 1220
1732
+ },
1733
+ {
1734
+ "loss": 0.028,
1735
+ "grad_norm": 0.02815438061952591,
1736
+ "learning_rate": 3.2989690721649485e-05,
1737
+ "epoch": 306.32,
1738
+ "step": 1225
1739
+ },
1740
+ {
1741
+ "loss": 0.0308,
1742
+ "grad_norm": 0.031231220811605453,
1743
+ "learning_rate": 3.270332187857961e-05,
1744
+ "epoch": 307.64,
1745
+ "step": 1230
1746
+ },
1747
+ {
1748
+ "loss": 0.0256,
1749
+ "grad_norm": 0.03579903766512871,
1750
+ "learning_rate": 3.241695303550974e-05,
1751
+ "epoch": 308.96,
1752
+ "step": 1235
1753
+ },
1754
+ {
1755
+ "loss": 0.029,
1756
+ "grad_norm": 0.1284906268119812,
1757
+ "learning_rate": 3.2130584192439865e-05,
1758
+ "epoch": 310.0,
1759
+ "step": 1240
1760
+ },
1761
+ {
1762
+ "loss": 0.0267,
1763
+ "grad_norm": 0.02885010838508606,
1764
+ "learning_rate": 3.184421534936999e-05,
1765
+ "epoch": 311.32,
1766
+ "step": 1245
1767
+ },
1768
+ {
1769
+ "loss": 0.0277,
1770
+ "grad_norm": 0.040551669895648956,
1771
+ "learning_rate": 3.155784650630012e-05,
1772
+ "epoch": 312.64,
1773
+ "step": 1250
1774
+ },
1775
+ {
1776
+ "loss": 0.0296,
1777
+ "grad_norm": 0.024676747620105743,
1778
+ "learning_rate": 3.1271477663230244e-05,
1779
+ "epoch": 313.96,
1780
+ "step": 1255
1781
+ },
1782
+ {
1783
+ "loss": 0.0295,
1784
+ "grad_norm": 0.1250019669532776,
1785
+ "learning_rate": 3.098510882016037e-05,
1786
+ "epoch": 315.0,
1787
+ "step": 1260
1788
+ },
1789
+ {
1790
+ "loss": 0.0286,
1791
+ "grad_norm": 0.03083103522658348,
1792
+ "learning_rate": 3.06987399770905e-05,
1793
+ "epoch": 316.32,
1794
+ "step": 1265
1795
+ },
1796
+ {
1797
+ "loss": 0.0277,
1798
+ "grad_norm": 0.03254910558462143,
1799
+ "learning_rate": 3.0412371134020617e-05,
1800
+ "epoch": 317.64,
1801
+ "step": 1270
1802
+ },
1803
+ {
1804
+ "loss": 0.0268,
1805
+ "grad_norm": 0.028430206701159477,
1806
+ "learning_rate": 3.0126002290950743e-05,
1807
+ "epoch": 318.96,
1808
+ "step": 1275
1809
+ },
1810
+ {
1811
+ "loss": 0.0279,
1812
+ "grad_norm": 0.10449621081352234,
1813
+ "learning_rate": 2.983963344788087e-05,
1814
+ "epoch": 320.0,
1815
+ "step": 1280
1816
+ },
1817
+ {
1818
+ "loss": 0.0287,
1819
+ "grad_norm": 0.03180396929383278,
1820
+ "learning_rate": 2.9553264604811e-05,
1821
+ "epoch": 321.32,
1822
+ "step": 1285
1823
+ },
1824
+ {
1825
+ "loss": 0.0267,
1826
+ "grad_norm": 0.03462441638112068,
1827
+ "learning_rate": 2.9266895761741126e-05,
1828
+ "epoch": 322.64,
1829
+ "step": 1290
1830
+ },
1831
+ {
1832
+ "loss": 0.0272,
1833
+ "grad_norm": 0.032813649624586105,
1834
+ "learning_rate": 2.8980526918671253e-05,
1835
+ "epoch": 323.96,
1836
+ "step": 1295
1837
+ },
1838
+ {
1839
+ "loss": 0.0301,
1840
+ "grad_norm": 0.11716829985380173,
1841
+ "learning_rate": 2.8694158075601372e-05,
1842
+ "epoch": 325.0,
1843
+ "step": 1300
1844
+ },
1845
+ {
1846
+ "loss": 0.0297,
1847
+ "grad_norm": 0.0283154658973217,
1848
+ "learning_rate": 2.8407789232531502e-05,
1849
+ "epoch": 326.32,
1850
+ "step": 1305
1851
+ },
1852
+ {
1853
+ "loss": 0.0279,
1854
+ "grad_norm": 0.037692759186029434,
1855
+ "learning_rate": 2.812142038946163e-05,
1856
+ "epoch": 327.64,
1857
+ "step": 1310
1858
+ },
1859
+ {
1860
+ "loss": 0.0272,
1861
+ "grad_norm": 0.03138533979654312,
1862
+ "learning_rate": 2.7835051546391755e-05,
1863
+ "epoch": 328.96,
1864
+ "step": 1315
1865
+ },
1866
+ {
1867
+ "loss": 0.0268,
1868
+ "grad_norm": 0.07045339792966843,
1869
+ "learning_rate": 2.754868270332188e-05,
1870
+ "epoch": 330.0,
1871
+ "step": 1320
1872
+ },
1873
+ {
1874
+ "loss": 0.0285,
1875
+ "grad_norm": 0.029422452673316002,
1876
+ "learning_rate": 2.7262313860252005e-05,
1877
+ "epoch": 331.32,
1878
+ "step": 1325
1879
+ },
1880
+ {
1881
+ "loss": 0.027,
1882
+ "grad_norm": 0.025272730737924576,
1883
+ "learning_rate": 2.697594501718213e-05,
1884
+ "epoch": 332.64,
1885
+ "step": 1330
1886
+ },
1887
+ {
1888
+ "loss": 0.0281,
1889
+ "grad_norm": 0.03468950465321541,
1890
+ "learning_rate": 2.6689576174112258e-05,
1891
+ "epoch": 333.96,
1892
+ "step": 1335
1893
+ },
1894
+ {
1895
+ "loss": 0.0283,
1896
+ "grad_norm": 0.1138090044260025,
1897
+ "learning_rate": 2.6403207331042384e-05,
1898
+ "epoch": 335.0,
1899
+ "step": 1340
1900
+ },
1901
+ {
1902
+ "loss": 0.0292,
1903
+ "grad_norm": 0.0285523422062397,
1904
+ "learning_rate": 2.611683848797251e-05,
1905
+ "epoch": 336.32,
1906
+ "step": 1345
1907
+ },
1908
+ {
1909
+ "loss": 0.0288,
1910
+ "grad_norm": 0.034624941647052765,
1911
+ "learning_rate": 2.5830469644902637e-05,
1912
+ "epoch": 337.64,
1913
+ "step": 1350
1914
+ },
1915
+ {
1916
+ "loss": 0.0262,
1917
+ "grad_norm": 0.03252566233277321,
1918
+ "learning_rate": 2.554410080183276e-05,
1919
+ "epoch": 338.96,
1920
+ "step": 1355
1921
+ },
1922
+ {
1923
+ "loss": 0.0278,
1924
+ "grad_norm": 0.10238504409790039,
1925
+ "learning_rate": 2.5257731958762887e-05,
1926
+ "epoch": 340.0,
1927
+ "step": 1360
1928
+ },
1929
+ {
1930
+ "loss": 0.0287,
1931
+ "grad_norm": 0.028706086799502373,
1932
+ "learning_rate": 2.4971363115693013e-05,
1933
+ "epoch": 341.32,
1934
+ "step": 1365
1935
+ },
1936
+ {
1937
+ "loss": 0.0288,
1938
+ "grad_norm": 0.03616653010249138,
1939
+ "learning_rate": 2.468499427262314e-05,
1940
+ "epoch": 342.64,
1941
+ "step": 1370
1942
+ },
1943
+ {
1944
+ "loss": 0.0282,
1945
+ "grad_norm": 0.033927544951438904,
1946
+ "learning_rate": 2.4398625429553266e-05,
1947
+ "epoch": 343.96,
1948
+ "step": 1375
1949
+ },
1950
+ {
1951
+ "loss": 0.0289,
1952
+ "grad_norm": 0.12410403043031693,
1953
+ "learning_rate": 2.4112256586483393e-05,
1954
+ "epoch": 345.0,
1955
+ "step": 1380
1956
+ },
1957
+ {
1958
+ "loss": 0.0285,
1959
+ "grad_norm": 0.033267851918935776,
1960
+ "learning_rate": 2.3825887743413516e-05,
1961
+ "epoch": 346.32,
1962
+ "step": 1385
1963
+ },
1964
+ {
1965
+ "loss": 0.0274,
1966
+ "grad_norm": 0.028466830030083656,
1967
+ "learning_rate": 2.3539518900343642e-05,
1968
+ "epoch": 347.64,
1969
+ "step": 1390
1970
+ },
1971
+ {
1972
+ "loss": 0.0289,
1973
+ "grad_norm": 0.0284014530479908,
1974
+ "learning_rate": 2.3253150057273772e-05,
1975
+ "epoch": 348.96,
1976
+ "step": 1395
1977
+ },
1978
+ {
1979
+ "loss": 0.0288,
1980
+ "grad_norm": 0.10417843610048294,
1981
+ "learning_rate": 2.2966781214203895e-05,
1982
+ "epoch": 350.0,
1983
+ "step": 1400
1984
+ },
1985
+ {
1986
+ "loss": 0.028,
1987
+ "grad_norm": 0.02494928613305092,
1988
+ "learning_rate": 2.268041237113402e-05,
1989
+ "epoch": 351.32,
1990
+ "step": 1405
1991
+ },
1992
+ {
1993
+ "loss": 0.0288,
1994
+ "grad_norm": 0.027743646875023842,
1995
+ "learning_rate": 2.2394043528064148e-05,
1996
+ "epoch": 352.64,
1997
+ "step": 1410
1998
+ },
1999
+ {
2000
+ "loss": 0.0268,
2001
+ "grad_norm": 0.037426408380270004,
2002
+ "learning_rate": 2.210767468499427e-05,
2003
+ "epoch": 353.96,
2004
+ "step": 1415
2005
+ },
2006
+ {
2007
+ "loss": 0.0281,
2008
+ "grad_norm": 0.06390511989593506,
2009
+ "learning_rate": 2.18213058419244e-05,
2010
+ "epoch": 355.0,
2011
+ "step": 1420
2012
+ },
2013
+ {
2014
+ "loss": 0.0267,
2015
+ "grad_norm": 0.02651941403746605,
2016
+ "learning_rate": 2.1534936998854528e-05,
2017
+ "epoch": 356.32,
2018
+ "step": 1425
2019
+ },
2020
+ {
2021
+ "loss": 0.0278,
2022
+ "grad_norm": 0.027626991271972656,
2023
+ "learning_rate": 2.124856815578465e-05,
2024
+ "epoch": 357.64,
2025
+ "step": 1430
2026
+ },
2027
+ {
2028
+ "loss": 0.0289,
2029
+ "grad_norm": 0.0289900004863739,
2030
+ "learning_rate": 2.0962199312714777e-05,
2031
+ "epoch": 358.96,
2032
+ "step": 1435
2033
+ },
2034
+ {
2035
+ "loss": 0.0254,
2036
+ "grad_norm": 0.08335373550653458,
2037
+ "learning_rate": 2.0675830469644904e-05,
2038
+ "epoch": 360.0,
2039
+ "step": 1440
2040
+ },
2041
+ {
2042
+ "loss": 0.0276,
2043
+ "grad_norm": 0.02882411703467369,
2044
+ "learning_rate": 2.038946162657503e-05,
2045
+ "epoch": 361.32,
2046
+ "step": 1445
2047
+ },
2048
+ {
2049
+ "loss": 0.0273,
2050
+ "grad_norm": 0.029498135671019554,
2051
+ "learning_rate": 2.0103092783505157e-05,
2052
+ "epoch": 362.64,
2053
+ "step": 1450
2054
+ },
2055
+ {
2056
+ "loss": 0.0266,
2057
+ "grad_norm": 0.030006349086761475,
2058
+ "learning_rate": 1.981672394043528e-05,
2059
+ "epoch": 363.96,
2060
+ "step": 1455
2061
+ },
2062
+ {
2063
+ "loss": 0.0272,
2064
+ "grad_norm": 0.08131309598684311,
2065
+ "learning_rate": 1.9530355097365406e-05,
2066
+ "epoch": 365.0,
2067
+ "step": 1460
2068
+ },
2069
+ {
2070
+ "loss": 0.0267,
2071
+ "grad_norm": 0.028547124937176704,
2072
+ "learning_rate": 1.9243986254295536e-05,
2073
+ "epoch": 366.32,
2074
+ "step": 1465
2075
+ },
2076
+ {
2077
+ "loss": 0.0273,
2078
+ "grad_norm": 0.027747539803385735,
2079
+ "learning_rate": 1.895761741122566e-05,
2080
+ "epoch": 367.64,
2081
+ "step": 1470
2082
+ },
2083
+ {
2084
+ "loss": 0.0293,
2085
+ "grad_norm": 0.032853253185749054,
2086
+ "learning_rate": 1.8671248568155786e-05,
2087
+ "epoch": 368.96,
2088
+ "step": 1475
2089
+ },
2090
+ {
2091
+ "loss": 0.027,
2092
+ "grad_norm": 0.10667946934700012,
2093
+ "learning_rate": 1.8384879725085912e-05,
2094
+ "epoch": 370.0,
2095
+ "step": 1480
2096
+ },
2097
+ {
2098
+ "loss": 0.0268,
2099
+ "grad_norm": 0.027019130066037178,
2100
+ "learning_rate": 1.809851088201604e-05,
2101
+ "epoch": 371.32,
2102
+ "step": 1485
2103
+ },
2104
+ {
2105
+ "loss": 0.0303,
2106
+ "grad_norm": 0.02968420460820198,
2107
+ "learning_rate": 1.7812142038946165e-05,
2108
+ "epoch": 372.64,
2109
+ "step": 1490
2110
+ },
2111
+ {
2112
+ "loss": 0.0285,
2113
+ "grad_norm": 0.03141555190086365,
2114
+ "learning_rate": 1.7525773195876288e-05,
2115
+ "epoch": 373.96,
2116
+ "step": 1495
2117
+ },
2118
+ {
2119
+ "loss": 0.0293,
2120
+ "grad_norm": 0.1068948432803154,
2121
+ "learning_rate": 1.7239404352806415e-05,
2122
+ "epoch": 375.0,
2123
+ "step": 1500
2124
+ },
2125
+ {
2126
+ "loss": 0.0276,
2127
+ "grad_norm": 0.03410301357507706,
2128
+ "learning_rate": 1.695303550973654e-05,
2129
+ "epoch": 376.32,
2130
+ "step": 1505
2131
+ },
2132
+ {
2133
+ "loss": 0.029,
2134
+ "grad_norm": 0.03133257105946541,
2135
+ "learning_rate": 1.6666666666666667e-05,
2136
+ "epoch": 377.64,
2137
+ "step": 1510
2138
+ },
2139
+ {
2140
+ "loss": 0.0278,
2141
+ "grad_norm": 0.028733504936099052,
2142
+ "learning_rate": 1.6380297823596794e-05,
2143
+ "epoch": 378.96,
2144
+ "step": 1515
2145
+ },
2146
+ {
2147
+ "loss": 0.027,
2148
+ "grad_norm": 0.10409895330667496,
2149
+ "learning_rate": 1.609392898052692e-05,
2150
+ "epoch": 380.0,
2151
+ "step": 1520
2152
+ },
2153
+ {
2154
+ "loss": 0.0302,
2155
+ "grad_norm": 0.036405060440301895,
2156
+ "learning_rate": 1.5807560137457044e-05,
2157
+ "epoch": 381.32,
2158
+ "step": 1525
2159
+ },
2160
+ {
2161
+ "loss": 0.0272,
2162
+ "grad_norm": 0.027341334149241447,
2163
+ "learning_rate": 1.5521191294387173e-05,
2164
+ "epoch": 382.64,
2165
+ "step": 1530
2166
+ },
2167
+ {
2168
+ "loss": 0.0279,
2169
+ "grad_norm": 0.039175573736429214,
2170
+ "learning_rate": 1.5234822451317298e-05,
2171
+ "epoch": 383.96,
2172
+ "step": 1535
2173
+ },
2174
+ {
2175
+ "loss": 0.0285,
2176
+ "grad_norm": 0.12478016316890717,
2177
+ "learning_rate": 1.4948453608247423e-05,
2178
+ "epoch": 385.0,
2179
+ "step": 1540
2180
+ },
2181
+ {
2182
+ "loss": 0.0289,
2183
+ "grad_norm": 0.029726864770054817,
2184
+ "learning_rate": 1.466208476517755e-05,
2185
+ "epoch": 386.32,
2186
+ "step": 1545
2187
+ },
2188
+ {
2189
+ "loss": 0.0269,
2190
+ "grad_norm": 0.03192641958594322,
2191
+ "learning_rate": 1.4375715922107674e-05,
2192
+ "epoch": 387.64,
2193
+ "step": 1550
2194
+ },
2195
+ {
2196
+ "loss": 0.0281,
2197
+ "grad_norm": 0.03215065971016884,
2198
+ "learning_rate": 1.40893470790378e-05,
2199
+ "epoch": 388.96,
2200
+ "step": 1555
2201
+ },
2202
+ {
2203
+ "loss": 0.0296,
2204
+ "grad_norm": 0.11953844130039215,
2205
+ "learning_rate": 1.3802978235967929e-05,
2206
+ "epoch": 390.0,
2207
+ "step": 1560
2208
+ },
2209
+ {
2210
+ "loss": 0.0278,
2211
+ "grad_norm": 0.027522824704647064,
2212
+ "learning_rate": 1.3516609392898052e-05,
2213
+ "epoch": 391.32,
2214
+ "step": 1565
2215
+ },
2216
+ {
2217
+ "loss": 0.0292,
2218
+ "grad_norm": 0.037742115557193756,
2219
+ "learning_rate": 1.323024054982818e-05,
2220
+ "epoch": 392.64,
2221
+ "step": 1570
2222
+ },
2223
+ {
2224
+ "loss": 0.0264,
2225
+ "grad_norm": 0.02829778380692005,
2226
+ "learning_rate": 1.2943871706758307e-05,
2227
+ "epoch": 393.96,
2228
+ "step": 1575
2229
+ },
2230
+ {
2231
+ "loss": 0.0305,
2232
+ "grad_norm": 0.12115279585123062,
2233
+ "learning_rate": 1.2657502863688431e-05,
2234
+ "epoch": 395.0,
2235
+ "step": 1580
2236
+ },
2237
+ {
2238
+ "loss": 0.0273,
2239
+ "grad_norm": 0.026462797075510025,
2240
+ "learning_rate": 1.2371134020618558e-05,
2241
+ "epoch": 396.32,
2242
+ "step": 1585
2243
+ },
2244
+ {
2245
+ "loss": 0.0291,
2246
+ "grad_norm": 0.03455578163266182,
2247
+ "learning_rate": 1.2084765177548683e-05,
2248
+ "epoch": 397.64,
2249
+ "step": 1590
2250
+ },
2251
+ {
2252
+ "loss": 0.0263,
2253
+ "grad_norm": 0.03112473525106907,
2254
+ "learning_rate": 1.1798396334478809e-05,
2255
+ "epoch": 398.96,
2256
+ "step": 1595
2257
+ },
2258
+ {
2259
+ "loss": 0.03,
2260
+ "grad_norm": 0.13244664669036865,
2261
+ "learning_rate": 1.1512027491408934e-05,
2262
+ "epoch": 400.0,
2263
+ "step": 1600
2264
+ },
2265
+ {
2266
+ "loss": 0.0293,
2267
+ "grad_norm": 0.030842171981930733,
2268
+ "learning_rate": 1.1225658648339062e-05,
2269
+ "epoch": 401.32,
2270
+ "step": 1605
2271
+ },
2272
+ {
2273
+ "loss": 0.0269,
2274
+ "grad_norm": 0.02750714123249054,
2275
+ "learning_rate": 1.0939289805269187e-05,
2276
+ "epoch": 402.64,
2277
+ "step": 1610
2278
+ },
2279
+ {
2280
+ "loss": 0.0274,
2281
+ "grad_norm": 0.027868203818798065,
2282
+ "learning_rate": 1.0652920962199313e-05,
2283
+ "epoch": 403.96,
2284
+ "step": 1615
2285
+ },
2286
+ {
2287
+ "loss": 0.0287,
2288
+ "grad_norm": 0.08533693850040436,
2289
+ "learning_rate": 1.036655211912944e-05,
2290
+ "epoch": 405.0,
2291
+ "step": 1620
2292
+ },
2293
+ {
2294
+ "loss": 0.0284,
2295
+ "grad_norm": 0.036794379353523254,
2296
+ "learning_rate": 1.0080183276059566e-05,
2297
+ "epoch": 406.32,
2298
+ "step": 1625
2299
+ },
2300
+ {
2301
+ "loss": 0.0264,
2302
+ "grad_norm": 0.03149307146668434,
2303
+ "learning_rate": 9.793814432989691e-06,
2304
+ "epoch": 407.64,
2305
+ "step": 1630
2306
+ },
2307
+ {
2308
+ "loss": 0.0284,
2309
+ "grad_norm": 0.03569972142577171,
2310
+ "learning_rate": 9.507445589919818e-06,
2311
+ "epoch": 408.96,
2312
+ "step": 1635
2313
+ },
2314
+ {
2315
+ "loss": 0.0276,
2316
+ "grad_norm": 0.10384050011634827,
2317
+ "learning_rate": 9.221076746849944e-06,
2318
+ "epoch": 410.0,
2319
+ "step": 1640
2320
+ },
2321
+ {
2322
+ "loss": 0.0272,
2323
+ "grad_norm": 0.028333071619272232,
2324
+ "learning_rate": 8.934707903780069e-06,
2325
+ "epoch": 411.32,
2326
+ "step": 1645
2327
+ },
2328
+ {
2329
+ "loss": 0.0295,
2330
+ "grad_norm": 0.028478605672717094,
2331
+ "learning_rate": 8.648339060710195e-06,
2332
+ "epoch": 412.64,
2333
+ "step": 1650
2334
+ },
2335
+ {
2336
+ "loss": 0.0295,
2337
+ "grad_norm": 0.028093887493014336,
2338
+ "learning_rate": 8.36197021764032e-06,
2339
+ "epoch": 413.96,
2340
+ "step": 1655
2341
+ },
2342
+ {
2343
+ "loss": 0.0286,
2344
+ "grad_norm": 0.10948823392391205,
2345
+ "learning_rate": 8.075601374570448e-06,
2346
+ "epoch": 415.0,
2347
+ "step": 1660
2348
+ },
2349
+ {
2350
+ "loss": 0.0266,
2351
+ "grad_norm": 0.02955321967601776,
2352
+ "learning_rate": 7.789232531500573e-06,
2353
+ "epoch": 416.32,
2354
+ "step": 1665
2355
+ },
2356
+ {
2357
+ "loss": 0.0273,
2358
+ "grad_norm": 0.02912413887679577,
2359
+ "learning_rate": 7.502863688430699e-06,
2360
+ "epoch": 417.64,
2361
+ "step": 1670
2362
+ },
2363
+ {
2364
+ "loss": 0.0266,
2365
+ "grad_norm": 0.028283055871725082,
2366
+ "learning_rate": 7.216494845360824e-06,
2367
+ "epoch": 418.96,
2368
+ "step": 1675
2369
+ },
2370
+ {
2371
+ "loss": 0.0282,
2372
+ "grad_norm": 0.090940460562706,
2373
+ "learning_rate": 6.930126002290952e-06,
2374
+ "epoch": 420.0,
2375
+ "step": 1680
2376
+ },
2377
+ {
2378
+ "loss": 0.027,
2379
+ "grad_norm": 0.03360769525170326,
2380
+ "learning_rate": 6.643757159221077e-06,
2381
+ "epoch": 421.32,
2382
+ "step": 1685
2383
+ },
2384
+ {
2385
+ "loss": 0.0274,
2386
+ "grad_norm": 0.029777785763144493,
2387
+ "learning_rate": 6.357388316151203e-06,
2388
+ "epoch": 422.64,
2389
+ "step": 1690
2390
+ },
2391
+ {
2392
+ "loss": 0.0273,
2393
+ "grad_norm": 0.03204215317964554,
2394
+ "learning_rate": 6.071019473081329e-06,
2395
+ "epoch": 423.96,
2396
+ "step": 1695
2397
+ },
2398
+ {
2399
+ "loss": 0.0275,
2400
+ "grad_norm": 0.1337508112192154,
2401
+ "learning_rate": 5.784650630011455e-06,
2402
+ "epoch": 425.0,
2403
+ "step": 1700
2404
+ },
2405
+ {
2406
+ "loss": 0.0274,
2407
+ "grad_norm": 0.03454073518514633,
2408
+ "learning_rate": 5.498281786941581e-06,
2409
+ "epoch": 426.32,
2410
+ "step": 1705
2411
+ },
2412
+ {
2413
+ "loss": 0.0319,
2414
+ "grad_norm": 0.029586778953671455,
2415
+ "learning_rate": 5.211912943871707e-06,
2416
+ "epoch": 427.64,
2417
+ "step": 1710
2418
+ },
2419
+ {
2420
+ "loss": 0.0297,
2421
+ "grad_norm": 0.02780616097152233,
2422
+ "learning_rate": 4.925544100801833e-06,
2423
+ "epoch": 428.96,
2424
+ "step": 1715
2425
+ },
2426
+ {
2427
+ "loss": 0.0309,
2428
+ "grad_norm": 0.13715778291225433,
2429
+ "learning_rate": 4.639175257731959e-06,
2430
+ "epoch": 430.0,
2431
+ "step": 1720
2432
+ },
2433
+ {
2434
+ "loss": 0.0276,
2435
+ "grad_norm": 0.03031608648598194,
2436
+ "learning_rate": 4.352806414662085e-06,
2437
+ "epoch": 431.32,
2438
+ "step": 1725
2439
+ },
2440
+ {
2441
+ "loss": 0.0265,
2442
+ "grad_norm": 0.031075894832611084,
2443
+ "learning_rate": 4.066437571592211e-06,
2444
+ "epoch": 432.64,
2445
+ "step": 1730
2446
+ },
2447
+ {
2448
+ "loss": 0.0271,
2449
+ "grad_norm": 0.02886197902262211,
2450
+ "learning_rate": 3.7800687285223365e-06,
2451
+ "epoch": 433.96,
2452
+ "step": 1735
2453
+ },
2454
+ {
2455
+ "loss": 0.0277,
2456
+ "grad_norm": 0.09652125835418701,
2457
+ "learning_rate": 3.493699885452463e-06,
2458
+ "epoch": 435.0,
2459
+ "step": 1740
2460
+ },
2461
+ {
2462
+ "loss": 0.0289,
2463
+ "grad_norm": 0.027949687093496323,
2464
+ "learning_rate": 3.2073310423825886e-06,
2465
+ "epoch": 436.32,
2466
+ "step": 1745
2467
+ },
2468
+ {
2469
+ "loss": 0.0282,
2470
+ "grad_norm": 0.026798376813530922,
2471
+ "learning_rate": 2.920962199312715e-06,
2472
+ "epoch": 437.64,
2473
+ "step": 1750
2474
+ },
2475
+ {
2476
+ "loss": 0.0303,
2477
+ "grad_norm": 0.032906703650951385,
2478
+ "learning_rate": 2.6345933562428407e-06,
2479
+ "epoch": 438.96,
2480
+ "step": 1755
2481
+ },
2482
+ {
2483
+ "loss": 0.0257,
2484
+ "grad_norm": 0.08160939812660217,
2485
+ "learning_rate": 2.3482245131729668e-06,
2486
+ "epoch": 440.0,
2487
+ "step": 1760
2488
+ },
2489
+ {
2490
+ "loss": 0.0281,
2491
+ "grad_norm": 0.03216954320669174,
2492
+ "learning_rate": 2.061855670103093e-06,
2493
+ "epoch": 441.32,
2494
+ "step": 1765
2495
+ },
2496
+ {
2497
+ "loss": 0.0291,
2498
+ "grad_norm": 0.03425678610801697,
2499
+ "learning_rate": 1.7754868270332189e-06,
2500
+ "epoch": 442.64,
2501
+ "step": 1770
2502
+ },
2503
+ {
2504
+ "loss": 0.0271,
2505
+ "grad_norm": 0.02900947816669941,
2506
+ "learning_rate": 1.4891179839633447e-06,
2507
+ "epoch": 443.96,
2508
+ "step": 1775
2509
+ },
2510
+ {
2511
+ "loss": 0.0293,
2512
+ "grad_norm": 0.09323178231716156,
2513
+ "learning_rate": 1.202749140893471e-06,
2514
+ "epoch": 445.0,
2515
+ "step": 1780
2516
+ },
2517
+ {
2518
+ "loss": 0.0266,
2519
+ "grad_norm": 0.028956923633813858,
2520
+ "learning_rate": 9.163802978235968e-07,
2521
+ "epoch": 446.32,
2522
+ "step": 1785
2523
+ },
2524
+ {
2525
+ "loss": 0.0278,
2526
+ "grad_norm": 0.029016662389039993,
2527
+ "learning_rate": 6.300114547537229e-07,
2528
+ "epoch": 447.64,
2529
+ "step": 1790
2530
+ },
2531
+ {
2532
+ "loss": 0.0282,
2533
+ "grad_norm": 0.03088531456887722,
2534
+ "learning_rate": 3.436426116838488e-07,
2535
+ "epoch": 448.96,
2536
+ "step": 1795
2537
+ },
2538
+ {
2539
+ "loss": 0.028,
2540
+ "grad_norm": 0.11458810418844223,
2541
+ "learning_rate": 5.72737686139748e-08,
2542
+ "epoch": 450.0,
2543
+ "step": 1800
2544
+ },
2545
+ {
2546
+ "train_runtime": 31004.9647,
2547
+ "train_samples_per_second": 1.858,
2548
+ "train_steps_per_second": 0.058,
2549
+ "total_flos": 1.5308141101056e+18,
2550
+ "train_loss": 0.054099425789382725,
2551
+ "epoch": 450.0,
2552
+ "step": 1800
2553
+ }
2554
+ ],
2555
+ "training_args": {
2556
+ "output_dir": "/hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/自己训练lora/train_lora/outputs/Mu-Math/group_01/checkpoints",
2557
+ "overwrite_output_dir": false,
2558
+ "do_train": false,
2559
+ "do_eval": false,
2560
+ "do_predict": false,
2561
+ "eval_strategy": "no",
2562
+ "prediction_loss_only": false,
2563
+ "per_device_train_batch_size": 2,
2564
+ "per_device_eval_batch_size": 8,
2565
+ "per_gpu_train_batch_size": null,
2566
+ "per_gpu_eval_batch_size": null,
2567
+ "gradient_accumulation_steps": 16,
2568
+ "eval_accumulation_steps": null,
2569
+ "eval_delay": 0,
2570
+ "torch_empty_cache_steps": null,
2571
+ "learning_rate": 0.0001,
2572
+ "weight_decay": 0.01,
2573
+ "adam_beta1": 0.9,
2574
+ "adam_beta2": 0.999,
2575
+ "adam_epsilon": 1e-08,
2576
+ "max_grad_norm": 1.0,
2577
+ "num_train_epochs": 12,
2578
+ "max_steps": 1800,
2579
+ "lr_scheduler_type": "linear",
2580
+ "lr_scheduler_kwargs": {},
2581
+ "warmup_ratio": 0.03,
2582
+ "warmup_steps": 0,
2583
+ "log_level": "passive",
2584
+ "log_level_replica": "warning",
2585
+ "log_on_each_node": true,
2586
+ "logging_dir": "/hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/自己训练lora/train_lora/logs/Mu-Math/group_01",
2587
+ "logging_strategy": "steps",
2588
+ "logging_first_step": true,
2589
+ "logging_steps": 5,
2590
+ "logging_nan_inf_filter": true,
2591
+ "save_strategy": "steps",
2592
+ "save_steps": 300,
2593
+ "save_total_limit": 6,
2594
+ "save_safetensors": true,
2595
+ "save_on_each_node": false,
2596
+ "save_only_model": false,
2597
+ "restore_callback_states_from_checkpoint": false,
2598
+ "no_cuda": false,
2599
+ "use_cpu": false,
2600
+ "use_mps_device": false,
2601
+ "seed": 42,
2602
+ "data_seed": null,
2603
+ "jit_mode_eval": false,
2604
+ "bf16": true,
2605
+ "fp16": false,
2606
+ "fp16_opt_level": "O1",
2607
+ "half_precision_backend": "auto",
2608
+ "bf16_full_eval": false,
2609
+ "fp16_full_eval": false,
2610
+ "tf32": null,
2611
+ "local_rank": 0,
2612
+ "ddp_backend": null,
2613
+ "tpu_num_cores": null,
2614
+ "tpu_metrics_debug": false,
2615
+ "debug": [],
2616
+ "dataloader_drop_last": false,
2617
+ "eval_steps": null,
2618
+ "dataloader_num_workers": 0,
2619
+ "dataloader_prefetch_factor": null,
2620
+ "past_index": -1,
2621
+ "run_name": null,
2622
+ "disable_tqdm": false,
2623
+ "remove_unused_columns": true,
2624
+ "label_names": null,
2625
+ "load_best_model_at_end": false,
2626
+ "metric_for_best_model": null,
2627
+ "greater_is_better": null,
2628
+ "ignore_data_skip": false,
2629
+ "fsdp": [],
2630
+ "fsdp_min_num_params": 0,
2631
+ "fsdp_config": {
2632
+ "min_num_params": 0,
2633
+ "xla": false,
2634
+ "xla_fsdp_v2": false,
2635
+ "xla_fsdp_grad_ckpt": false
2636
+ },
2637
+ "fsdp_transformer_layer_cls_to_wrap": null,
2638
+ "accelerator_config": {
2639
+ "split_batches": false,
2640
+ "dispatch_batches": null,
2641
+ "even_batches": true,
2642
+ "use_seedable_sampler": true,
2643
+ "non_blocking": false,
2644
+ "gradient_accumulation_kwargs": null
2645
+ },
2646
+ "parallelism_config": null,
2647
+ "deepspeed": null,
2648
+ "label_smoothing_factor": 0.0,
2649
+ "optim": "adamw_torch",
2650
+ "optim_args": null,
2651
+ "adafactor": false,
2652
+ "group_by_length": false,
2653
+ "length_column_name": "length",
2654
+ "report_to": [],
2655
+ "project": "huggingface",
2656
+ "trackio_space_id": "trackio",
2657
+ "ddp_find_unused_parameters": null,
2658
+ "ddp_bucket_cap_mb": null,
2659
+ "ddp_broadcast_buffers": null,
2660
+ "dataloader_pin_memory": true,
2661
+ "dataloader_persistent_workers": false,
2662
+ "skip_memory_metrics": true,
2663
+ "use_legacy_prediction_loop": false,
2664
+ "push_to_hub": false,
2665
+ "resume_from_checkpoint": null,
2666
+ "hub_model_id": null,
2667
+ "hub_strategy": "every_save",
2668
+ "hub_token": "<HUB_TOKEN>",
2669
+ "hub_private_repo": null,
2670
+ "hub_always_push": false,
2671
+ "hub_revision": null,
2672
+ "gradient_checkpointing": true,
2673
+ "gradient_checkpointing_kwargs": null,
2674
+ "include_inputs_for_metrics": false,
2675
+ "include_for_metrics": [],
2676
+ "eval_do_concat_batches": true,
2677
+ "fp16_backend": "auto",
2678
+ "push_to_hub_model_id": null,
2679
+ "push_to_hub_organization": null,
2680
+ "push_to_hub_token": "<PUSH_TO_HUB_TOKEN>",
2681
+ "mp_parameters": "",
2682
+ "auto_find_batch_size": false,
2683
+ "full_determinism": false,
2684
+ "torchdynamo": null,
2685
+ "ray_scope": "last",
2686
+ "ddp_timeout": 1800,
2687
+ "torch_compile": false,
2688
+ "torch_compile_backend": null,
2689
+ "torch_compile_mode": null,
2690
+ "include_tokens_per_second": false,
2691
+ "include_num_input_tokens_seen": "no",
2692
+ "neftune_noise_alpha": null,
2693
+ "optim_target_modules": null,
2694
+ "batch_eval_metrics": false,
2695
+ "eval_on_start": false,
2696
+ "use_liger_kernel": false,
2697
+ "liger_kernel_config": null,
2698
+ "eval_use_gather_object": false,
2699
+ "average_tokens_across_devices": true
2700
+ },
2701
+ "lora_config": {
2702
+ "r": 64,
2703
+ "alpha": 128,
2704
+ "dropout": 0.05,
2705
+ "target_modules": [
2706
+ "q_proj",
2707
+ "k_proj",
2708
+ "v_proj",
2709
+ "o_proj",
2710
+ "gate_proj",
2711
+ "up_proj",
2712
+ "down_proj"
2713
+ ]
2714
+ },
2715
+ "effective_batch_size": 32,
2716
+ "world_size": 1,
2717
+ "git_commit": ""
2718
+ }
Mu-Math/group_01/prompt_group.json ADDED
@@ -0,0 +1,613 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_name": "Mu-Math",
3
+ "group_index": 1,
4
+ "source_file": "/hkfs/work/workspace/scratch/tum_fmp0582-dndworkspace/自己训练lora/prepare/data/math/Mu-Math.json",
5
+ "selected_indices": [
6
+ 13,
7
+ 25,
8
+ 39,
9
+ 56,
10
+ 77,
11
+ 81,
12
+ 108,
13
+ 118,
14
+ 124,
15
+ 189,
16
+ 198,
17
+ 205,
18
+ 206,
19
+ 210,
20
+ 217,
21
+ 232,
22
+ 233,
23
+ 236,
24
+ 259,
25
+ 274,
26
+ 275,
27
+ 283,
28
+ 287,
29
+ 291,
30
+ 301,
31
+ 303,
32
+ 334,
33
+ 353,
34
+ 371,
35
+ 375,
36
+ 385,
37
+ 390,
38
+ 397,
39
+ 410,
40
+ 412,
41
+ 422,
42
+ 428,
43
+ 435,
44
+ 438,
45
+ 450,
46
+ 467,
47
+ 471,
48
+ 474,
49
+ 476,
50
+ 477,
51
+ 480,
52
+ 481,
53
+ 496,
54
+ 499,
55
+ 502,
56
+ 503,
57
+ 517,
58
+ 524,
59
+ 534,
60
+ 549,
61
+ 568,
62
+ 574,
63
+ 583,
64
+ 586,
65
+ 600,
66
+ 617,
67
+ 632,
68
+ 640,
69
+ 644,
70
+ 656,
71
+ 669,
72
+ 680,
73
+ 687,
74
+ 706,
75
+ 709,
76
+ 710,
77
+ 719,
78
+ 745,
79
+ 755,
80
+ 762,
81
+ 768,
82
+ 771,
83
+ 776,
84
+ 777,
85
+ 787,
86
+ 801,
87
+ 837,
88
+ 849,
89
+ 854,
90
+ 927,
91
+ 928,
92
+ 936,
93
+ 949,
94
+ 964,
95
+ 978,
96
+ 1007,
97
+ 1014,
98
+ 1015,
99
+ 1018,
100
+ 1031,
101
+ 1036,
102
+ 1052,
103
+ 1058,
104
+ 1066,
105
+ 1075
106
+ ],
107
+ "total_records": 1084,
108
+ "sample_count": 100,
109
+ "generated_at": "2025-11-05T12:58:28Z",
110
+ "seed": 4171135963,
111
+ "samples": [
112
+ {
113
+ "prompt": "Compute the integral: $\\int{\\frac{ 2 \\cdot x^3 }{ \\sqrt{x^2+16} } d x}$.",
114
+ "response": "$\\int{\\frac{ 2 \\cdot x^3 }{ \\sqrt{x^2+16} } d x}$ =$C+128\\cdot\\left(\\frac{1}{3\\cdot\\left(\\cos\\left(\\arctan\\left(\\frac{x}{4}\\right)\\right)\\right)^3}-\\frac{1}{\\cos\\left(\\arctan\\left(\\frac{x}{4}\\right)\\right)}\\right)$",
115
+ "system": ""
116
+ },
117
+ {
118
+ "prompt": "Sketch the curve: \n\n$y=5 \\cdot x \\cdot \\sqrt{4-x^2}$. \n\nSubmit as your final answer:\n\n1. The domain (in interval notation)\n2. Vertical asymptotes\n3. Horizontal asymptotes\n4. Slant asymptotes\n5. Intervals where the function is increasing\n6. Intervals where the function is decreasing\n7. Intervals where the function is concave up\n8. Intervals where the function is concave down\n9. Points of inflection",
119
+ "response": "This is the final answer to the problem: \n1. The domain (in interval notation): $[-2,2]$\n2. Vertical asymptotes: None\n3. Horizontal asymptotes: None\n4. Slant asymptotes: None\n5. Intervals where the function is increasing: $\\left(-\\sqrt{2},\\sqrt{2}\\right)$\n6. Intervals where the function is decreasing: $\\left(-2,-\\sqrt{2}\\right), \\left(\\sqrt{2},2\\right)$\n7. Intervals where the function is concave up: $(-2,0)$\n8. Intervals where the function is concave down: $(0,2)$\n9. Points of inflection: $P(0,0)$",
120
+ "system": ""
121
+ },
122
+ {
123
+ "prompt": "Region $R$ is the region in the first quadrant bounded by the graphs of $y=2 \\cdot x$ and $y=x^2$.\n\n1. Write, but do not evaluate, an integral expression that gives the volume of the solid generated from revolving the region $R$ about the vertical line $x=3$.\n2. Write, but do not evaluate, an integral expression that gives the volume of the solid generated from revolving the region $R$ about the vertical line $x=-2$.",
124
+ "response": "1. $V$ = $\\int_0^4\\pi\\cdot\\left(\\left(3-\\frac{1}{2}\\cdot y\\right)^2-\\left(3-\\sqrt{y}\\right)^2\\right)dy$\n2. $V$ = $\\int_0^4\\pi\\cdot\\left(\\left(-2-\\sqrt{y}\\right)^2-\\left(-2-\\frac{1}{2}\\cdot y\\right)^2\\right)dy$",
125
+ "system": ""
126
+ },
127
+ {
128
+ "prompt": "Find a rectangular equation which is equivalent to the following parametric equations:\n\n$x^2=t^3-3 \\cdot t^2+3 \\cdot t-1$ \n\n$y^2=t^3+6 \\cdot t^2+12 \\cdot t+8$",
129
+ "response": "This is the final answer to the problem: $\\sqrt[3]{x^2}-\\sqrt[3]{y^2}=-3$",
130
+ "system": ""
131
+ },
132
+ {
133
+ "prompt": "Solve the integral: $\\int{\\frac{ \\cos(x)^3 }{ \\sin(x)^9 } d x}$.",
134
+ "response": "$\\int{\\frac{ \\cos(x)^3 }{ \\sin(x)^9 } d x}$ =$C-\\left(\\frac{1}{3}\\cdot\\left(\\cot(x)\\right)^6+\\frac{1}{4}\\cdot\\left(\\cot(x)\\right)^4+\\frac{1}{8}\\cdot\\left(\\cot(x)\\right)^8\\right)$",
135
+ "system": ""
136
+ },
137
+ {
138
+ "prompt": "Find the radius of convergence and sum of the series: $\\frac{ 1 }{ 2 }+\\frac{ x }{ 1 \\cdot 3 }+\\frac{ x^2 }{ 1 \\cdot 2 \\cdot 4 }+\\cdots+\\frac{ x^n }{ \\left(n!\\right) \\cdot (n+2) }+\\cdots$ .",
139
+ "response": "This is the final answer to the problem: 1. Radius of convergence:$R=\\infty$\n2. Sum: $f(x)=\\begin{cases}\\frac{1}{x^2}+\\frac{x\\cdot e^x-e^x}{x^2},&x\\ne0\\\\\\frac{1}{2},&x=0\\end{cases}$",
140
+ "system": ""
141
+ },
142
+ {
143
+ "prompt": "Compute the integral: $\\int{\\frac{ 6 }{ \\sin(3 \\cdot x)^6 } d x}$.",
144
+ "response": "$\\int{\\frac{ 6 }{ \\sin(3 \\cdot x)^6 } d x}$ =$-\\frac{2\\cdot\\cos(3\\cdot x)}{5\\cdot\\sin(3\\cdot x)^5}+\\frac{24}{5}\\cdot\\left(-\\frac{\\cos(3\\cdot x)}{9\\cdot\\sin(3\\cdot x)^3}-\\frac{2}{9}\\cdot\\cot(3\\cdot x)\\right)+C$",
145
+ "system": ""
146
+ },
147
+ {
148
+ "prompt": "Calculate integral: $I=\\int{4 \\cdot \\cos\\left(3 \\cdot \\ln(2 \\cdot x)\\right) d x}$.",
149
+ "response": "This is the final answer to the problem: $\\frac{1}{10}\\cdot\\left(C+4\\cdot x\\cdot\\cos\\left(3\\cdot\\ln(2\\cdot x)\\right)+12\\cdot x\\cdot\\sin\\left(3\\cdot\\ln(2\\cdot x)\\right)\\right)$",
150
+ "system": ""
151
+ },
152
+ {
153
+ "prompt": "Calculate integral: $\\int_{\\frac{ 1 }{ 2 }}^{\\frac{ \\sqrt{3} }{ 2 }}{\\frac{ 1 }{ x \\cdot \\sqrt{9-9 \\cdot x^2} } d x}$.",
154
+ "response": "$$\\int_{\\frac{ 1 }{ 2 }}^{\\frac{ \\sqrt{3} }{ 2 }}{\\frac{ 1 }{ x \\cdot \\sqrt{9-9 \\cdot x^2} } d x}=\\frac{1}{6}\\ln\\left(\\frac{7}{3}+\\frac{4}{\\sqrt{3}}\\right)$$",
155
+ "system": ""
156
+ },
157
+ {
158
+ "prompt": "Find a normal vector and a tangent vector for $2 \\cdot x^3-x^2 \\cdot y^2=3 \\cdot x-y-7$ at point $P$ : $(1,-2)$",
159
+ "response": "Normal vector:$\\vec{N}=\\vec{i}-\\vec{j}$Tangent vector: $\\vec{T}=\\vec{i}+\\vec{j}$",
160
+ "system": ""
161
+ },
162
+ {
163
+ "prompt": "Find the extrema of a function $y=\\frac{ 2 \\cdot x^4 }{ 4 }-\\frac{ x^3 }{ 3 }-\\frac{ 3 \\cdot x^2 }{ 2 }+2$. Then determine the largest and smallest value of the function when $-2 \\le x \\le 4$.",
164
+ "response": "This is the final answer to the problem: \n\n1. Extrema points: $P\\left(\\frac{3}{2},\\frac{1}{32}\\right), P\\left(-1,\\frac{4}{3}\\right), P(0,2)$\n2. The largest value: $\\frac{254}{3}$\n3. The smallest value: $\\frac{1}{32}$",
165
+ "system": ""
166
+ },
167
+ {
168
+ "prompt": "Use the method of Lagrange multipliers to maximize $U(x,y)=8 \\cdot x^{\\frac{ 4 }{ 5 }} \\cdot y^{\\frac{ 1 }{ 5 }}$; $4 \\cdot x+2 \\cdot y=12$.",
169
+ "response": "Answer: maximum $16.715$ at $P(2.4,1.2)$",
170
+ "system": ""
171
+ },
172
+ {
173
+ "prompt": "Use the method of Lagrange multipliers to maximize $U(x,y)=8 \\cdot x^{\\frac{ 4 }{ 5 }} \\cdot y^{\\frac{ 1 }{ 5 }}$; $4 \\cdot x+2 \\cdot y=12$.",
174
+ "response": "Answer: maximum $16.715$ at $P(2.4,1.2)$",
175
+ "system": ""
176
+ },
177
+ {
178
+ "prompt": "Find $\\frac{ d y }{d x}$ for $y=x \\cdot \\arccsc(x)$.",
179
+ "response": "$\\frac{ d y }{d x}$= $-\\frac{x}{|x|\\cdot\\sqrt{x^2-1}}+\\arccsc(x)$",
180
+ "system": ""
181
+ },
182
+ {
183
+ "prompt": "Compute the partial derivatives of the implicit function $z(x,y)$, given by the equation $-x-6 \\cdot y+z=3 \\cdot \\cos(-x-6 \\cdot y+z)$.\n\nSubmit as your final answer:\n\na. $\\frac{\\partial z}{\\partial x}$;\n\nb. $\\frac{\\partial z}{\\partial y}$.",
184
+ "response": "This is the final answer to the problem: \na. $1$;\n\nb. $6$.",
185
+ "system": ""
186
+ },
187
+ {
188
+ "prompt": "Solve the integral: $\\int{\\frac{ \\sqrt{x+10}+3 }{ (x+10)^2-\\sqrt{x+10} } d x}$.",
189
+ "response": "$\\int{\\frac{ \\sqrt{x+10}+3 }{ (x+10)^2-\\sqrt{x+10} } d x}$ =$C+\\frac{8}{3}\\cdot\\ln\\left(\\left|\\sqrt{x+10}-1\\right|\\right)-\\frac{4}{3}\\cdot\\ln\\left(11+\\sqrt{x+10}+x\\right)-\\frac{4}{\\sqrt{3}}\\cdot\\arctan\\left(\\frac{1}{\\sqrt{3}}\\cdot\\left(1+2\\cdot\\sqrt{x+10}\\right)\\right)$",
190
+ "system": ""
191
+ },
192
+ {
193
+ "prompt": "Solve the integral: $\\int{\\frac{ \\sqrt{x+10}+3 }{ (x+10)^2-\\sqrt{x+10} } d x}$.",
194
+ "response": "$\\int{\\frac{ \\sqrt{x+10}+3 }{ (x+10)^2-\\sqrt{x+10} } d x}$ =$C+\\frac{8}{3}\\cdot\\ln\\left(\\left|\\sqrt{x+10}-1\\right|\\right)-\\frac{4}{3}\\cdot\\ln\\left(11+\\sqrt{x+10}+x\\right)-\\frac{4}{\\sqrt{3}}\\cdot\\arctan\\left(\\frac{1}{\\sqrt{3}}\\cdot\\left(1+2\\cdot\\sqrt{x+10}\\right)\\right)$",
195
+ "system": ""
196
+ },
197
+ {
198
+ "prompt": "An alternating current for outlets in a home has voltage given by the function $V(t)=150 \\cdot \\cos(368 \\cdot t)$, where $V$ is the voltage in volts at time $t$ in seconds.\n\n1. Find the period of the function.\n2. Determine the number of periods that occur when $1$ sec. has passed.",
199
+ "response": "This is the final answer to the problem: \n\n1. the period of the function: $\\frac{\\pi}{184}$\n2. the number of periods: $58.56901906$",
200
+ "system": ""
201
+ },
202
+ {
203
+ "prompt": "Compute $\\sqrt[4]{90}$ with accuracy $0.0001$.",
204
+ "response": "This is the final answer to the problem: $3.0801$",
205
+ "system": ""
206
+ },
207
+ {
208
+ "prompt": "Solve the integral: $\\int{\\tan(x)^4 d x}$.",
209
+ "response": "$\\int{\\tan(x)^4 d x}$ = $C + x + 1/3 (sec^2(x) - 4) tan(x)$",
210
+ "system": ""
211
+ },
212
+ {
213
+ "prompt": "Solve the integral: $\\int{\\tan(x)^4 d x}$.",
214
+ "response": "$\\int{\\tan(x)^4 d x}$ = $C + x + 1/3 (sec^2(x) - 4) tan(x)$",
215
+ "system": ""
216
+ },
217
+ {
218
+ "prompt": "Solve $\\cos(2 \\cdot t)-5 \\cdot \\sin(t)-3=0$.",
219
+ "response": "This is the final answer to the problem: $t=(-1)^{n+1}\\cdot\\frac{\\pi}{6}+n\\cdot\\pi$",
220
+ "system": ""
221
+ },
222
+ {
223
+ "prompt": "Solve the following equations: 1. $-10 c=-80$\n2. $n-(-6)=12$\n3. $-82+x=-20$\n4. $-\\frac{ r }{ 2 }=5$\n5. $r-3.4=7.1$\n6. $\\frac{ g }{ 2.5 }=1.8$\n7. $4.8 m=43.2$\n8. $\\frac{ 3 }{ 4 } t=\\frac{ 9 }{ 20 }$\n9. $3\\frac{2}{3}+m=5\\frac{1}{6}$",
224
+ "response": "The solutions to the given equations are: \n1. $c=8$\n2. $n=6$\n3. $x=62$\n4. $r=-10$\n5. $r=10.5$\n6. $g=\\frac{ 9 }{ 2 }$\n7. $m=9$\n8. $t=\\frac{3}{5}$\n9. $m=\\frac{3}{2}$",
225
+ "system": ""
226
+ },
227
+ {
228
+ "prompt": "The region bounded by the arc of the curve $y=\\sqrt{2} \\cdot \\sin(2 \\cdot x)$, $0 \\le x \\le \\frac{ \\pi }{ 2 }$, is revolved around the X-axis. Compute the surface area of this solid of revolution.",
229
+ "response": "Surface Area: $\\frac{\\pi}{4}\\cdot\\left(12\\cdot\\sqrt{2}+\\ln\\left(17+12\\cdot\\sqrt{2}\\right)\\right)$",
230
+ "system": ""
231
+ },
232
+ {
233
+ "prompt": "Find $y'$ and $y''$ for $x^2+6 \\cdot x \\cdot y-2 \\cdot y^2=3$.",
234
+ "response": "$y'$= $\\frac{x+3\\cdot y}{2\\cdot y-3\\cdot x}$; \n\n$y''$= $\\frac{11\\cdot\\left(x^2+6\\cdot x\\cdot y-2\\cdot y^2\\right)}{(3\\cdot x-2\\cdot y)^3}$.",
235
+ "system": ""
236
+ },
237
+ {
238
+ "prompt": "Find $y'$ and $y''$ for $x^2+6 \\cdot x \\cdot y-2 \\cdot y^2=3$.",
239
+ "response": "$y'$= $\\frac{x+3\\cdot y}{2\\cdot y-3\\cdot x}$; \n\n$y''$= $\\frac{11\\cdot\\left(x^2+6\\cdot x\\cdot y-2\\cdot y^2\\right)}{(3\\cdot x-2\\cdot y)^3}$.",
240
+ "system": ""
241
+ },
242
+ {
243
+ "prompt": "Find the derivative of the function $y=\\frac{ 3 \\cdot \\csc(x)-4 \\cdot \\sin(x) }{ 8 \\cdot \\left(\\cos(x)\\right)^5 }-\\frac{ 76 }{ 5 } \\cdot \\cot(3 \\cdot x)$.",
244
+ "response": "$y'$=$\\frac{228}{5\\cdot\\left(\\sin(3\\cdot x)\\right)^2}+\\frac{16\\cdot\\left(\\cos(x)\\right)^6-5\\cdot\\left(\\cos(x)\\right)^4-3\\cdot\\left(\\cos(x)\\right)^6\\cdot\\left(\\csc(x)\\right)^2}{8\\cdot\\left(\\cos(x)\\right)^{10}}$",
245
+ "system": ""
246
+ },
247
+ {
248
+ "prompt": "Solve $\\sin(x)+7 \\cdot \\cos(x)+7=0$.",
249
+ "response": "This is the final answer to the problem: $x=2\\cdot\\pi\\cdot k-2\\cdot\\arctan(7) \\lor x=\\pi+2\\cdot\\pi\\cdot k$",
250
+ "system": ""
251
+ },
252
+ {
253
+ "prompt": "Find the gradient: $f(x,y)=\\frac{ \\sqrt{x}+y^2 }{ x \\cdot y }$.",
254
+ "response": "$\\nabla f(x,y)$ =$\\left\\langle\\frac{1}{2\\cdot x\\cdot y\\cdot\\sqrt{x}}-\\frac{\\sqrt{x}+y^2}{y\\cdot x^2},\\frac{2}{x}-\\frac{\\sqrt{x}+y^2}{x\\cdot y^2}\\right\\rangle$",
255
+ "system": ""
256
+ },
257
+ {
258
+ "prompt": "Find the Fourier series of the periodic function $f(x)=\\frac{ x^2 }{ 2 }$ in the interval $-2 \\cdot \\pi \\le x<2 \\cdot \\pi$ if $f(x)=f(x+4 \\cdot \\pi)$.",
259
+ "response": "The Fourier series is: $\\frac{2\\cdot\\pi^2}{3}+\\sum_{n=1}^\\infty\\left(\\frac{8\\cdot(-1)^n}{n^2}\\cdot\\cos\\left(\\frac{n\\cdot x}{2}\\right)\\right)$",
260
+ "system": ""
261
+ },
262
+ {
263
+ "prompt": "Compute the integral: $\\int{\\frac{ 1 }{ 2 \\cdot \\sin\\left(\\frac{ x }{ 2 }\\right)^6 } d x}$.",
264
+ "response": "$\\int{\\frac{ 1 }{ 2 \\cdot \\sin\\left(\\frac{ x }{ 2 }\\right)^6 } d x}$ =$C-\\frac{1}{5}\\cdot\\left(\\cot\\left(\\frac{x}{2}\\right)\\right)^5-\\frac{2}{3}\\cdot\\left(\\cot\\left(\\frac{x}{2}\\right)\\right)^3-\\cot\\left(\\frac{x}{2}\\right)$",
265
+ "system": ""
266
+ },
267
+ {
268
+ "prompt": "|$n$ \n\n$\\ln(n)$ \n\n| |\n| 1 | 0.00 |\n| 2 | 0.69 |\n| 3 | 1.10 |\n| 4 | 1.39 |\n| 5 | 1.61 |\n| 6 | 1.79 |\n| 7 | 1.95 |\n| 8 | 2.08 |\n| 9 | 2.20 |\n| 10 | 2.30 |\n\n \n \nUsing the table above, estimate the logarithm.1. $\\ln(16)$\n2. $\\ln\\left(3^4\\right)$\n3. $\\ln(2.5)$\n4. $\\ln\\left(\\sqrt{630}\\right)$\n5. $\\ln(0.4)$",
269
+ "response": "1. $\\ln(16)$≈$2.78$\n2. $\\ln\\left(3^4\\right)$≈$4.4$\n3. $\\ln(2.5)$≈$0.92$\n4. $\\ln\\left(\\sqrt{630}\\right)$≈$3.2228$\n5. $\\ln(0.4)$≈$-0.92$",
270
+ "system": ""
271
+ },
272
+ {
273
+ "prompt": "Solve the following problems by integration of the geometric series:\n\n$\\sum_{n=0}^\\infty\\left(x^n\\right)=\\frac{ 1 }{ 1-x }$, $|x|<1$\n\n1. $\\sum_{n=0}^\\infty\\left(\\frac{ 1 }{ (n+1) \\cdot 2^{n+1} }\\right)$\n2. $\\sum_{n=2}^\\infty\\left(\\frac{ 1 }{ n \\cdot 5^{n+1} }\\right)$\n3. $\\sum_{n=1}^\\infty\\left(\\frac{ 1 }{ n \\cdot 6^{n+3} }\\right)$\n4. $\\sum_{n=0}^\\infty\\left(\\frac{ 1 }{ (n+1) \\cdot (n+2) \\cdot 4^{n+2} }\\right)$\n5. $\\sum_{n=3}^\\infty\\left(\\frac{ 1 }{ n \\cdot (n+1) \\cdot 4^{n+3} }\\right)$",
274
+ "response": "1. $\\ln(2)$\n2. $\\frac{1}{5}\\cdot\\ln\\left(\\frac{5}{4}\\right)-\\frac{1}{25}$\n3. $\\frac{1}{216}\\cdot\\ln\\left(\\frac{6}{5}\\right)$\n4. $\\frac{ 3 }{ 4 } \\cdot \\ln\\left(\\frac{ 3 }{ 4 }\\right)+\\frac{ 1 }{ 4 }$\n5. $\\frac{3}{64}\\cdot\\ln\\left(\\frac{3}{4}\\right)+\\frac{83}{6144}$",
275
+ "system": ""
276
+ },
277
+ {
278
+ "prompt": "Find the Taylor series for $f(x)=\\frac{ x }{ (2+x)^3 }$, centered at $x=-1$. Write out the sum of the first four non-zero terms, followed by dots.",
279
+ "response": "This is the final answer to the problem: $x\\cdot\\left(1-3\\cdot(x+1)+6\\cdot(x+1)^2-10\\cdot(x+1)^3+\\cdots\\right)$=\r\n= $-1 + 4 (x + 1) - 9 (x + 1)^2 + 16 (x + 1)^3$\r\n",
280
+ "system": ""
281
+ },
282
+ {
283
+ "prompt": "Write the Taylor series for the function $f(x)=x \\cdot \\cos(2 \\cdot x)$ at the point $x=\\frac{ \\pi }{ 2 }$ up to the third term (zero or non-zero).",
284
+ "response": "This is the final answer to the problem: $-\\frac{\\pi}{2}-\\left(x-\\frac{\\pi}{2}\\right)+\\pi\\cdot\\left(x-\\frac{\\pi}{2}\\right)^2$",
285
+ "system": ""
286
+ },
287
+ {
288
+ "prompt": "Compute $\\int_{0}^{\\frac{ 1 }{ 3 }}{e^{-\\frac{ x^2 }{ 3 }} d x}$ with accuracy $0.00001$.",
289
+ "response": "This is the final answer to the problem: $0.32926$",
290
+ "system": ""
291
+ },
292
+ {
293
+ "prompt": "Find the moment of inertia of an isosceles triangle $I_{x}$ relative to its hypotenuse, if at each of its points the surface density is proportional to its distance to the hypotenuse.",
294
+ "response": "$I_{x}$ = $\\frac{k}{10}\\cdot a^5$",
295
+ "system": ""
296
+ },
297
+ {
298
+ "prompt": "Make full curve sketching of $y=\\ln\\left(\\left|\\frac{ 3 \\cdot x-2 }{ 3 \\cdot x+2 }\\right|\\right)$. Submit as your final answer:\n\n1. The domain (in interval notation)\n2. Vertical asymptotes\n3. Horizontal asymptotes\n4. Slant asymptotes\n5. Intervals where the function is increasing\n6. Intervals where the function is decreasing\n7. Intervals where the function is concave up\n8. Intervals where the function is concave down\n9. Points of inflection",
299
+ "response": "This is the final answer to the problem:\n\n1. The domain (in interval notation) $\\left(-\\infty,-\\frac{2}{3}\\right)\\cup\\left(-\\frac{2}{3},\\frac{2}{3}\\right)\\cup\\left(\\frac{2}{3},\\infty\\right)$\n2. Vertical asymptotes $x=\\frac{2}{3}, x=-\\frac{2}{3}$\n3. Horizontal asymptotes $y=0$\n4. Slant asymptotes None\n5. Intervals where the function is increasing $\\left(\\frac{2}{3},\\infty\\right), \\left(-\\infty,-\\frac{2}{3}\\right)$\n6. Intervals where the function is decreasing $\\left(-\\frac{2}{3},\\frac{2}{3}\\right)$\n7. Intervals where the function is concave up $\\left(-\\frac{2}{3},0\\right), \\left(-\\infty,-\\frac{2}{3}\\right)$\n8. Intervals where the function is concave down $\\left(0,\\frac{2}{3}\\right) \\cup \\left(\\frac{2}{3}, \\infty\\right)$\n9. Points of inflection $P(0,0)$",
300
+ "system": ""
301
+ },
302
+ {
303
+ "prompt": "The force of gravity $\\vec{F}$ acting on an object is given by $\\vec{F}=m \\cdot \\vec{g}$, where $m$ is the mass of the object (expressed in kilograms) and $\\vec{g}$ is acceleration resulting from gravity, with $\\left\\lVert\\vec{g}\\right\\rVert=9.8$ N/kg. A $2$-kg disco ball hangs by a chain from the ceiling of a room. \n\n1. Find the force of gravity $\\vec{F}$ acting on a disco ball and find its magnitude.\n2. Find the force of tension $\\vec{T}$ in the chain and its magnitude.\n\nExpress answers using standard unit vectors.",
304
+ "response": "1. $\\vec{F}$=$-19.6\\cdot\\vec{k}$; $\\left\\lVert\\vec{F}\\right\\rVert$=$19.6$\n2. $\\vec{T}$=$19.6\\cdot\\vec{k}$; $\\left\\lVert\\vec{T}\\right\\rVert$=$19.6$",
305
+ "system": ""
306
+ },
307
+ {
308
+ "prompt": "Find the derivative of the function $y=\\frac{ 2 \\cdot \\csc(x)-7 \\cdot \\sin(x) }{ 4 \\cdot \\left(\\cos(x)\\right)^5 }-\\frac{ 3 }{ 5 } \\cdot \\cot(2 \\cdot x)$.",
309
+ "response": "$y'$=$\\frac{6}{5\\cdot\\left(\\sin(2\\cdot x)\\right)^2}+\\frac{28\\cdot\\left(\\cos(x)\\right)^6-25\\cdot\\left(\\cos(x)\\right)^4-2\\cdot\\left(\\cos(x)\\right)^6\\cdot\\left(\\csc(x)\\right)^2}{4\\cdot\\left(\\cos(x)\\right)^{10}}$",
310
+ "system": ""
311
+ },
312
+ {
313
+ "prompt": "Find the derivative of the function $y=\\arcsin\\left(\\frac{ 2 \\cdot x }{ 1+x^2 }\\right)$.",
314
+ "response": "$y'$=$\\frac{2\\cdot\\left(1-x^2\\right)}{\\left|1-x^2\\right|\\cdot\\left(1+x^2\\right)}$",
315
+ "system": ""
316
+ },
317
+ {
318
+ "prompt": "Find the sum of the $\\sum_{n=0}^\\infty\\left(\\frac{ (-1)^n }{ (2 \\cdot n+1)! }\\right)$ with estimate error $0.01$.",
319
+ "response": "This is the final answer to the problem: $\\frac{101}{120}$",
320
+ "system": ""
321
+ },
322
+ {
323
+ "prompt": "For the function $f(x)=x^{11}-6 \\cdot x^{10}$, determine:\n\n1. Intervals where:\n1. $f$ is increasing\n2. $f$ is decreasing\n3. $f$ is concave up\n4. $f$ is concave down\n\n3. find:\n1. local minima\n2. local maxima\n3. the inflection points of $f$",
324
+ "response": "This is the final answer to the problem:1. Intervals where:\n1. $f$ is increasing: $\\left(\\frac{60}{11},\\infty\\right), (-\\infty,0)$\n2. $f$ is decreasing: $\\left(0,\\frac{60}{11}\\right)$\n3. $f$ is concave up: $\\left(\\frac{54}{11},\\infty\\right)$\n4. $f$ is concave down: $\\left(0,\\frac{54}{11}\\right), (-\\infty,0)$\n\n3. find:\n1. local minima: $\\frac{60}{11}$\n2. local maxima: $0$\n3. the inflection points of $f$: $P\\left(\\frac{54}{11},-\\frac{2529990231179046912}{285311670611}\\right)$",
325
+ "system": ""
326
+ },
327
+ {
328
+ "prompt": "Consider points $P$$P(3,7,-2)$ and $Q$$P(1,1,-3)$. Determine the angle between vectors $\\vec{OP}$ and $\\vec{OQ}$. Express the answer in radians rounded to two decimal places.",
329
+ "response": "$\\theta$ =$0.91$",
330
+ "system": ""
331
+ },
332
+ {
333
+ "prompt": "Consider points $P$$P(3,7,-2)$ and $Q$$P(1,1,-3)$. Determine the angle between vectors $\\vec{OP}$ and $\\vec{OQ}$. Express the answer in radians rounded to two decimal places.",
334
+ "response": "$\\theta$ =$0.91$",
335
+ "system": ""
336
+ },
337
+ {
338
+ "prompt": "Find the Fourier expansion of this function: $f(x)=x^2$ at $(-\\pi,\\pi)$.",
339
+ "response": "The Fourier series is: $f(x)=\\frac{\\pi^2}{3}+4\\cdot\\sum_{n=1}^\\infty\\left(\\frac{(-1)^{n}\\cdot\\cos(n\\cdot x)}{n^2}\\right)$",
340
+ "system": ""
341
+ },
342
+ {
343
+ "prompt": "Find the Fourier expansion of this function: $f(x)=x^2$ at $(-\\pi,\\pi)$.",
344
+ "response": "The Fourier series is: $f(x)=\\frac{\\pi^2}{3}+4\\cdot\\sum_{n=1}^\\infty\\left(\\frac{(-1)^{n}\\cdot\\cos(n\\cdot x)}{n^2}\\right)$",
345
+ "system": ""
346
+ },
347
+ {
348
+ "prompt": "Write the Taylor series for the function $f(x)=x \\cdot \\sin(2 \\cdot x)$ at the point $x=\\pi$ up to the third term (zero or non-zero).",
349
+ "response": "This is the final answer to the problem: $0+2\\cdot\\pi\\cdot(x-\\pi)+\\frac{4}{2}\\cdot(x-\\pi)^2$",
350
+ "system": ""
351
+ },
352
+ {
353
+ "prompt": "Write the Taylor series for the function $f(x)=x \\cdot \\sin(2 \\cdot x)$ at the point $x=\\pi$ up to the third term (zero or non-zero).",
354
+ "response": "This is the final answer to the problem: $0+2\\cdot\\pi\\cdot(x-\\pi)+\\frac{4}{2}\\cdot(x-\\pi)^2$",
355
+ "system": ""
356
+ },
357
+ {
358
+ "prompt": "Compute the second derivative $\\frac{d ^2y}{ d x^2}$ for the parametrically defined function $x=2 \\cdot \\cos(3 \\cdot t)$, $y=\\sin(2 \\cdot t)$.",
359
+ "response": "$\\frac{d ^2y}{ d x^2}$=$-\\frac{24\\cdot\\sin(2\\cdot t)\\cdot\\sin(3\\cdot t)+36\\cdot\\cos(2\\cdot t)\\cdot\\cos(3\\cdot t)}{216\\cdot\\left(\\sin(3\\cdot t)\\right)^3}$",
360
+ "system": ""
361
+ },
362
+ {
363
+ "prompt": "Compute the second derivative $\\frac{d ^2y}{ d x^2}$ for the parametrically defined function $x=2 \\cdot \\cos(3 \\cdot t)$, $y=\\sin(2 \\cdot t)$.",
364
+ "response": "$\\frac{d ^2y}{ d x^2}$=$-\\frac{24\\cdot\\sin(2\\cdot t)\\cdot\\sin(3\\cdot t)+36\\cdot\\cos(2\\cdot t)\\cdot\\cos(3\\cdot t)}{216\\cdot\\left(\\sin(3\\cdot t)\\right)^3}$",
365
+ "system": ""
366
+ },
367
+ {
368
+ "prompt": "Compute the integral $\\int{\\frac{ 6 \\cdot x^3-7 \\cdot x^2+3 \\cdot x-1 }{ 2 \\cdot x-3 \\cdot x^2 } d x}$.",
369
+ "response": "Answer is:$-x^2+x-\\frac{1}{3}\\cdot\\ln\\left(\\left|x-\\frac{2}{3}\\right|\\right)+\\frac{1}{2}\\cdot\\ln\\left(\\left|1-\\frac{2}{3\\cdot x}\\right|\\right)+C$",
370
+ "system": ""
371
+ },
372
+ {
373
+ "prompt": "Solve the following equations: 1. $-t+(5 t-7)=-5$\n2. $21-3 (2-w)=-12$\n3. $9=8 b-(2 b-3)$\n4. $4.5 r-2 r+3 (r-1)=10.75$\n5. $1.2 (x-8)+2.4 (x+1)=7.2$\n6. $4.9 m+(-3.2 m)-13=-2.63$\n7. $4 (2.25 w+3.1)-2.75 w=44.9$",
374
+ "response": "The solutions to the given equations are: 1. $t=\\frac{ 1 }{ 2 }$\n2. $w=-9$\n3. $b=1$\n4. $r=\\frac{ 5 }{ 2 }$\n5. $x=4$\n6. $m=\\frac{ 61 }{ 10 }$\n7. $w=\\frac{ 26 }{ 5 }$",
375
+ "system": ""
376
+ },
377
+ {
378
+ "prompt": "Compute the integral $\\int{\\frac{ \\sqrt{1+x^2} }{ x } d x}$.",
379
+ "response": "$\\int{\\frac{ \\sqrt{1+x^2} }{ x } d x}$ =$\\sqrt{x^2+1}+\\frac{1}{2}\\cdot\\ln\\left(\\left|\\frac{\\sqrt{x^2+1}-1}{\\sqrt{x^2+1}+1}\\right|\\right)+C$",
380
+ "system": ""
381
+ },
382
+ {
383
+ "prompt": "Use the substitution $(b+x)^r=(b+a)^r \\cdot \\left(1+\\frac{ x-a }{ b+a }\\right)^r$ in the binomial expansion to find the Taylor series of function $\\sqrt{x+2}$ with the center $a=1$.",
384
+ "response": "$\\sqrt{x+2}$ =$\\sum_{n=0}^\\infty\\left(3^{\\frac{1}{2}-n}\\cdot C_{\\frac{1}{2}}^n\\cdot(x-1)^n\\right)$",
385
+ "system": ""
386
+ },
387
+ {
388
+ "prompt": "Given that $\\frac{ 1 }{ 1-x }=\\sum_{n=0}^\\infty x^n$ , use term-by-term differentiation or integration to find power series for function $f(x)=\\ln(x)$ centered at $x=1$ .",
389
+ "response": "$\\ln(x)$ =$\\sum_{n=0}^\\infty\\left((-1)^n\\cdot\\frac{(x-1)^{n+1}}{n+1}\\right)$",
390
+ "system": ""
391
+ },
392
+ {
393
+ "prompt": "Compute the integral $\\int{\\frac{ \\tan(x) }{ \\sqrt{\\sin(x)^4+\\cos(x)^4} } d x}$.",
394
+ "response": "$\\int{\\frac{ \\tan(x) }{ \\sqrt{\\sin(x)^4+\\cos(x)^4} } d x}$ =$\\frac{1}{2}\\cdot\\ln\\left(\\tan(x)^2+\\sqrt{\\tan(x)^4+1}\\right)+C$",
395
+ "system": ""
396
+ },
397
+ {
398
+ "prompt": "Find the first derivative of the function: $y=\\left(3 \\cdot a^2-2 \\cdot a \\cdot b \\cdot x+\\frac{ 5 }{ 3 } \\cdot b^2 \\cdot x^2\\right) \\cdot \\sqrt[3]{\\left(\\frac{ a }{ 3 }+\\frac{ b }{ 3 } \\cdot x\\right)^2}$.",
399
+ "response": "The first derivative is:$\\frac{40\\cdot b^3\\cdot x^2}{9\\cdot3^{\\frac{2}{3}}\\cdot\\sqrt[3]{a+b\\cdot x}}$",
400
+ "system": ""
401
+ },
402
+ {
403
+ "prompt": "Find a “reasonable” upper-bound on the error in approximating $f(x)=x \\cdot \\ln(x)$ by its 3rd order Taylor polynomial $P_{3}(x)$ at $a=1$ valid for all values of $x$ such that $|x-1| \\le 0.7$.",
404
+ "response": "This is the final answer to the problem: $\\frac{2}{(0.3)^3}\\cdot\\frac{(0.7)^4}{4!}$",
405
+ "system": ""
406
+ },
407
+ {
408
+ "prompt": "A projectile is shot in the air from ground level with an initial velocity of $500$ m/sec at an angle of $60^\\circ$ with the horizontal. What is the maximum range? Round your answer to one decimal digit.",
409
+ "response": "Answer: $22092.5$ m",
410
+ "system": ""
411
+ },
412
+ {
413
+ "prompt": "Sketch the curve: \n\n$y=2 \\cdot x \\cdot \\sqrt{3-x^2}$. \n\nSubmit as your final answer:\n\n1. The domain (in interval notation)\n2. Vertical asymptotes\n3. Horizontal asymptotes\n4. Slant asymptotes\n5. Intervals where the function is increasing\n6. Intervals where the function is decreasing\n7. Intervals where the function is concave up\n8. Intervals where the function is concave down\n9. Points of inflection",
414
+ "response": "This is the final answer to the problem: \n1. The domain (in interval notation): $\\left[-1\\cdot3^{2^{-1}},3^{2^{-1}}\\right]$\n2. Vertical asymptotes: None\n3. Horizontal asymptotes: None\n4. Slant asymptotes: None\n5. Intervals where the function is increasing: $\\left(-\\sqrt{\\frac{3}{2}},\\sqrt{\\frac{3}{2}}\\right)$\n6. Intervals where the function is decreasing: $\\left(\\sqrt{\\frac{3}{2}},3^{2^{-1}}\\right), \\left(-3^{2^{-1}},-\\sqrt{\\frac{3}{2}}\\right)$\n7. Intervals where the function is concave up: $\\left(-3^{2^{-1}},0\\right)$\n8. Intervals where the function is concave down: $\\left(0,3^{2^{-1}}\\right)$\n9. Points of inflection: $P(0,0)$",
415
+ "system": ""
416
+ },
417
+ {
418
+ "prompt": "For the curve $x=a\\left(t-\\sin(t)\\right)$, $y=a\\left(1-\\cos(t)\\right)$ determine the curvature. Use $a=10$.",
419
+ "response": "The curvature is:$\\frac{1}{40\\cdot\\left|\\sin\\left(\\frac{t}{2}\\right)\\right|}$",
420
+ "system": ""
421
+ },
422
+ {
423
+ "prompt": "Compute $\\sqrt[3]{130}$ with accuracy $0.0001$.",
424
+ "response": "This is the final answer to the problem: $5.0658$",
425
+ "system": ""
426
+ },
427
+ {
428
+ "prompt": "Compute the integral: $\\int{\\frac{ x }{ \\left(x^2-4 \\cdot x+8\\right)^2 } d x}$.",
429
+ "response": "$\\int{\\frac{ x }{ \\left(x^2-4 \\cdot x+8\\right)^2 } d x}$ =$C+\\frac{x-2}{2\\cdot\\left(8+2\\cdot(x-2)^2\\right)}+\\frac{1}{8}\\cdot\\arctan\\left(\\frac{1}{2}\\cdot(x-2)\\right)-\\frac{1}{2\\cdot\\left(x^2-4\\cdot x+8\\right)}$",
430
+ "system": ""
431
+ },
432
+ {
433
+ "prompt": "Determine the Taylor series for $y=\\left(\\sin(x)\\right)^2$, centered at $x_{0}=\\frac{ \\pi }{ 2 }$. Write out the sum of the first three non-zero terms, followed by dots.",
434
+ "response": "This is the final answer to the problem: $1-\\frac{2}{2!}\\cdot\\left(x-\\frac{\\pi}{2}\\right)^2+\\frac{2^3}{4!}\\cdot\\left(x-\\frac{\\pi}{2}\\right)^4+\\cdots$",
435
+ "system": ""
436
+ },
437
+ {
438
+ "prompt": "Compute the integral: $\\int{\\frac{ \\sin(x)^4 }{ \\cos(x) } d x}$.",
439
+ "response": "$\\int{\\frac{ \\sin(x)^4 }{ \\cos(x) } d x}$ =$C-\\frac{1}{2}\\cdot\\ln\\left(\\left|\\frac{1-\\sin(x)}{1+\\sin(x)}\\right|\\right)-\\frac{1}{3}\\cdot\\left(\\sin(x)\\right)^3-\\sin(x)$",
440
+ "system": ""
441
+ },
442
+ {
443
+ "prompt": "Find the Fourier series of the function $f(x)=\\frac{ -1 }{ 2 } \\cdot x$ in the interval $[-2,2]$.",
444
+ "response": "The Fourier series is: $\\sum_{n=1}^\\infty\\left(\\frac{2\\cdot(-1)^n}{\\pi\\cdot n}\\cdot\\sin\\left(\\frac{\\pi\\cdot n\\cdot x}{2}\\right)\\right)$",
445
+ "system": ""
446
+ },
447
+ {
448
+ "prompt": "Find the moment of inertia of one arch of the cycloid $x=3 \\cdot a \\cdot \\left(\\frac{ t }{ 2 }-\\sin\\left(\\frac{ t }{ 2 }\\right)\\right)$, $y=3 \\cdot a \\cdot \\left(1-\\cos\\left(\\frac{ t }{ 2 }\\right)\\right)$ relative to the x-axis.",
449
+ "response": "Moment of Inertia: $\\frac{1152}{5}\\cdot a^3$",
450
+ "system": ""
451
+ },
452
+ {
453
+ "prompt": "Find the tangential and normal components of acceleration if $\\vec{r}(t)=\\left\\langle 6 \\cdot t,3 \\cdot t^2,2 \\cdot t^3 \\right\\rangle$",
454
+ "response": "$a_{T}$ =$\\frac{12\\cdot t^3+6\\cdot t}{\\sqrt{t^4+t^2+1}}$ ; $a_{N}$ = $\\frac{6\\cdot\\sqrt{t^4+4\\cdot t^2+1}}{\\sqrt{t^4+t^2+1}}$",
455
+ "system": ""
456
+ },
457
+ {
458
+ "prompt": "Find $\\frac{ d y }{d x}$, given $y=\\tan(2 \\cdot v)$ and $v=\\arctan(2 \\cdot x-1)$.",
459
+ "response": "This is the final answer to the problem: $\\frac{dy}{dx}=\\frac{2\\cdot x^2-2\\cdot x+1}{2\\cdot\\left(x-x^2\\right)^2}$",
460
+ "system": ""
461
+ },
462
+ {
463
+ "prompt": "Find $\\frac{ d y }{d x}$, given $y=\\tan(2 \\cdot v)$ and $v=\\arctan(2 \\cdot x-1)$.",
464
+ "response": "This is the final answer to the problem: $\\frac{dy}{dx}=\\frac{2\\cdot x^2-2\\cdot x+1}{2\\cdot\\left(x-x^2\\right)^2}$",
465
+ "system": ""
466
+ },
467
+ {
468
+ "prompt": "Let $z=e^{1-x \\cdot y}$, $x=t^{\\frac{ 1 }{ 3 }}$, $y=t^3$. Find $\\frac{ d z }{d t}$.",
469
+ "response": "$\\frac{ d z }{d t}$ =$\\frac{-(10\\cdot e)}{3}\\cdot e^{-t^3\\cdot\\sqrt[3]{t}}\\cdot t^2\\cdot\\sqrt[3]{t}$",
470
+ "system": ""
471
+ },
472
+ {
473
+ "prompt": "Find the derivative of the function: $y=-3 \\cdot x^{\\sqrt[3]{2 \\cdot x}}$.",
474
+ "response": "$\\frac{ d y }{d x}$ =$-\\left(\\frac{3\\cdot\\sqrt[3]{2}}{x^{\\frac{2}{3}}}+\\frac{\\sqrt[3]{2}\\cdot\\ln(x)}{x^{\\frac{2}{3}}}\\right)\\cdot x^{\\sqrt[3]{2}\\cdot\\sqrt[3]{x}}$",
475
+ "system": ""
476
+ },
477
+ {
478
+ "prompt": "Let $R$ be the region bounded by the graphs of $y=\\frac{ 1 }{ x+2 }$ and $y=-\\frac{ 1 }{ 2 } \\cdot x+3$.\n\nFind the volume of the solid generated when $R$ is rotated about the vertical line $x=-3$.",
479
+ "response": "The volume of the solid is $292.097$ units³.",
480
+ "system": ""
481
+ },
482
+ {
483
+ "prompt": "Find zeros of $f(x)=\\sin(x)+\\sin(2 \\cdot x)+2 \\cdot \\sin(x) \\cdot \\sin(2 \\cdot x)-2 \\cdot \\cos(x)-\\cos(2 \\cdot x)$.",
484
+ "response": "This is the final answer to the problem: $x_1=-\\frac{\\pi}{2}+2\\cdot\\pi\\cdot n, x_2=-\\frac{2\\cdot\\pi}{3}+2\\cdot\\pi\\cdot n, x_3=\\frac{2\\cdot\\pi}{3}+2\\cdot\\pi\\cdot n, x_4=(-1)^n\\cdot\\frac{\\pi}{6}+\\pi\\cdot n$",
485
+ "system": ""
486
+ },
487
+ {
488
+ "prompt": "Find the Fourier series of the function $\\psi(x)=e^{-x}$ in the interval $(-2 \\cdot \\pi,\\pi \\cdot 2)$.",
489
+ "response": "The Fourier series is: $e^{-x}=\\frac{\\left(e^{2\\cdot\\pi}-e^{-2\\cdot\\pi}\\right)}{\\pi}\\cdot\\left(\\frac{1}{4}+\\sum_{n=1}^\\infty\\left(\\frac{(-1)^n}{4+n^2}\\cdot\\left(2\\cdot\\cos\\left(\\frac{n}{2}\\cdot x\\right)+n\\cdot\\sin\\left(\\frac{n}{2}\\cdot x\\right)\\right)\\right)\\right)$",
490
+ "system": ""
491
+ },
492
+ {
493
+ "prompt": "Find the Fourier series of the function $\\psi(x)=e^{-x}$ in the interval $(-2 \\cdot \\pi,\\pi \\cdot 2)$.",
494
+ "response": "The Fourier series is: $e^{-x}=\\frac{\\left(e^{2\\cdot\\pi}-e^{-2\\cdot\\pi}\\right)}{\\pi}\\cdot\\left(\\frac{1}{4}+\\sum_{n=1}^\\infty\\left(\\frac{(-1)^n}{4+n^2}\\cdot\\left(2\\cdot\\cos\\left(\\frac{n}{2}\\cdot x\\right)+n\\cdot\\sin\\left(\\frac{n}{2}\\cdot x\\right)\\right)\\right)\\right)$",
495
+ "system": ""
496
+ },
497
+ {
498
+ "prompt": "Using the series expansion for the function $(1+x)^m$ calculate approximately $\\sqrt[3]{7}$ with accuracy 0.0001.",
499
+ "response": "This is the final answer to the problem: $1.9129$",
500
+ "system": ""
501
+ },
502
+ {
503
+ "prompt": "Using the series expansion for the function $(1+x)^m$ calculate approximately $\\sqrt[3]{7}$ with accuracy 0.0001.",
504
+ "response": "This is the final answer to the problem: $1.9129$",
505
+ "system": ""
506
+ },
507
+ {
508
+ "prompt": "Given $y=3 \\cdot x^5+20 \\cdot x^4+40 \\cdot x^3+100$ find where the function is concave up, down, and point(s) of inflection.",
509
+ "response": "Concave up: $(0,\\infty)$\nConcave down: $(-2,0), (-\\infty,-2)$\nPoint(s) of Inflection: $P(0,100)$",
510
+ "system": ""
511
+ },
512
+ {
513
+ "prompt": "Compute the integral: $\\int{\\frac{ -12 }{ \\sin(6 \\cdot x)^6 } d x}$.",
514
+ "response": "$\\int{\\frac{ -12 }{ \\sin(6 \\cdot x)^6 } d x}$ =$C+2\\cdot\\cot(6\\cdot x)+\\frac{2}{5}\\cdot\\left(\\cot(6\\cdot x)\\right)^5+\\frac{4}{3}\\cdot\\left(\\cot(6\\cdot x)\\right)^3$",
515
+ "system": ""
516
+ },
517
+ {
518
+ "prompt": "Use the substitution $(b+x)^r=(b+a)^r \\cdot \\left(1+\\frac{ x-a }{ b+a }\\right)^r$ in the binomial expansion to find the Taylor series of the function $x^{\\frac{ 1 }{ 3 }}$ with the center $a=27$.",
519
+ "response": "$x^{\\frac{ 1 }{ 3 }}$ =$\\sum_{n=0}^\\infty\\left(3^{1-3\\cdot n}\\cdot C_{\\frac{1}{3}}^n\\cdot(x-27)^n\\right)$",
520
+ "system": ""
521
+ },
522
+ {
523
+ "prompt": "Evaluate the integral: $I=\\int{3 \\cdot x \\cdot \\ln\\left(4+\\frac{ 1 }{ x }\\right) d x}$.",
524
+ "response": "This is the final answer to the problem: $\\left(\\frac{3}{2}\\cdot x^2\\cdot\\ln(4\\cdot x+1)-\\frac{3\\cdot x^2}{4}+\\frac{3\\cdot x}{8}-\\frac{3}{32}\\cdot\\ln\\left(x+\\frac{1}{4}\\right)\\right)-\\left(\\frac{3}{2}\\cdot x^2\\cdot\\ln(x)-\\left(C+\\frac{3}{4}\\cdot x^2\\right)\\right)$",
525
+ "system": ""
526
+ },
527
+ {
528
+ "prompt": "Find the Fourier series of the function $\\psi(x)=e^{-x}$ in the interval $(-\\pi,\\pi)$.",
529
+ "response": "The Fourier series is: $e^{-x}=\\frac{e^\\pi-e^{-\\pi}}{\\pi}\\cdot\\left(\\frac{1}{2}+\\sum_{n=1}^\\infty\\left(\\frac{(-1)^n}{1+n^2}\\cdot\\left(\\cos(n\\cdot x)+n\\cdot\\sin(n\\cdot x)\\right)\\right)\\right)$",
530
+ "system": ""
531
+ },
532
+ {
533
+ "prompt": "Sketch the curve: \n\n$y=3 \\cdot x \\cdot \\sqrt{2-x^2}$. \n\nSubmit as your final answer:\n\n1. The domain (in interval notation)\n2. Vertical asymptotes\n3. Horizontal asymptotes\n4. Slant asymptotes\n5. Intervals where the function is increasing\n6. Intervals where the function is decreasing\n7. Intervals where the function is concave up\n8. Intervals where the function is concave down\n9. Points of inflection",
534
+ "response": "This is the final answer to the problem: \n1. The domain (in interval notation): $\\left[-1\\cdot2^{2^{-1}},2^{2^{-1}}\\right]$\n2. Vertical asymptotes: None\n3. Horizontal asymptotes: None\n4. Slant asymptotes: None\n5. Intervals where the function is increasing: $(-1,1)$\n6. Intervals where the function is decreasing: $\\left(-2^{2^{-1}},-1\\right), \\left(1,2^{2^{-1}}\\right)$\n7. Intervals where the function is concave up: $\\left(-2^{2^{-1}},0\\right)$\n8. Intervals where the function is concave down: $\\left(0,2^{2^{-1}}\\right)$\n9. Points of inflection: $P(0,0)$",
535
+ "system": ""
536
+ },
537
+ {
538
+ "prompt": "Determine the Taylor series for $f(x)=\\frac{ 2 \\cdot x-1 }{ x^2-3 \\cdot x+2 }$, centered at $x_{0}=4$. Write out the sum of the first four non-zero terms, followed by dots.",
539
+ "response": "This is the final answer to the problem: $\\frac{7}{6}+\\left(\\frac{1}{3^2}-\\frac{3}{2^2}\\right)\\cdot(x-4)-\\left(\\frac{1}{3^3}-\\frac{3}{2^3}\\right)\\cdot(x-4)^2+\\left(\\frac{1}{3^4}-\\frac{3}{2^4}\\right)\\cdot(x-4)^3+\\cdots$",
540
+ "system": ""
541
+ },
542
+ {
543
+ "prompt": "Find the 3rd order Taylor polynomial $P_{3}(x)$ for the function $f(x)=\\arctan(x)$ in powers of $x-1$ and give the Lagrange form of the remainder.",
544
+ "response": "$P_{3}(x)$=$\\frac{ \\pi }{ 4 }+\\frac{ 1 }{ 2 } \\cdot (x-1)-\\frac{ 1 }{ 4 } \\cdot (x-1)^2+\\frac{ 1 }{ 12 } \\cdot (x-1)^3$ \n\n$R_{3}(x)$=$\\frac{ -\\frac{ 48 \\cdot c^3 }{ \\left(1+c^2\\right)^4 }+\\frac{ 24 \\cdot c }{ \\left(1+c^2\\right)^3 } }{ 4! } \\cdot (x-1)^4$",
545
+ "system": ""
546
+ },
547
+ {
548
+ "prompt": "Find the area of the surface formed by rotating the arc of the circle $x^2+y^2=1$ between the points $(1,0)$ and $(0,1)$ in the first quadrant, around the line $x+y=1$.",
549
+ "response": "This is the final answer to the problem: $\\frac{4\\cdot\\pi-\\pi^2}{\\sqrt{2}}$",
550
+ "system": ""
551
+ },
552
+ {
553
+ "prompt": "Expand the function: $y=\\ln\\left(x+\\sqrt{1+x^2}\\right)$ in a power series.",
554
+ "response": "This is the final answer to the problem: $x-\\frac{1}{2}\\cdot\\frac{x^3}{3}+\\frac{1\\cdot3}{2\\cdot4}\\cdot\\frac{x^5}{5}-\\frac{1\\cdot3\\cdot5}{2\\cdot4\\cdot6}\\cdot\\frac{x^7}{7}+\\cdots+(-1)^n\\cdot\\frac{(2\\cdot n-1)!!}{(2\\cdot n)!!}\\cdot\\frac{x^{2\\cdot n+1}}{2\\cdot n+1}+\\cdots$",
555
+ "system": ""
556
+ },
557
+ {
558
+ "prompt": "Apply the gradient descent algorithm to the function $g(x,y)=\\left(x^2-1\\right) \\cdot \\left(x^2-3 \\cdot x+1\\right)+y^2$ with step size $\\frac{ 1 }{ 5 }$ and initial guess $p_{0}$=$\\left\\langle 0,0 \\right\\rangle$ for three steps (so steps $p_{1}$, $p_{2}$, and $p_{3}$).",
559
+ "response": "| $i$ | $1$ | $2$ | $3$ |\n| --- | --- | --- | --- |\n| $p_{i}$ | $\\left\\langle-\\frac{3}{5},0\\right\\rangle$ | $\\left\\langle-\\frac{237}{625},0\\right\\rangle$ | $\\left\\langle-\\frac{826\\ 113\\ 663}{1\\ 220\\ 703\\ 125},0\\right\\rangle$ |\n| $g\\left(p_{i}\\right)$ | $-\\frac{1264}{625}$ | $-\\frac{99667587}{1220703125}$ |$-\\frac{2760602760604515522296126283436630289590864}{2220446049250313080847263336181640625}$ |",
560
+ "system": ""
561
+ },
562
+ {
563
+ "prompt": "Find the Fourier integral of the function $q(x)=\\begin{cases} 0, & x<0 \\\\ \\pi \\cdot x, & 0 \\le x \\le 2 \\\\ 0, & x>2 \\end{cases}$.",
564
+ "response": "$q(x)$ = $\\int_0^\\infty\\left(\\frac{\\left(2\\cdot\\alpha\\cdot\\sin\\left(2\\cdot\\alpha\\right)+\\cos\\left(2\\cdot\\alpha\\right)-1\\right)\\cdot\\cos\\left(\\alpha\\cdot x\\right)+\\left(\\sin\\left(2\\cdot\\alpha\\right)-2\\cdot\\alpha\\cdot\\cos\\left(2\\cdot\\alpha\\right)\\right)\\cdot\\sin\\left(\\alpha\\cdot x\\right)}{\\alpha^2}\\right)d\\alpha$",
565
+ "system": ""
566
+ },
567
+ {
568
+ "prompt": "Find the derivative of $y=\\sin(2 \\cdot x) \\cdot \\cos(3 \\cdot x)-\\frac{ \\ln(x-1) }{ \\ln(x+1) }+c$",
569
+ "response": "This is the final answer to the problem: $y'=2\\cdot\\cos(5\\cdot x)-\\sin(3\\cdot x)\\cdot\\sin(2\\cdot x)-\\frac{(x+1)\\cdot\\ln(x+1)-(x-1)\\cdot\\ln(x-1)}{(x-1)\\cdot(x+1)\\cdot\\left(\\ln(x+1)\\right)^2}$",
570
+ "system": ""
571
+ },
572
+ {
573
+ "prompt": "Find the derivative of $y=\\sin(2 \\cdot x) \\cdot \\cos(3 \\cdot x)-\\frac{ \\ln(x-1) }{ \\ln(x+1) }+c$",
574
+ "response": "This is the final answer to the problem: $y'=2\\cdot\\cos(5\\cdot x)-\\sin(3\\cdot x)\\cdot\\sin(2\\cdot x)-\\frac{(x+1)\\cdot\\ln(x+1)-(x-1)\\cdot\\ln(x-1)}{(x-1)\\cdot(x+1)\\cdot\\left(\\ln(x+1)\\right)^2}$",
575
+ "system": ""
576
+ },
577
+ {
578
+ "prompt": "Compute the integral: $\\int{\\frac{ 1 }{ (x+4) \\cdot \\sqrt{x^2+2 \\cdot x+5} } d x}$.",
579
+ "response": "$\\int{\\frac{ 1 }{ (x+4) \\cdot \\sqrt{x^2+2 \\cdot x+5} } d x}$ =$C+\\frac{1}{\\sqrt{13}}\\cdot\\ln\\left(\\sqrt{13}-4-x-\\sqrt{x^2+2\\cdot x+5}\\right)-\\frac{1}{\\sqrt{13}}\\cdot\\ln\\left(4+\\sqrt{13}+x+\\sqrt{x^2+2\\cdot x+5}\\right)$",
580
+ "system": ""
581
+ },
582
+ {
583
+ "prompt": "Consider points $A$$P(3,-1,2)$, $B$$P(2,1,5)$, and $C$$P(1,-2,-2)$.\n\n1. Find the area of parallelogram ABCD with adjacent sides $\\vec{AB}$ and $\\vec{AC}$.\n2. Find the area of triangle ABC.\n3. Find the distance from point $A$ to line BC.",
584
+ "response": "1. $A$=$5\\cdot\\sqrt{6}$\n2. $A$=$\\frac{5\\cdot\\sqrt{6}}{2}$\n3. $d$=$\\frac{5\\cdot\\sqrt{6}}{\\sqrt{59}}$",
585
+ "system": ""
586
+ },
587
+ {
588
+ "prompt": "Find the generalized center of mass between $y=b \\cdot \\sin(a \\cdot x)$, $x=0$, and $x=\\frac{ \\pi }{ a }$ . Then, use the Pappus theorem to find the volume of the solid generated when revolving around the $y$-axis.",
589
+ "response": "$(x,y)$ = $P\\left(\\frac{\\pi}{2\\cdot a},\\frac{\\pi\\cdot b}{8}\\right)$ \n\n$V$ = $\\frac{2\\cdot\\pi^2\\cdot b}{a^2}$",
590
+ "system": ""
591
+ },
592
+ {
593
+ "prompt": "Find the equation of the tangent line to the curve: $r=3+\\cos(2 \\cdot t)$, $t=\\frac{ 3 \\cdot \\pi }{ 4 }$.",
594
+ "response": "$y$ = $\\frac{1}{5}\\cdot\\left(x+\\frac{3}{\\sqrt{2}}\\right)+\\frac{3}{\\sqrt{2}}$",
595
+ "system": ""
596
+ },
597
+ {
598
+ "prompt": "Find and classify all critical points of the function $f(x,y)=x \\cdot y \\cdot (1-7 \\cdot x-9 \\cdot y)$.",
599
+ "response": "Points of local minima: None.\n\nPoints of local maxima: $P\\left(\\frac{1}{21},\\frac{1}{27}\\right)$.\n\nSaddle points: $P\\left(0,\\frac{1}{9}\\right), P\\left(\\frac{1}{7},0\\right), P(0,0)$.",
600
+ "system": ""
601
+ },
602
+ {
603
+ "prompt": "When hired at a new job selling electronics, you are given two pay options:\n\nOption A: Base salary of $20\\ 000$ USD a year with a commission of $12$ percent of your sales.\n\nOption B: Base salary of $26\\ 000$ USD a year with a commission of $3$ percent of your sales.\n\nHow much electronics would you need to sell for Option A to produce a larger income? Give your answer either exactly or rounded to two decimal places.",
604
+ "response": "This is the final answer to the problem: $66666.67$",
605
+ "system": ""
606
+ },
607
+ {
608
+ "prompt": "Find $\\frac{d ^3}{ d x^3}f(x)$, given $f(x)=\\ln\\left(\\frac{ x+7 }{ x-7 }\\right)$.",
609
+ "response": "This is the final answer to the problem: $\\frac{d^3}{dx^3}f(x)=-\\frac{84\\cdot x^2+1372}{\\left(x^2-49\\right)^3}$",
610
+ "system": ""
611
+ }
612
+ ]
613
+ }
Mu-Math/group_01/tokenizer/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }