JacobLinCool commited on
Commit
15506a2
·
verified ·
0 Parent(s):

Super-squash branch 'main' using huggingface_hub

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
last-checkpoint/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e96643407f870d165fe248087f58882b754a46c68ed8bbbb8296cfb49e4ba16f
3
+ size 21577620
last-checkpoint/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21173a6e85c0d01bd89f0254e9cf12c9f5e201ec065dbf6d1248341f50a19ff7
3
+ size 43185803
last-checkpoint/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07f59e599f3fcca86c21092f1cdf3a0d446538a97483247948f9a0dc00ffc94d
3
+ size 14645
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44f986f908978821791045c895960ea997610e2ba08e49fde2d7ec564e6ee286
3
+ size 1465
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,1786 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2469,
3
+ "best_metric": 0.7291858792304993,
4
+ "best_model_checkpoint": "./cefr_reg/checkpoint-2469",
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2469,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0,
14
+ "eval_acc_0.5": 0.0,
15
+ "eval_acc_1.0": 0.0,
16
+ "eval_loss": 4.52817964553833,
17
+ "eval_mae": 4.471426963806152,
18
+ "eval_pcc": -0.06410610675811768,
19
+ "eval_qwk": -0.0003021129118856436,
20
+ "eval_rmse": 4.532072067260742,
21
+ "eval_runtime": 28.8535,
22
+ "eval_samples_per_second": 111.217,
23
+ "eval_src": -0.05987504900250908,
24
+ "eval_steps_per_second": 6.966,
25
+ "step": 0
26
+ },
27
+ {
28
+ "epoch": 0.004050222762251924,
29
+ "grad_norm": 10.551199913024902,
30
+ "learning_rate": 9.987849331713245e-05,
31
+ "loss": 2.7505,
32
+ "step": 10
33
+ },
34
+ {
35
+ "epoch": 0.008100445524503848,
36
+ "grad_norm": 1.4525120258331299,
37
+ "learning_rate": 9.974348589172406e-05,
38
+ "loss": 0.9612,
39
+ "step": 20
40
+ },
41
+ {
42
+ "epoch": 0.012150668286755772,
43
+ "grad_norm": 5.914994716644287,
44
+ "learning_rate": 9.960847846631565e-05,
45
+ "loss": 0.7054,
46
+ "step": 30
47
+ },
48
+ {
49
+ "epoch": 0.016200891049007696,
50
+ "grad_norm": 7.5304484367370605,
51
+ "learning_rate": 9.947347104090725e-05,
52
+ "loss": 0.7235,
53
+ "step": 40
54
+ },
55
+ {
56
+ "epoch": 0.02025111381125962,
57
+ "grad_norm": 9.218586921691895,
58
+ "learning_rate": 9.933846361549886e-05,
59
+ "loss": 0.7934,
60
+ "step": 50
61
+ },
62
+ {
63
+ "epoch": 0.024301336573511544,
64
+ "grad_norm": 4.827125072479248,
65
+ "learning_rate": 9.920345619009046e-05,
66
+ "loss": 0.7254,
67
+ "step": 60
68
+ },
69
+ {
70
+ "epoch": 0.028351559335763468,
71
+ "grad_norm": 2.5279595851898193,
72
+ "learning_rate": 9.906844876468207e-05,
73
+ "loss": 0.7811,
74
+ "step": 70
75
+ },
76
+ {
77
+ "epoch": 0.03240178209801539,
78
+ "grad_norm": 1.1081438064575195,
79
+ "learning_rate": 9.893344133927366e-05,
80
+ "loss": 0.7307,
81
+ "step": 80
82
+ },
83
+ {
84
+ "epoch": 0.03645200486026731,
85
+ "grad_norm": 3.5692625045776367,
86
+ "learning_rate": 9.879843391386526e-05,
87
+ "loss": 0.7255,
88
+ "step": 90
89
+ },
90
+ {
91
+ "epoch": 0.04050222762251924,
92
+ "grad_norm": 5.297443389892578,
93
+ "learning_rate": 9.866342648845687e-05,
94
+ "loss": 0.7331,
95
+ "step": 100
96
+ },
97
+ {
98
+ "epoch": 0.04455245038477116,
99
+ "grad_norm": 2.653315305709839,
100
+ "learning_rate": 9.852841906304847e-05,
101
+ "loss": 0.6897,
102
+ "step": 110
103
+ },
104
+ {
105
+ "epoch": 0.04860267314702309,
106
+ "grad_norm": 1.8816819190979004,
107
+ "learning_rate": 9.839341163764008e-05,
108
+ "loss": 0.6987,
109
+ "step": 120
110
+ },
111
+ {
112
+ "epoch": 0.05265289590927501,
113
+ "grad_norm": 1.560968279838562,
114
+ "learning_rate": 9.825840421223169e-05,
115
+ "loss": 0.6981,
116
+ "step": 130
117
+ },
118
+ {
119
+ "epoch": 0.056703118671526935,
120
+ "grad_norm": 4.910257339477539,
121
+ "learning_rate": 9.812339678682328e-05,
122
+ "loss": 0.7113,
123
+ "step": 140
124
+ },
125
+ {
126
+ "epoch": 0.060753341433778855,
127
+ "grad_norm": 5.892327785491943,
128
+ "learning_rate": 9.798838936141488e-05,
129
+ "loss": 0.7456,
130
+ "step": 150
131
+ },
132
+ {
133
+ "epoch": 0.06480356419603078,
134
+ "grad_norm": 2.5749614238739014,
135
+ "learning_rate": 9.785338193600649e-05,
136
+ "loss": 0.7249,
137
+ "step": 160
138
+ },
139
+ {
140
+ "epoch": 0.0688537869582827,
141
+ "grad_norm": 2.3996565341949463,
142
+ "learning_rate": 9.771837451059809e-05,
143
+ "loss": 0.6995,
144
+ "step": 170
145
+ },
146
+ {
147
+ "epoch": 0.07290400972053462,
148
+ "grad_norm": 5.240713119506836,
149
+ "learning_rate": 9.75833670851897e-05,
150
+ "loss": 0.7327,
151
+ "step": 180
152
+ },
153
+ {
154
+ "epoch": 0.07695423248278656,
155
+ "grad_norm": 7.391425609588623,
156
+ "learning_rate": 9.744835965978129e-05,
157
+ "loss": 0.7053,
158
+ "step": 190
159
+ },
160
+ {
161
+ "epoch": 0.08100445524503848,
162
+ "grad_norm": 7.433688163757324,
163
+ "learning_rate": 9.731335223437289e-05,
164
+ "loss": 0.7026,
165
+ "step": 200
166
+ },
167
+ {
168
+ "epoch": 0.0850546780072904,
169
+ "grad_norm": 2.655003786087036,
170
+ "learning_rate": 9.71783448089645e-05,
171
+ "loss": 0.6712,
172
+ "step": 210
173
+ },
174
+ {
175
+ "epoch": 0.08910490076954232,
176
+ "grad_norm": 1.991586446762085,
177
+ "learning_rate": 9.70433373835561e-05,
178
+ "loss": 0.6711,
179
+ "step": 220
180
+ },
181
+ {
182
+ "epoch": 0.09315512353179425,
183
+ "grad_norm": 1.123203992843628,
184
+ "learning_rate": 9.69083299581477e-05,
185
+ "loss": 0.6624,
186
+ "step": 230
187
+ },
188
+ {
189
+ "epoch": 0.09720534629404617,
190
+ "grad_norm": 7.008766174316406,
191
+ "learning_rate": 9.67733225327393e-05,
192
+ "loss": 0.7115,
193
+ "step": 240
194
+ },
195
+ {
196
+ "epoch": 0.1012555690562981,
197
+ "grad_norm": 7.505290508270264,
198
+ "learning_rate": 9.663831510733091e-05,
199
+ "loss": 0.7537,
200
+ "step": 250
201
+ },
202
+ {
203
+ "epoch": 0.10530579181855002,
204
+ "grad_norm": 3.7045347690582275,
205
+ "learning_rate": 9.650330768192251e-05,
206
+ "loss": 0.6647,
207
+ "step": 260
208
+ },
209
+ {
210
+ "epoch": 0.10935601458080195,
211
+ "grad_norm": 3.018012523651123,
212
+ "learning_rate": 9.63683002565141e-05,
213
+ "loss": 0.7184,
214
+ "step": 270
215
+ },
216
+ {
217
+ "epoch": 0.11340623734305387,
218
+ "grad_norm": 9.340592384338379,
219
+ "learning_rate": 9.623329283110572e-05,
220
+ "loss": 0.6556,
221
+ "step": 280
222
+ },
223
+ {
224
+ "epoch": 0.11745646010530579,
225
+ "grad_norm": 6.4435014724731445,
226
+ "learning_rate": 9.609828540569731e-05,
227
+ "loss": 0.7042,
228
+ "step": 290
229
+ },
230
+ {
231
+ "epoch": 0.12150668286755771,
232
+ "grad_norm": 4.117043495178223,
233
+ "learning_rate": 9.596327798028892e-05,
234
+ "loss": 0.6879,
235
+ "step": 300
236
+ },
237
+ {
238
+ "epoch": 0.12555690562980965,
239
+ "grad_norm": 1.508130431175232,
240
+ "learning_rate": 9.582827055488053e-05,
241
+ "loss": 0.7142,
242
+ "step": 310
243
+ },
244
+ {
245
+ "epoch": 0.12960712839206157,
246
+ "grad_norm": 3.512599229812622,
247
+ "learning_rate": 9.569326312947212e-05,
248
+ "loss": 0.6657,
249
+ "step": 320
250
+ },
251
+ {
252
+ "epoch": 0.1336573511543135,
253
+ "grad_norm": 1.518929123878479,
254
+ "learning_rate": 9.555825570406373e-05,
255
+ "loss": 0.6845,
256
+ "step": 330
257
+ },
258
+ {
259
+ "epoch": 0.1377075739165654,
260
+ "grad_norm": 2.9133493900299072,
261
+ "learning_rate": 9.542324827865534e-05,
262
+ "loss": 0.7757,
263
+ "step": 340
264
+ },
265
+ {
266
+ "epoch": 0.14175779667881733,
267
+ "grad_norm": 4.808182239532471,
268
+ "learning_rate": 9.528824085324693e-05,
269
+ "loss": 0.6532,
270
+ "step": 350
271
+ },
272
+ {
273
+ "epoch": 0.14580801944106925,
274
+ "grad_norm": 4.0984392166137695,
275
+ "learning_rate": 9.515323342783854e-05,
276
+ "loss": 0.681,
277
+ "step": 360
278
+ },
279
+ {
280
+ "epoch": 0.1498582422033212,
281
+ "grad_norm": 3.545947551727295,
282
+ "learning_rate": 9.501822600243013e-05,
283
+ "loss": 0.7553,
284
+ "step": 370
285
+ },
286
+ {
287
+ "epoch": 0.15390846496557312,
288
+ "grad_norm": 6.998193264007568,
289
+ "learning_rate": 9.488321857702174e-05,
290
+ "loss": 0.6444,
291
+ "step": 380
292
+ },
293
+ {
294
+ "epoch": 0.15795868772782504,
295
+ "grad_norm": 1.2728239297866821,
296
+ "learning_rate": 9.474821115161335e-05,
297
+ "loss": 0.6596,
298
+ "step": 390
299
+ },
300
+ {
301
+ "epoch": 0.16200891049007696,
302
+ "grad_norm": 2.098156690597534,
303
+ "learning_rate": 9.461320372620494e-05,
304
+ "loss": 0.7261,
305
+ "step": 400
306
+ },
307
+ {
308
+ "epoch": 0.16605913325232888,
309
+ "grad_norm": 7.71750545501709,
310
+ "learning_rate": 9.447819630079655e-05,
311
+ "loss": 0.6531,
312
+ "step": 410
313
+ },
314
+ {
315
+ "epoch": 0.1701093560145808,
316
+ "grad_norm": 3.735727548599243,
317
+ "learning_rate": 9.434318887538815e-05,
318
+ "loss": 0.7082,
319
+ "step": 420
320
+ },
321
+ {
322
+ "epoch": 0.17415957877683272,
323
+ "grad_norm": 1.3447264432907104,
324
+ "learning_rate": 9.420818144997975e-05,
325
+ "loss": 0.7463,
326
+ "step": 430
327
+ },
328
+ {
329
+ "epoch": 0.17820980153908464,
330
+ "grad_norm": 1.6115808486938477,
331
+ "learning_rate": 9.407317402457136e-05,
332
+ "loss": 0.7036,
333
+ "step": 440
334
+ },
335
+ {
336
+ "epoch": 0.1822600243013366,
337
+ "grad_norm": 5.158803462982178,
338
+ "learning_rate": 9.393816659916295e-05,
339
+ "loss": 0.6981,
340
+ "step": 450
341
+ },
342
+ {
343
+ "epoch": 0.1863102470635885,
344
+ "grad_norm": 3.6466639041900635,
345
+ "learning_rate": 9.380315917375456e-05,
346
+ "loss": 0.7357,
347
+ "step": 460
348
+ },
349
+ {
350
+ "epoch": 0.19036046982584043,
351
+ "grad_norm": 3.898153781890869,
352
+ "learning_rate": 9.366815174834617e-05,
353
+ "loss": 0.6626,
354
+ "step": 470
355
+ },
356
+ {
357
+ "epoch": 0.19441069258809235,
358
+ "grad_norm": 5.203450679779053,
359
+ "learning_rate": 9.353314432293777e-05,
360
+ "loss": 0.7449,
361
+ "step": 480
362
+ },
363
+ {
364
+ "epoch": 0.19846091535034427,
365
+ "grad_norm": 2.795436143875122,
366
+ "learning_rate": 9.339813689752937e-05,
367
+ "loss": 0.6609,
368
+ "step": 490
369
+ },
370
+ {
371
+ "epoch": 0.2025111381125962,
372
+ "grad_norm": 5.741515636444092,
373
+ "learning_rate": 9.326312947212096e-05,
374
+ "loss": 0.6996,
375
+ "step": 500
376
+ },
377
+ {
378
+ "epoch": 0.2065613608748481,
379
+ "grad_norm": 7.058830738067627,
380
+ "learning_rate": 9.312812204671257e-05,
381
+ "loss": 0.6827,
382
+ "step": 510
383
+ },
384
+ {
385
+ "epoch": 0.21061158363710003,
386
+ "grad_norm": 5.432498931884766,
387
+ "learning_rate": 9.299311462130418e-05,
388
+ "loss": 0.7386,
389
+ "step": 520
390
+ },
391
+ {
392
+ "epoch": 0.21466180639935195,
393
+ "grad_norm": 4.78528356552124,
394
+ "learning_rate": 9.285810719589578e-05,
395
+ "loss": 0.6675,
396
+ "step": 530
397
+ },
398
+ {
399
+ "epoch": 0.2187120291616039,
400
+ "grad_norm": 4.96106481552124,
401
+ "learning_rate": 9.272309977048738e-05,
402
+ "loss": 0.7469,
403
+ "step": 540
404
+ },
405
+ {
406
+ "epoch": 0.22276225192385582,
407
+ "grad_norm": 4.435976982116699,
408
+ "learning_rate": 9.258809234507899e-05,
409
+ "loss": 0.6679,
410
+ "step": 550
411
+ },
412
+ {
413
+ "epoch": 0.22681247468610774,
414
+ "grad_norm": 2.9775969982147217,
415
+ "learning_rate": 9.245308491967058e-05,
416
+ "loss": 0.7926,
417
+ "step": 560
418
+ },
419
+ {
420
+ "epoch": 0.23086269744835966,
421
+ "grad_norm": 6.675817012786865,
422
+ "learning_rate": 9.23180774942622e-05,
423
+ "loss": 0.6973,
424
+ "step": 570
425
+ },
426
+ {
427
+ "epoch": 0.23491292021061158,
428
+ "grad_norm": 3.2485146522521973,
429
+ "learning_rate": 9.218307006885379e-05,
430
+ "loss": 0.636,
431
+ "step": 580
432
+ },
433
+ {
434
+ "epoch": 0.2389631429728635,
435
+ "grad_norm": 5.162914752960205,
436
+ "learning_rate": 9.20480626434454e-05,
437
+ "loss": 0.7364,
438
+ "step": 590
439
+ },
440
+ {
441
+ "epoch": 0.24301336573511542,
442
+ "grad_norm": 1.276663899421692,
443
+ "learning_rate": 9.1913055218037e-05,
444
+ "loss": 0.6902,
445
+ "step": 600
446
+ },
447
+ {
448
+ "epoch": 0.24706358849736734,
449
+ "grad_norm": 6.817727565765381,
450
+ "learning_rate": 9.17780477926286e-05,
451
+ "loss": 0.7476,
452
+ "step": 610
453
+ },
454
+ {
455
+ "epoch": 0.2511138112596193,
456
+ "grad_norm": 1.8886030912399292,
457
+ "learning_rate": 9.16430403672202e-05,
458
+ "loss": 0.7089,
459
+ "step": 620
460
+ },
461
+ {
462
+ "epoch": 0.2551640340218712,
463
+ "grad_norm": 4.143105506896973,
464
+ "learning_rate": 9.15080329418118e-05,
465
+ "loss": 0.662,
466
+ "step": 630
467
+ },
468
+ {
469
+ "epoch": 0.25921425678412313,
470
+ "grad_norm": 3.767601490020752,
471
+ "learning_rate": 9.137302551640341e-05,
472
+ "loss": 0.6633,
473
+ "step": 640
474
+ },
475
+ {
476
+ "epoch": 0.26326447954637505,
477
+ "grad_norm": 4.995476245880127,
478
+ "learning_rate": 9.123801809099502e-05,
479
+ "loss": 0.7448,
480
+ "step": 650
481
+ },
482
+ {
483
+ "epoch": 0.267314702308627,
484
+ "grad_norm": 7.73383903503418,
485
+ "learning_rate": 9.11030106655866e-05,
486
+ "loss": 0.7195,
487
+ "step": 660
488
+ },
489
+ {
490
+ "epoch": 0.2713649250708789,
491
+ "grad_norm": 6.871426582336426,
492
+ "learning_rate": 9.096800324017821e-05,
493
+ "loss": 0.6511,
494
+ "step": 670
495
+ },
496
+ {
497
+ "epoch": 0.2754151478331308,
498
+ "grad_norm": 3.914733409881592,
499
+ "learning_rate": 9.083299581476981e-05,
500
+ "loss": 0.7573,
501
+ "step": 680
502
+ },
503
+ {
504
+ "epoch": 0.27946537059538273,
505
+ "grad_norm": 10.242081642150879,
506
+ "learning_rate": 9.069798838936142e-05,
507
+ "loss": 0.6825,
508
+ "step": 690
509
+ },
510
+ {
511
+ "epoch": 0.28351559335763465,
512
+ "grad_norm": 6.312399864196777,
513
+ "learning_rate": 9.056298096395303e-05,
514
+ "loss": 0.7502,
515
+ "step": 700
516
+ },
517
+ {
518
+ "epoch": 0.2875658161198866,
519
+ "grad_norm": 3.6532344818115234,
520
+ "learning_rate": 9.042797353854461e-05,
521
+ "loss": 0.7196,
522
+ "step": 710
523
+ },
524
+ {
525
+ "epoch": 0.2916160388821385,
526
+ "grad_norm": 7.512033462524414,
527
+ "learning_rate": 9.029296611313622e-05,
528
+ "loss": 0.6044,
529
+ "step": 720
530
+ },
531
+ {
532
+ "epoch": 0.2956662616443904,
533
+ "grad_norm": 3.490694284439087,
534
+ "learning_rate": 9.015795868772783e-05,
535
+ "loss": 0.6686,
536
+ "step": 730
537
+ },
538
+ {
539
+ "epoch": 0.2997164844066424,
540
+ "grad_norm": 2.284693479537964,
541
+ "learning_rate": 9.002295126231943e-05,
542
+ "loss": 0.6267,
543
+ "step": 740
544
+ },
545
+ {
546
+ "epoch": 0.3037667071688943,
547
+ "grad_norm": 5.484620571136475,
548
+ "learning_rate": 8.988794383691104e-05,
549
+ "loss": 0.7032,
550
+ "step": 750
551
+ },
552
+ {
553
+ "epoch": 0.30781692993114623,
554
+ "grad_norm": 2.1748404502868652,
555
+ "learning_rate": 8.975293641150264e-05,
556
+ "loss": 0.6879,
557
+ "step": 760
558
+ },
559
+ {
560
+ "epoch": 0.31186715269339815,
561
+ "grad_norm": 1.6475012302398682,
562
+ "learning_rate": 8.961792898609423e-05,
563
+ "loss": 0.6903,
564
+ "step": 770
565
+ },
566
+ {
567
+ "epoch": 0.3159173754556501,
568
+ "grad_norm": 1.1322206258773804,
569
+ "learning_rate": 8.948292156068584e-05,
570
+ "loss": 0.6528,
571
+ "step": 780
572
+ },
573
+ {
574
+ "epoch": 0.319967598217902,
575
+ "grad_norm": 2.2691164016723633,
576
+ "learning_rate": 8.934791413527744e-05,
577
+ "loss": 0.6407,
578
+ "step": 790
579
+ },
580
+ {
581
+ "epoch": 0.3240178209801539,
582
+ "grad_norm": 4.608505725860596,
583
+ "learning_rate": 8.921290670986905e-05,
584
+ "loss": 0.7131,
585
+ "step": 800
586
+ },
587
+ {
588
+ "epoch": 0.32806804374240583,
589
+ "grad_norm": 1.64627206325531,
590
+ "learning_rate": 8.907789928446065e-05,
591
+ "loss": 0.6729,
592
+ "step": 810
593
+ },
594
+ {
595
+ "epoch": 0.33211826650465776,
596
+ "grad_norm": 2.928720235824585,
597
+ "learning_rate": 8.894289185905226e-05,
598
+ "loss": 0.673,
599
+ "step": 820
600
+ },
601
+ {
602
+ "epoch": 0.3361684892669097,
603
+ "grad_norm": 1.43545663356781,
604
+ "learning_rate": 8.880788443364385e-05,
605
+ "loss": 0.6354,
606
+ "step": 830
607
+ },
608
+ {
609
+ "epoch": 0.3402187120291616,
610
+ "grad_norm": 3.5672547817230225,
611
+ "learning_rate": 8.867287700823545e-05,
612
+ "loss": 0.6499,
613
+ "step": 840
614
+ },
615
+ {
616
+ "epoch": 0.3442689347914135,
617
+ "grad_norm": 4.033609867095947,
618
+ "learning_rate": 8.853786958282706e-05,
619
+ "loss": 0.7174,
620
+ "step": 850
621
+ },
622
+ {
623
+ "epoch": 0.34831915755366544,
624
+ "grad_norm": 3.793355941772461,
625
+ "learning_rate": 8.840286215741867e-05,
626
+ "loss": 0.7206,
627
+ "step": 860
628
+ },
629
+ {
630
+ "epoch": 0.35236938031591736,
631
+ "grad_norm": 4.056700706481934,
632
+ "learning_rate": 8.826785473201027e-05,
633
+ "loss": 0.7267,
634
+ "step": 870
635
+ },
636
+ {
637
+ "epoch": 0.3564196030781693,
638
+ "grad_norm": 2.8090407848358154,
639
+ "learning_rate": 8.813284730660186e-05,
640
+ "loss": 0.6465,
641
+ "step": 880
642
+ },
643
+ {
644
+ "epoch": 0.3604698258404212,
645
+ "grad_norm": 3.3442647457122803,
646
+ "learning_rate": 8.799783988119346e-05,
647
+ "loss": 0.6746,
648
+ "step": 890
649
+ },
650
+ {
651
+ "epoch": 0.3645200486026732,
652
+ "grad_norm": 5.908054828643799,
653
+ "learning_rate": 8.786283245578507e-05,
654
+ "loss": 0.6155,
655
+ "step": 900
656
+ },
657
+ {
658
+ "epoch": 0.3685702713649251,
659
+ "grad_norm": 7.263230800628662,
660
+ "learning_rate": 8.772782503037668e-05,
661
+ "loss": 0.6545,
662
+ "step": 910
663
+ },
664
+ {
665
+ "epoch": 0.372620494127177,
666
+ "grad_norm": 3.1377646923065186,
667
+ "learning_rate": 8.759281760496828e-05,
668
+ "loss": 0.7712,
669
+ "step": 920
670
+ },
671
+ {
672
+ "epoch": 0.37667071688942894,
673
+ "grad_norm": 1.6817662715911865,
674
+ "learning_rate": 8.745781017955989e-05,
675
+ "loss": 0.6771,
676
+ "step": 930
677
+ },
678
+ {
679
+ "epoch": 0.38072093965168086,
680
+ "grad_norm": 3.2026214599609375,
681
+ "learning_rate": 8.732280275415148e-05,
682
+ "loss": 0.7437,
683
+ "step": 940
684
+ },
685
+ {
686
+ "epoch": 0.3847711624139328,
687
+ "grad_norm": 1.632327914237976,
688
+ "learning_rate": 8.718779532874308e-05,
689
+ "loss": 0.7053,
690
+ "step": 950
691
+ },
692
+ {
693
+ "epoch": 0.3888213851761847,
694
+ "grad_norm": 3.373493194580078,
695
+ "learning_rate": 8.705278790333469e-05,
696
+ "loss": 0.7054,
697
+ "step": 960
698
+ },
699
+ {
700
+ "epoch": 0.3928716079384366,
701
+ "grad_norm": 2.388636827468872,
702
+ "learning_rate": 8.691778047792629e-05,
703
+ "loss": 0.6703,
704
+ "step": 970
705
+ },
706
+ {
707
+ "epoch": 0.39692183070068854,
708
+ "grad_norm": 1.4855486154556274,
709
+ "learning_rate": 8.67827730525179e-05,
710
+ "loss": 0.6924,
711
+ "step": 980
712
+ },
713
+ {
714
+ "epoch": 0.40097205346294046,
715
+ "grad_norm": 1.8625538349151611,
716
+ "learning_rate": 8.66477656271095e-05,
717
+ "loss": 0.6808,
718
+ "step": 990
719
+ },
720
+ {
721
+ "epoch": 0.4050222762251924,
722
+ "grad_norm": 9.942273139953613,
723
+ "learning_rate": 8.651275820170109e-05,
724
+ "loss": 0.6786,
725
+ "step": 1000
726
+ },
727
+ {
728
+ "epoch": 0.4090724989874443,
729
+ "grad_norm": 6.859795570373535,
730
+ "learning_rate": 8.63777507762927e-05,
731
+ "loss": 0.7188,
732
+ "step": 1010
733
+ },
734
+ {
735
+ "epoch": 0.4131227217496962,
736
+ "grad_norm": 6.556404113769531,
737
+ "learning_rate": 8.62427433508843e-05,
738
+ "loss": 0.7225,
739
+ "step": 1020
740
+ },
741
+ {
742
+ "epoch": 0.41717294451194814,
743
+ "grad_norm": 4.393095016479492,
744
+ "learning_rate": 8.610773592547591e-05,
745
+ "loss": 0.6596,
746
+ "step": 1030
747
+ },
748
+ {
749
+ "epoch": 0.42122316727420006,
750
+ "grad_norm": 6.862849235534668,
751
+ "learning_rate": 8.597272850006752e-05,
752
+ "loss": 0.6699,
753
+ "step": 1040
754
+ },
755
+ {
756
+ "epoch": 0.425273390036452,
757
+ "grad_norm": 3.898430347442627,
758
+ "learning_rate": 8.58377210746591e-05,
759
+ "loss": 0.6913,
760
+ "step": 1050
761
+ },
762
+ {
763
+ "epoch": 0.4293236127987039,
764
+ "grad_norm": 3.683239459991455,
765
+ "learning_rate": 8.570271364925071e-05,
766
+ "loss": 0.6312,
767
+ "step": 1060
768
+ },
769
+ {
770
+ "epoch": 0.4333738355609559,
771
+ "grad_norm": 1.4455596208572388,
772
+ "learning_rate": 8.556770622384232e-05,
773
+ "loss": 0.7048,
774
+ "step": 1070
775
+ },
776
+ {
777
+ "epoch": 0.4374240583232078,
778
+ "grad_norm": 2.023144483566284,
779
+ "learning_rate": 8.543269879843392e-05,
780
+ "loss": 0.6634,
781
+ "step": 1080
782
+ },
783
+ {
784
+ "epoch": 0.4414742810854597,
785
+ "grad_norm": 1.454586386680603,
786
+ "learning_rate": 8.529769137302553e-05,
787
+ "loss": 0.639,
788
+ "step": 1090
789
+ },
790
+ {
791
+ "epoch": 0.44552450384771164,
792
+ "grad_norm": 2.0027737617492676,
793
+ "learning_rate": 8.516268394761712e-05,
794
+ "loss": 0.7068,
795
+ "step": 1100
796
+ },
797
+ {
798
+ "epoch": 0.44957472660996356,
799
+ "grad_norm": 1.4496852159500122,
800
+ "learning_rate": 8.502767652220872e-05,
801
+ "loss": 0.7654,
802
+ "step": 1110
803
+ },
804
+ {
805
+ "epoch": 0.4536249493722155,
806
+ "grad_norm": 1.5118701457977295,
807
+ "learning_rate": 8.489266909680033e-05,
808
+ "loss": 0.7211,
809
+ "step": 1120
810
+ },
811
+ {
812
+ "epoch": 0.4576751721344674,
813
+ "grad_norm": 4.295745372772217,
814
+ "learning_rate": 8.475766167139193e-05,
815
+ "loss": 0.6662,
816
+ "step": 1130
817
+ },
818
+ {
819
+ "epoch": 0.4617253948967193,
820
+ "grad_norm": 1.5748212337493896,
821
+ "learning_rate": 8.462265424598354e-05,
822
+ "loss": 0.6695,
823
+ "step": 1140
824
+ },
825
+ {
826
+ "epoch": 0.46577561765897124,
827
+ "grad_norm": 3.8355488777160645,
828
+ "learning_rate": 8.448764682057513e-05,
829
+ "loss": 0.6277,
830
+ "step": 1150
831
+ },
832
+ {
833
+ "epoch": 0.46982584042122316,
834
+ "grad_norm": 2.747668743133545,
835
+ "learning_rate": 8.435263939516674e-05,
836
+ "loss": 0.689,
837
+ "step": 1160
838
+ },
839
+ {
840
+ "epoch": 0.4738760631834751,
841
+ "grad_norm": 3.341548204421997,
842
+ "learning_rate": 8.421763196975834e-05,
843
+ "loss": 0.6295,
844
+ "step": 1170
845
+ },
846
+ {
847
+ "epoch": 0.477926285945727,
848
+ "grad_norm": 2.240903854370117,
849
+ "learning_rate": 8.408262454434994e-05,
850
+ "loss": 0.6915,
851
+ "step": 1180
852
+ },
853
+ {
854
+ "epoch": 0.4819765087079789,
855
+ "grad_norm": 2.9906082153320312,
856
+ "learning_rate": 8.394761711894155e-05,
857
+ "loss": 0.7148,
858
+ "step": 1190
859
+ },
860
+ {
861
+ "epoch": 0.48602673147023084,
862
+ "grad_norm": 3.217128276824951,
863
+ "learning_rate": 8.381260969353314e-05,
864
+ "loss": 0.7287,
865
+ "step": 1200
866
+ },
867
+ {
868
+ "epoch": 0.49007695423248276,
869
+ "grad_norm": 5.23323392868042,
870
+ "learning_rate": 8.367760226812475e-05,
871
+ "loss": 0.651,
872
+ "step": 1210
873
+ },
874
+ {
875
+ "epoch": 0.4941271769947347,
876
+ "grad_norm": 1.4018148183822632,
877
+ "learning_rate": 8.354259484271635e-05,
878
+ "loss": 0.7042,
879
+ "step": 1220
880
+ },
881
+ {
882
+ "epoch": 0.49817739975698666,
883
+ "grad_norm": 5.82692813873291,
884
+ "learning_rate": 8.340758741730795e-05,
885
+ "loss": 0.6557,
886
+ "step": 1230
887
+ },
888
+ {
889
+ "epoch": 0.5022276225192386,
890
+ "grad_norm": 4.461226940155029,
891
+ "learning_rate": 8.327257999189956e-05,
892
+ "loss": 0.7046,
893
+ "step": 1240
894
+ },
895
+ {
896
+ "epoch": 0.5062778452814904,
897
+ "grad_norm": 1.4963361024856567,
898
+ "learning_rate": 8.313757256649117e-05,
899
+ "loss": 0.6771,
900
+ "step": 1250
901
+ },
902
+ {
903
+ "epoch": 0.5103280680437424,
904
+ "grad_norm": 7.774729251861572,
905
+ "learning_rate": 8.300256514108276e-05,
906
+ "loss": 0.6568,
907
+ "step": 1260
908
+ },
909
+ {
910
+ "epoch": 0.5143782908059943,
911
+ "grad_norm": 10.685174942016602,
912
+ "learning_rate": 8.286755771567437e-05,
913
+ "loss": 0.7036,
914
+ "step": 1270
915
+ },
916
+ {
917
+ "epoch": 0.5184285135682463,
918
+ "grad_norm": 7.026299953460693,
919
+ "learning_rate": 8.273255029026596e-05,
920
+ "loss": 0.6717,
921
+ "step": 1280
922
+ },
923
+ {
924
+ "epoch": 0.5224787363304981,
925
+ "grad_norm": 1.4644521474838257,
926
+ "learning_rate": 8.259754286485757e-05,
927
+ "loss": 0.7273,
928
+ "step": 1290
929
+ },
930
+ {
931
+ "epoch": 0.5265289590927501,
932
+ "grad_norm": 4.5040693283081055,
933
+ "learning_rate": 8.246253543944918e-05,
934
+ "loss": 0.6519,
935
+ "step": 1300
936
+ },
937
+ {
938
+ "epoch": 0.5305791818550021,
939
+ "grad_norm": 10.927738189697266,
940
+ "learning_rate": 8.232752801404077e-05,
941
+ "loss": 0.69,
942
+ "step": 1310
943
+ },
944
+ {
945
+ "epoch": 0.534629404617254,
946
+ "grad_norm": 2.803307294845581,
947
+ "learning_rate": 8.219252058863238e-05,
948
+ "loss": 0.7042,
949
+ "step": 1320
950
+ },
951
+ {
952
+ "epoch": 0.5386796273795059,
953
+ "grad_norm": 2.241880416870117,
954
+ "learning_rate": 8.205751316322398e-05,
955
+ "loss": 0.6556,
956
+ "step": 1330
957
+ },
958
+ {
959
+ "epoch": 0.5427298501417578,
960
+ "grad_norm": 1.8177238702774048,
961
+ "learning_rate": 8.192250573781558e-05,
962
+ "loss": 0.7063,
963
+ "step": 1340
964
+ },
965
+ {
966
+ "epoch": 0.5467800729040098,
967
+ "grad_norm": 3.659595251083374,
968
+ "learning_rate": 8.178749831240719e-05,
969
+ "loss": 0.6645,
970
+ "step": 1350
971
+ },
972
+ {
973
+ "epoch": 0.5508302956662616,
974
+ "grad_norm": 2.6234257221221924,
975
+ "learning_rate": 8.165249088699878e-05,
976
+ "loss": 0.6759,
977
+ "step": 1360
978
+ },
979
+ {
980
+ "epoch": 0.5548805184285136,
981
+ "grad_norm": 7.211588382720947,
982
+ "learning_rate": 8.15174834615904e-05,
983
+ "loss": 0.6575,
984
+ "step": 1370
985
+ },
986
+ {
987
+ "epoch": 0.5589307411907655,
988
+ "grad_norm": 3.50372314453125,
989
+ "learning_rate": 8.1382476036182e-05,
990
+ "loss": 0.7123,
991
+ "step": 1380
992
+ },
993
+ {
994
+ "epoch": 0.5629809639530174,
995
+ "grad_norm": 1.490511178970337,
996
+ "learning_rate": 8.124746861077359e-05,
997
+ "loss": 0.6976,
998
+ "step": 1390
999
+ },
1000
+ {
1001
+ "epoch": 0.5670311867152693,
1002
+ "grad_norm": 11.36801815032959,
1003
+ "learning_rate": 8.11124611853652e-05,
1004
+ "loss": 0.6626,
1005
+ "step": 1400
1006
+ },
1007
+ {
1008
+ "epoch": 0.5710814094775213,
1009
+ "grad_norm": 6.033326148986816,
1010
+ "learning_rate": 8.09774537599568e-05,
1011
+ "loss": 0.6583,
1012
+ "step": 1410
1013
+ },
1014
+ {
1015
+ "epoch": 0.5751316322397731,
1016
+ "grad_norm": 4.926746368408203,
1017
+ "learning_rate": 8.08424463345484e-05,
1018
+ "loss": 0.6809,
1019
+ "step": 1420
1020
+ },
1021
+ {
1022
+ "epoch": 0.5791818550020251,
1023
+ "grad_norm": 1.5155569314956665,
1024
+ "learning_rate": 8.070743890914001e-05,
1025
+ "loss": 0.8433,
1026
+ "step": 1430
1027
+ },
1028
+ {
1029
+ "epoch": 0.583232077764277,
1030
+ "grad_norm": 7.069480895996094,
1031
+ "learning_rate": 8.057243148373161e-05,
1032
+ "loss": 0.6695,
1033
+ "step": 1440
1034
+ },
1035
+ {
1036
+ "epoch": 0.587282300526529,
1037
+ "grad_norm": 3.5053625106811523,
1038
+ "learning_rate": 8.043742405832321e-05,
1039
+ "loss": 0.712,
1040
+ "step": 1450
1041
+ },
1042
+ {
1043
+ "epoch": 0.5913325232887808,
1044
+ "grad_norm": 8.239558219909668,
1045
+ "learning_rate": 8.030241663291482e-05,
1046
+ "loss": 0.6655,
1047
+ "step": 1460
1048
+ },
1049
+ {
1050
+ "epoch": 0.5953827460510328,
1051
+ "grad_norm": 3.499582052230835,
1052
+ "learning_rate": 8.016740920750641e-05,
1053
+ "loss": 0.6695,
1054
+ "step": 1470
1055
+ },
1056
+ {
1057
+ "epoch": 0.5994329688132848,
1058
+ "grad_norm": 6.012964725494385,
1059
+ "learning_rate": 8.003240178209802e-05,
1060
+ "loss": 0.6664,
1061
+ "step": 1480
1062
+ },
1063
+ {
1064
+ "epoch": 0.6034831915755366,
1065
+ "grad_norm": 8.99488353729248,
1066
+ "learning_rate": 7.989739435668962e-05,
1067
+ "loss": 0.6465,
1068
+ "step": 1490
1069
+ },
1070
+ {
1071
+ "epoch": 0.6075334143377886,
1072
+ "grad_norm": 1.608788013458252,
1073
+ "learning_rate": 7.976238693128122e-05,
1074
+ "loss": 0.6762,
1075
+ "step": 1500
1076
+ },
1077
+ {
1078
+ "epoch": 0.6115836371000405,
1079
+ "grad_norm": 7.744218349456787,
1080
+ "learning_rate": 7.962737950587283e-05,
1081
+ "loss": 0.7005,
1082
+ "step": 1510
1083
+ },
1084
+ {
1085
+ "epoch": 0.6156338598622925,
1086
+ "grad_norm": 1.5159974098205566,
1087
+ "learning_rate": 7.949237208046442e-05,
1088
+ "loss": 0.7335,
1089
+ "step": 1520
1090
+ },
1091
+ {
1092
+ "epoch": 0.6196840826245443,
1093
+ "grad_norm": 5.266475200653076,
1094
+ "learning_rate": 7.935736465505604e-05,
1095
+ "loss": 0.6717,
1096
+ "step": 1530
1097
+ },
1098
+ {
1099
+ "epoch": 0.6237343053867963,
1100
+ "grad_norm": 4.198414325714111,
1101
+ "learning_rate": 7.922235722964763e-05,
1102
+ "loss": 0.6076,
1103
+ "step": 1540
1104
+ },
1105
+ {
1106
+ "epoch": 0.6277845281490482,
1107
+ "grad_norm": 6.395306587219238,
1108
+ "learning_rate": 7.908734980423924e-05,
1109
+ "loss": 0.7478,
1110
+ "step": 1550
1111
+ },
1112
+ {
1113
+ "epoch": 0.6318347509113001,
1114
+ "grad_norm": 7.0668182373046875,
1115
+ "learning_rate": 7.895234237883084e-05,
1116
+ "loss": 0.6454,
1117
+ "step": 1560
1118
+ },
1119
+ {
1120
+ "epoch": 0.635884973673552,
1121
+ "grad_norm": 5.721327781677246,
1122
+ "learning_rate": 7.881733495342243e-05,
1123
+ "loss": 0.6866,
1124
+ "step": 1570
1125
+ },
1126
+ {
1127
+ "epoch": 0.639935196435804,
1128
+ "grad_norm": 1.5752720832824707,
1129
+ "learning_rate": 7.868232752801405e-05,
1130
+ "loss": 0.6574,
1131
+ "step": 1580
1132
+ },
1133
+ {
1134
+ "epoch": 0.6439854191980559,
1135
+ "grad_norm": 3.1622705459594727,
1136
+ "learning_rate": 7.854732010260564e-05,
1137
+ "loss": 0.6968,
1138
+ "step": 1590
1139
+ },
1140
+ {
1141
+ "epoch": 0.6480356419603078,
1142
+ "grad_norm": 2.200968027114868,
1143
+ "learning_rate": 7.841231267719725e-05,
1144
+ "loss": 0.7232,
1145
+ "step": 1600
1146
+ },
1147
+ {
1148
+ "epoch": 0.6520858647225597,
1149
+ "grad_norm": 5.88210391998291,
1150
+ "learning_rate": 7.827730525178886e-05,
1151
+ "loss": 0.6027,
1152
+ "step": 1610
1153
+ },
1154
+ {
1155
+ "epoch": 0.6561360874848117,
1156
+ "grad_norm": 2.330373764038086,
1157
+ "learning_rate": 7.814229782638044e-05,
1158
+ "loss": 0.7234,
1159
+ "step": 1620
1160
+ },
1161
+ {
1162
+ "epoch": 0.6601863102470635,
1163
+ "grad_norm": 3.580780029296875,
1164
+ "learning_rate": 7.800729040097206e-05,
1165
+ "loss": 0.6395,
1166
+ "step": 1630
1167
+ },
1168
+ {
1169
+ "epoch": 0.6642365330093155,
1170
+ "grad_norm": 3.347949504852295,
1171
+ "learning_rate": 7.787228297556367e-05,
1172
+ "loss": 0.7026,
1173
+ "step": 1640
1174
+ },
1175
+ {
1176
+ "epoch": 0.6682867557715675,
1177
+ "grad_norm": 2.739063024520874,
1178
+ "learning_rate": 7.773727555015526e-05,
1179
+ "loss": 0.7006,
1180
+ "step": 1650
1181
+ },
1182
+ {
1183
+ "epoch": 0.6723369785338194,
1184
+ "grad_norm": 1.8707573413848877,
1185
+ "learning_rate": 7.760226812474687e-05,
1186
+ "loss": 0.6809,
1187
+ "step": 1660
1188
+ },
1189
+ {
1190
+ "epoch": 0.6763872012960713,
1191
+ "grad_norm": 1.339418888092041,
1192
+ "learning_rate": 7.746726069933847e-05,
1193
+ "loss": 0.6299,
1194
+ "step": 1670
1195
+ },
1196
+ {
1197
+ "epoch": 0.6804374240583232,
1198
+ "grad_norm": 1.8292454481124878,
1199
+ "learning_rate": 7.733225327393007e-05,
1200
+ "loss": 0.6736,
1201
+ "step": 1680
1202
+ },
1203
+ {
1204
+ "epoch": 0.6844876468205752,
1205
+ "grad_norm": 8.511611938476562,
1206
+ "learning_rate": 7.719724584852168e-05,
1207
+ "loss": 0.7129,
1208
+ "step": 1690
1209
+ },
1210
+ {
1211
+ "epoch": 0.688537869582827,
1212
+ "grad_norm": 5.8545379638671875,
1213
+ "learning_rate": 7.706223842311327e-05,
1214
+ "loss": 0.7271,
1215
+ "step": 1700
1216
+ },
1217
+ {
1218
+ "epoch": 0.692588092345079,
1219
+ "grad_norm": 3.0557925701141357,
1220
+ "learning_rate": 7.692723099770488e-05,
1221
+ "loss": 0.7228,
1222
+ "step": 1710
1223
+ },
1224
+ {
1225
+ "epoch": 0.6966383151073309,
1226
+ "grad_norm": 7.087495803833008,
1227
+ "learning_rate": 7.679222357229648e-05,
1228
+ "loss": 0.6529,
1229
+ "step": 1720
1230
+ },
1231
+ {
1232
+ "epoch": 0.7006885378695829,
1233
+ "grad_norm": 1.2610193490982056,
1234
+ "learning_rate": 7.665721614688808e-05,
1235
+ "loss": 0.6423,
1236
+ "step": 1730
1237
+ },
1238
+ {
1239
+ "epoch": 0.7047387606318347,
1240
+ "grad_norm": 3.634399175643921,
1241
+ "learning_rate": 7.652220872147969e-05,
1242
+ "loss": 0.6765,
1243
+ "step": 1740
1244
+ },
1245
+ {
1246
+ "epoch": 0.7087889833940867,
1247
+ "grad_norm": 2.021672248840332,
1248
+ "learning_rate": 7.638720129607128e-05,
1249
+ "loss": 0.6853,
1250
+ "step": 1750
1251
+ },
1252
+ {
1253
+ "epoch": 0.7128392061563386,
1254
+ "grad_norm": 1.397925615310669,
1255
+ "learning_rate": 7.625219387066289e-05,
1256
+ "loss": 0.6029,
1257
+ "step": 1760
1258
+ },
1259
+ {
1260
+ "epoch": 0.7168894289185905,
1261
+ "grad_norm": 2.4325027465820312,
1262
+ "learning_rate": 7.61171864452545e-05,
1263
+ "loss": 0.6678,
1264
+ "step": 1770
1265
+ },
1266
+ {
1267
+ "epoch": 0.7209396516808424,
1268
+ "grad_norm": 1.489314317703247,
1269
+ "learning_rate": 7.59821790198461e-05,
1270
+ "loss": 0.6743,
1271
+ "step": 1780
1272
+ },
1273
+ {
1274
+ "epoch": 0.7249898744430944,
1275
+ "grad_norm": 1.6524492502212524,
1276
+ "learning_rate": 7.58471715944377e-05,
1277
+ "loss": 0.6636,
1278
+ "step": 1790
1279
+ },
1280
+ {
1281
+ "epoch": 0.7290400972053463,
1282
+ "grad_norm": 2.6753315925598145,
1283
+ "learning_rate": 7.571216416902929e-05,
1284
+ "loss": 0.7202,
1285
+ "step": 1800
1286
+ },
1287
+ {
1288
+ "epoch": 0.7330903199675982,
1289
+ "grad_norm": 5.01415491104126,
1290
+ "learning_rate": 7.55771567436209e-05,
1291
+ "loss": 0.7076,
1292
+ "step": 1810
1293
+ },
1294
+ {
1295
+ "epoch": 0.7371405427298502,
1296
+ "grad_norm": 4.823099613189697,
1297
+ "learning_rate": 7.544214931821251e-05,
1298
+ "loss": 0.7228,
1299
+ "step": 1820
1300
+ },
1301
+ {
1302
+ "epoch": 0.741190765492102,
1303
+ "grad_norm": 7.68076753616333,
1304
+ "learning_rate": 7.530714189280411e-05,
1305
+ "loss": 0.6443,
1306
+ "step": 1830
1307
+ },
1308
+ {
1309
+ "epoch": 0.745240988254354,
1310
+ "grad_norm": 8.639177322387695,
1311
+ "learning_rate": 7.51721344673957e-05,
1312
+ "loss": 0.6837,
1313
+ "step": 1840
1314
+ },
1315
+ {
1316
+ "epoch": 0.7492912110166059,
1317
+ "grad_norm": 2.379915237426758,
1318
+ "learning_rate": 7.503712704198732e-05,
1319
+ "loss": 0.6505,
1320
+ "step": 1850
1321
+ },
1322
+ {
1323
+ "epoch": 0.7533414337788579,
1324
+ "grad_norm": 2.4480419158935547,
1325
+ "learning_rate": 7.490211961657891e-05,
1326
+ "loss": 0.6698,
1327
+ "step": 1860
1328
+ },
1329
+ {
1330
+ "epoch": 0.7573916565411097,
1331
+ "grad_norm": 1.723561406135559,
1332
+ "learning_rate": 7.476711219117052e-05,
1333
+ "loss": 0.7275,
1334
+ "step": 1870
1335
+ },
1336
+ {
1337
+ "epoch": 0.7614418793033617,
1338
+ "grad_norm": 2.184816837310791,
1339
+ "learning_rate": 7.463210476576212e-05,
1340
+ "loss": 0.6425,
1341
+ "step": 1880
1342
+ },
1343
+ {
1344
+ "epoch": 0.7654921020656136,
1345
+ "grad_norm": 2.710773468017578,
1346
+ "learning_rate": 7.449709734035373e-05,
1347
+ "loss": 0.7748,
1348
+ "step": 1890
1349
+ },
1350
+ {
1351
+ "epoch": 0.7695423248278656,
1352
+ "grad_norm": 2.839587688446045,
1353
+ "learning_rate": 7.436208991494533e-05,
1354
+ "loss": 0.6263,
1355
+ "step": 1900
1356
+ },
1357
+ {
1358
+ "epoch": 0.7735925475901174,
1359
+ "grad_norm": 5.725647449493408,
1360
+ "learning_rate": 7.422708248953692e-05,
1361
+ "loss": 0.7141,
1362
+ "step": 1910
1363
+ },
1364
+ {
1365
+ "epoch": 0.7776427703523694,
1366
+ "grad_norm": 5.777345180511475,
1367
+ "learning_rate": 7.409207506412853e-05,
1368
+ "loss": 0.7056,
1369
+ "step": 1920
1370
+ },
1371
+ {
1372
+ "epoch": 0.7816929931146213,
1373
+ "grad_norm": 7.272107124328613,
1374
+ "learning_rate": 7.395706763872013e-05,
1375
+ "loss": 0.6735,
1376
+ "step": 1930
1377
+ },
1378
+ {
1379
+ "epoch": 0.7857432158768732,
1380
+ "grad_norm": 4.4248552322387695,
1381
+ "learning_rate": 7.382206021331174e-05,
1382
+ "loss": 0.6701,
1383
+ "step": 1940
1384
+ },
1385
+ {
1386
+ "epoch": 0.7897934386391251,
1387
+ "grad_norm": 1.2805415391921997,
1388
+ "learning_rate": 7.368705278790335e-05,
1389
+ "loss": 0.6853,
1390
+ "step": 1950
1391
+ },
1392
+ {
1393
+ "epoch": 0.7938436614013771,
1394
+ "grad_norm": 3.7856080532073975,
1395
+ "learning_rate": 7.355204536249493e-05,
1396
+ "loss": 0.6966,
1397
+ "step": 1960
1398
+ },
1399
+ {
1400
+ "epoch": 0.797893884163629,
1401
+ "grad_norm": 2.6894679069519043,
1402
+ "learning_rate": 7.341703793708654e-05,
1403
+ "loss": 0.6878,
1404
+ "step": 1970
1405
+ },
1406
+ {
1407
+ "epoch": 0.8019441069258809,
1408
+ "grad_norm": 1.721479058265686,
1409
+ "learning_rate": 7.328203051167815e-05,
1410
+ "loss": 0.6527,
1411
+ "step": 1980
1412
+ },
1413
+ {
1414
+ "epoch": 0.8059943296881329,
1415
+ "grad_norm": 3.666368246078491,
1416
+ "learning_rate": 7.314702308626975e-05,
1417
+ "loss": 0.659,
1418
+ "step": 1990
1419
+ },
1420
+ {
1421
+ "epoch": 0.8100445524503848,
1422
+ "grad_norm": 4.111802577972412,
1423
+ "learning_rate": 7.301201566086136e-05,
1424
+ "loss": 0.7143,
1425
+ "step": 2000
1426
+ },
1427
+ {
1428
+ "epoch": 0.8140947752126367,
1429
+ "grad_norm": 5.371121883392334,
1430
+ "learning_rate": 7.287700823545294e-05,
1431
+ "loss": 0.6453,
1432
+ "step": 2010
1433
+ },
1434
+ {
1435
+ "epoch": 0.8181449979748886,
1436
+ "grad_norm": 8.05588436126709,
1437
+ "learning_rate": 7.274200081004455e-05,
1438
+ "loss": 0.6846,
1439
+ "step": 2020
1440
+ },
1441
+ {
1442
+ "epoch": 0.8221952207371406,
1443
+ "grad_norm": 1.3259375095367432,
1444
+ "learning_rate": 7.260699338463616e-05,
1445
+ "loss": 0.5983,
1446
+ "step": 2030
1447
+ },
1448
+ {
1449
+ "epoch": 0.8262454434993924,
1450
+ "grad_norm": 1.8734527826309204,
1451
+ "learning_rate": 7.247198595922776e-05,
1452
+ "loss": 0.6277,
1453
+ "step": 2040
1454
+ },
1455
+ {
1456
+ "epoch": 0.8302956662616444,
1457
+ "grad_norm": 6.942389965057373,
1458
+ "learning_rate": 7.233697853381937e-05,
1459
+ "loss": 0.7137,
1460
+ "step": 2050
1461
+ },
1462
+ {
1463
+ "epoch": 0.8343458890238963,
1464
+ "grad_norm": 5.275886535644531,
1465
+ "learning_rate": 7.220197110841097e-05,
1466
+ "loss": 0.6507,
1467
+ "step": 2060
1468
+ },
1469
+ {
1470
+ "epoch": 0.8383961117861483,
1471
+ "grad_norm": 5.971946716308594,
1472
+ "learning_rate": 7.206696368300256e-05,
1473
+ "loss": 0.6335,
1474
+ "step": 2070
1475
+ },
1476
+ {
1477
+ "epoch": 0.8424463345484001,
1478
+ "grad_norm": 4.011186599731445,
1479
+ "learning_rate": 7.193195625759417e-05,
1480
+ "loss": 0.7592,
1481
+ "step": 2080
1482
+ },
1483
+ {
1484
+ "epoch": 0.8464965573106521,
1485
+ "grad_norm": 4.385476589202881,
1486
+ "learning_rate": 7.179694883218577e-05,
1487
+ "loss": 0.6859,
1488
+ "step": 2090
1489
+ },
1490
+ {
1491
+ "epoch": 0.850546780072904,
1492
+ "grad_norm": 5.010826110839844,
1493
+ "learning_rate": 7.166194140677738e-05,
1494
+ "loss": 0.6739,
1495
+ "step": 2100
1496
+ },
1497
+ {
1498
+ "epoch": 0.8545970028351559,
1499
+ "grad_norm": 1.4120031595230103,
1500
+ "learning_rate": 7.152693398136898e-05,
1501
+ "loss": 0.6207,
1502
+ "step": 2110
1503
+ },
1504
+ {
1505
+ "epoch": 0.8586472255974078,
1506
+ "grad_norm": 3.627908706665039,
1507
+ "learning_rate": 7.139192655596059e-05,
1508
+ "loss": 0.6847,
1509
+ "step": 2120
1510
+ },
1511
+ {
1512
+ "epoch": 0.8626974483596598,
1513
+ "grad_norm": 7.15459680557251,
1514
+ "learning_rate": 7.125691913055218e-05,
1515
+ "loss": 0.655,
1516
+ "step": 2130
1517
+ },
1518
+ {
1519
+ "epoch": 0.8667476711219118,
1520
+ "grad_norm": 4.2000250816345215,
1521
+ "learning_rate": 7.112191170514378e-05,
1522
+ "loss": 0.6123,
1523
+ "step": 2140
1524
+ },
1525
+ {
1526
+ "epoch": 0.8707978938841636,
1527
+ "grad_norm": 5.208048343658447,
1528
+ "learning_rate": 7.098690427973539e-05,
1529
+ "loss": 0.6525,
1530
+ "step": 2150
1531
+ },
1532
+ {
1533
+ "epoch": 0.8748481166464156,
1534
+ "grad_norm": 3.048551082611084,
1535
+ "learning_rate": 7.0851896854327e-05,
1536
+ "loss": 0.6834,
1537
+ "step": 2160
1538
+ },
1539
+ {
1540
+ "epoch": 0.8788983394086675,
1541
+ "grad_norm": 7.1293768882751465,
1542
+ "learning_rate": 7.07168894289186e-05,
1543
+ "loss": 0.6337,
1544
+ "step": 2170
1545
+ },
1546
+ {
1547
+ "epoch": 0.8829485621709194,
1548
+ "grad_norm": 1.4484611749649048,
1549
+ "learning_rate": 7.058188200351019e-05,
1550
+ "loss": 0.6295,
1551
+ "step": 2180
1552
+ },
1553
+ {
1554
+ "epoch": 0.8869987849331713,
1555
+ "grad_norm": 4.266237258911133,
1556
+ "learning_rate": 7.044687457810179e-05,
1557
+ "loss": 0.7005,
1558
+ "step": 2190
1559
+ },
1560
+ {
1561
+ "epoch": 0.8910490076954233,
1562
+ "grad_norm": 4.0993852615356445,
1563
+ "learning_rate": 7.03118671526934e-05,
1564
+ "loss": 0.656,
1565
+ "step": 2200
1566
+ },
1567
+ {
1568
+ "epoch": 0.8950992304576751,
1569
+ "grad_norm": 3.7144315242767334,
1570
+ "learning_rate": 7.017685972728501e-05,
1571
+ "loss": 0.6571,
1572
+ "step": 2210
1573
+ },
1574
+ {
1575
+ "epoch": 0.8991494532199271,
1576
+ "grad_norm": 7.986518859863281,
1577
+ "learning_rate": 7.00418523018766e-05,
1578
+ "loss": 0.7044,
1579
+ "step": 2220
1580
+ },
1581
+ {
1582
+ "epoch": 0.903199675982179,
1583
+ "grad_norm": 2.8526182174682617,
1584
+ "learning_rate": 6.990684487646822e-05,
1585
+ "loss": 0.6409,
1586
+ "step": 2230
1587
+ },
1588
+ {
1589
+ "epoch": 0.907249898744431,
1590
+ "grad_norm": 5.810249328613281,
1591
+ "learning_rate": 6.977183745105981e-05,
1592
+ "loss": 0.6444,
1593
+ "step": 2240
1594
+ },
1595
+ {
1596
+ "epoch": 0.9113001215066828,
1597
+ "grad_norm": 4.703845977783203,
1598
+ "learning_rate": 6.963683002565141e-05,
1599
+ "loss": 0.632,
1600
+ "step": 2250
1601
+ },
1602
+ {
1603
+ "epoch": 0.9153503442689348,
1604
+ "grad_norm": 4.955256938934326,
1605
+ "learning_rate": 6.950182260024302e-05,
1606
+ "loss": 0.6859,
1607
+ "step": 2260
1608
+ },
1609
+ {
1610
+ "epoch": 0.9194005670311867,
1611
+ "grad_norm": 3.5239837169647217,
1612
+ "learning_rate": 6.936681517483462e-05,
1613
+ "loss": 0.6738,
1614
+ "step": 2270
1615
+ },
1616
+ {
1617
+ "epoch": 0.9234507897934386,
1618
+ "grad_norm": 4.58236837387085,
1619
+ "learning_rate": 6.923180774942623e-05,
1620
+ "loss": 0.7172,
1621
+ "step": 2280
1622
+ },
1623
+ {
1624
+ "epoch": 0.9275010125556906,
1625
+ "grad_norm": 1.3130325078964233,
1626
+ "learning_rate": 6.909680032401784e-05,
1627
+ "loss": 0.7214,
1628
+ "step": 2290
1629
+ },
1630
+ {
1631
+ "epoch": 0.9315512353179425,
1632
+ "grad_norm": 4.8997273445129395,
1633
+ "learning_rate": 6.896179289860942e-05,
1634
+ "loss": 0.6714,
1635
+ "step": 2300
1636
+ },
1637
+ {
1638
+ "epoch": 0.9356014580801945,
1639
+ "grad_norm": 4.900916576385498,
1640
+ "learning_rate": 6.882678547320103e-05,
1641
+ "loss": 0.6748,
1642
+ "step": 2310
1643
+ },
1644
+ {
1645
+ "epoch": 0.9396516808424463,
1646
+ "grad_norm": 4.187465667724609,
1647
+ "learning_rate": 6.869177804779263e-05,
1648
+ "loss": 0.6894,
1649
+ "step": 2320
1650
+ },
1651
+ {
1652
+ "epoch": 0.9437019036046983,
1653
+ "grad_norm": 5.871070861816406,
1654
+ "learning_rate": 6.855677062238424e-05,
1655
+ "loss": 0.6843,
1656
+ "step": 2330
1657
+ },
1658
+ {
1659
+ "epoch": 0.9477521263669502,
1660
+ "grad_norm": 4.267555236816406,
1661
+ "learning_rate": 6.842176319697585e-05,
1662
+ "loss": 0.688,
1663
+ "step": 2340
1664
+ },
1665
+ {
1666
+ "epoch": 0.9518023491292021,
1667
+ "grad_norm": 4.01849889755249,
1668
+ "learning_rate": 6.828675577156743e-05,
1669
+ "loss": 0.7011,
1670
+ "step": 2350
1671
+ },
1672
+ {
1673
+ "epoch": 0.955852571891454,
1674
+ "grad_norm": 3.507786512374878,
1675
+ "learning_rate": 6.815174834615904e-05,
1676
+ "loss": 0.6639,
1677
+ "step": 2360
1678
+ },
1679
+ {
1680
+ "epoch": 0.959902794653706,
1681
+ "grad_norm": 1.8241889476776123,
1682
+ "learning_rate": 6.801674092075065e-05,
1683
+ "loss": 0.6805,
1684
+ "step": 2370
1685
+ },
1686
+ {
1687
+ "epoch": 0.9639530174159578,
1688
+ "grad_norm": 1.9074022769927979,
1689
+ "learning_rate": 6.788173349534225e-05,
1690
+ "loss": 0.663,
1691
+ "step": 2380
1692
+ },
1693
+ {
1694
+ "epoch": 0.9680032401782098,
1695
+ "grad_norm": 3.4003665447235107,
1696
+ "learning_rate": 6.774672606993386e-05,
1697
+ "loss": 0.6879,
1698
+ "step": 2390
1699
+ },
1700
+ {
1701
+ "epoch": 0.9720534629404617,
1702
+ "grad_norm": 1.4989181756973267,
1703
+ "learning_rate": 6.761171864452545e-05,
1704
+ "loss": 0.7023,
1705
+ "step": 2400
1706
+ },
1707
+ {
1708
+ "epoch": 0.9761036857027137,
1709
+ "grad_norm": 2.1352789402008057,
1710
+ "learning_rate": 6.747671121911705e-05,
1711
+ "loss": 0.6844,
1712
+ "step": 2410
1713
+ },
1714
+ {
1715
+ "epoch": 0.9801539084649655,
1716
+ "grad_norm": 10.008374214172363,
1717
+ "learning_rate": 6.734170379370866e-05,
1718
+ "loss": 0.6487,
1719
+ "step": 2420
1720
+ },
1721
+ {
1722
+ "epoch": 0.9842041312272175,
1723
+ "grad_norm": 2.7483386993408203,
1724
+ "learning_rate": 6.720669636830026e-05,
1725
+ "loss": 0.693,
1726
+ "step": 2430
1727
+ },
1728
+ {
1729
+ "epoch": 0.9882543539894694,
1730
+ "grad_norm": 11.50084400177002,
1731
+ "learning_rate": 6.707168894289187e-05,
1732
+ "loss": 0.6443,
1733
+ "step": 2440
1734
+ },
1735
+ {
1736
+ "epoch": 0.9923045767517213,
1737
+ "grad_norm": 1.6837221384048462,
1738
+ "learning_rate": 6.693668151748346e-05,
1739
+ "loss": 0.6805,
1740
+ "step": 2450
1741
+ },
1742
+ {
1743
+ "epoch": 0.9963547995139733,
1744
+ "grad_norm": 1.9835008382797241,
1745
+ "learning_rate": 6.680167409207507e-05,
1746
+ "loss": 0.6773,
1747
+ "step": 2460
1748
+ },
1749
+ {
1750
+ "epoch": 1.0,
1751
+ "eval_acc_0.5": 0.5288251791835463,
1752
+ "eval_acc_1.0": 0.8367092552196946,
1753
+ "eval_loss": 0.7152395248413086,
1754
+ "eval_mae": 0.5716977715492249,
1755
+ "eval_pcc": 0.3787382245063782,
1756
+ "eval_qwk": 0.2012183219341548,
1757
+ "eval_rmse": 0.7291858792304993,
1758
+ "eval_runtime": 18.4684,
1759
+ "eval_samples_per_second": 173.756,
1760
+ "eval_src": 0.36552091512825263,
1761
+ "eval_steps_per_second": 10.883,
1762
+ "step": 2469
1763
+ }
1764
+ ],
1765
+ "logging_steps": 10,
1766
+ "max_steps": 7407,
1767
+ "num_input_tokens_seen": 0,
1768
+ "num_train_epochs": 3,
1769
+ "save_steps": 500,
1770
+ "stateful_callbacks": {
1771
+ "TrainerControl": {
1772
+ "args": {
1773
+ "should_epoch_stop": false,
1774
+ "should_evaluate": false,
1775
+ "should_log": false,
1776
+ "should_save": true,
1777
+ "should_training_stop": false
1778
+ },
1779
+ "attributes": {}
1780
+ }
1781
+ },
1782
+ "total_flos": 0.0,
1783
+ "train_batch_size": 16,
1784
+ "trial_name": null,
1785
+ "trial_params": null
1786
+ }
last-checkpoint/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c10c1fc8c0b9dcd9239f4e440fbdb09666dd23e353de595abf6545a9b59d92d
3
+ size 5777