diff --git "a/checkpoint-832/trainer_state.json" "b/checkpoint-832/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-832/trainer_state.json" @@ -0,0 +1,5858 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9963985594237696, + "eval_steps": 500, + "global_step": 832, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0024009603841536613, + "grad_norm": 3.307639305281489, + "learning_rate": 0.0, + "loss": 1.027, + "step": 1 + }, + { + "epoch": 0.004801920768307323, + "grad_norm": 3.4815729475698585, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.0887, + "step": 2 + }, + { + "epoch": 0.007202881152460984, + "grad_norm": 3.1919511484422025, + "learning_rate": 5.000000000000001e-07, + "loss": 0.9992, + "step": 3 + }, + { + "epoch": 0.009603841536614645, + "grad_norm": 3.192234811391807, + "learning_rate": 7.5e-07, + "loss": 1.0676, + "step": 4 + }, + { + "epoch": 0.012004801920768308, + "grad_norm": 3.147628945570283, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.0932, + "step": 5 + }, + { + "epoch": 0.014405762304921969, + "grad_norm": 2.9173656342980085, + "learning_rate": 1.25e-06, + "loss": 1.0108, + "step": 6 + }, + { + "epoch": 0.01680672268907563, + "grad_norm": 2.965584304042035, + "learning_rate": 1.5e-06, + "loss": 1.0785, + "step": 7 + }, + { + "epoch": 0.01920768307322929, + "grad_norm": 2.6367740096927643, + "learning_rate": 1.75e-06, + "loss": 1.0562, + "step": 8 + }, + { + "epoch": 0.021608643457382955, + "grad_norm": 2.468163591370129, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.0243, + "step": 9 + }, + { + "epoch": 0.024009603841536616, + "grad_norm": 2.372272215735556, + "learning_rate": 2.25e-06, + "loss": 1.0174, + "step": 10 + }, + { + "epoch": 0.026410564225690276, + "grad_norm": 2.4133969432396736, + "learning_rate": 2.5e-06, + "loss": 0.9751, + "step": 11 + }, + { + "epoch": 0.028811524609843937, + "grad_norm": 1.9294470459831854, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.9501, + "step": 12 + }, + { + "epoch": 0.031212484993997598, + "grad_norm": 1.6356435792199144, + "learning_rate": 3e-06, + "loss": 0.8964, + "step": 13 + }, + { + "epoch": 0.03361344537815126, + "grad_norm": 1.4259278773927042, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.9262, + "step": 14 + }, + { + "epoch": 0.03601440576230492, + "grad_norm": 1.4029510526993147, + "learning_rate": 3.5e-06, + "loss": 0.8773, + "step": 15 + }, + { + "epoch": 0.03841536614645858, + "grad_norm": 1.3072701950591272, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.8863, + "step": 16 + }, + { + "epoch": 0.04081632653061224, + "grad_norm": 1.3415661249879083, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8391, + "step": 17 + }, + { + "epoch": 0.04321728691476591, + "grad_norm": 1.2551228414808617, + "learning_rate": 4.25e-06, + "loss": 0.8391, + "step": 18 + }, + { + "epoch": 0.04561824729891957, + "grad_norm": 1.4822117298636246, + "learning_rate": 4.5e-06, + "loss": 0.9249, + "step": 19 + }, + { + "epoch": 0.04801920768307323, + "grad_norm": 2.293387776769893, + "learning_rate": 4.75e-06, + "loss": 0.8683, + "step": 20 + }, + { + "epoch": 0.05042016806722689, + "grad_norm": 1.276527897418368, + "learning_rate": 5e-06, + "loss": 0.8863, + "step": 21 + }, + { + "epoch": 0.05282112845138055, + "grad_norm": 1.205563120747159, + "learning_rate": 5.2500000000000006e-06, + "loss": 0.8641, + "step": 22 + }, + { + "epoch": 0.055222088835534214, + "grad_norm": 1.2341891463491723, + "learning_rate": 5.500000000000001e-06, + "loss": 0.8741, + "step": 23 + }, + { + "epoch": 0.057623049219687875, + "grad_norm": 1.194537311448108, + "learning_rate": 5.75e-06, + "loss": 0.806, + "step": 24 + }, + { + "epoch": 0.060024009603841535, + "grad_norm": 1.2485956345196676, + "learning_rate": 6e-06, + "loss": 0.8281, + "step": 25 + }, + { + "epoch": 0.062424969987995196, + "grad_norm": 1.1779315877632301, + "learning_rate": 6.25e-06, + "loss": 0.8552, + "step": 26 + }, + { + "epoch": 0.06482593037214886, + "grad_norm": 1.4180731528009602, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.8506, + "step": 27 + }, + { + "epoch": 0.06722689075630252, + "grad_norm": 3.041894752350215, + "learning_rate": 6.750000000000001e-06, + "loss": 0.8885, + "step": 28 + }, + { + "epoch": 0.06962785114045618, + "grad_norm": 1.1860989270253326, + "learning_rate": 7e-06, + "loss": 0.8331, + "step": 29 + }, + { + "epoch": 0.07202881152460984, + "grad_norm": 1.206902242271321, + "learning_rate": 7.25e-06, + "loss": 0.8206, + "step": 30 + }, + { + "epoch": 0.0744297719087635, + "grad_norm": 1.1671538322004624, + "learning_rate": 7.500000000000001e-06, + "loss": 0.8167, + "step": 31 + }, + { + "epoch": 0.07683073229291716, + "grad_norm": 1.0422949222046782, + "learning_rate": 7.75e-06, + "loss": 0.8007, + "step": 32 + }, + { + "epoch": 0.07923169267707082, + "grad_norm": 1.8592446673491745, + "learning_rate": 8.000000000000001e-06, + "loss": 0.8507, + "step": 33 + }, + { + "epoch": 0.08163265306122448, + "grad_norm": 1.6612165554220881, + "learning_rate": 8.25e-06, + "loss": 0.8508, + "step": 34 + }, + { + "epoch": 0.08403361344537816, + "grad_norm": 1.0713315873933724, + "learning_rate": 8.5e-06, + "loss": 0.8605, + "step": 35 + }, + { + "epoch": 0.08643457382953182, + "grad_norm": 1.1490421409260716, + "learning_rate": 8.750000000000001e-06, + "loss": 0.8529, + "step": 36 + }, + { + "epoch": 0.08883553421368548, + "grad_norm": 1.1077552226131402, + "learning_rate": 9e-06, + "loss": 0.8251, + "step": 37 + }, + { + "epoch": 0.09123649459783914, + "grad_norm": 0.9873238748489626, + "learning_rate": 9.250000000000001e-06, + "loss": 0.7978, + "step": 38 + }, + { + "epoch": 0.0936374549819928, + "grad_norm": 1.3452579503150184, + "learning_rate": 9.5e-06, + "loss": 0.8637, + "step": 39 + }, + { + "epoch": 0.09603841536614646, + "grad_norm": 1.4631068406573169, + "learning_rate": 9.75e-06, + "loss": 0.8023, + "step": 40 + }, + { + "epoch": 0.09843937575030012, + "grad_norm": 1.265806360917147, + "learning_rate": 1e-05, + "loss": 0.8345, + "step": 41 + }, + { + "epoch": 0.10084033613445378, + "grad_norm": 1.1943330814308766, + "learning_rate": 9.999960664124435e-06, + "loss": 0.8704, + "step": 42 + }, + { + "epoch": 0.10324129651860744, + "grad_norm": 1.061823017754904, + "learning_rate": 9.999842657116667e-06, + "loss": 0.8463, + "step": 43 + }, + { + "epoch": 0.1056422569027611, + "grad_norm": 1.0296226499358687, + "learning_rate": 9.999645980833454e-06, + "loss": 0.825, + "step": 44 + }, + { + "epoch": 0.10804321728691477, + "grad_norm": 1.046275065503394, + "learning_rate": 9.999370638369377e-06, + "loss": 0.8341, + "step": 45 + }, + { + "epoch": 0.11044417767106843, + "grad_norm": 1.0958287099043753, + "learning_rate": 9.999016634056764e-06, + "loss": 0.7894, + "step": 46 + }, + { + "epoch": 0.11284513805522209, + "grad_norm": 1.030922973342192, + "learning_rate": 9.998583973465647e-06, + "loss": 0.787, + "step": 47 + }, + { + "epoch": 0.11524609843937575, + "grad_norm": 1.2421669534880109, + "learning_rate": 9.998072663403657e-06, + "loss": 0.7978, + "step": 48 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 1.3970247274710101, + "learning_rate": 9.997482711915926e-06, + "loss": 0.7801, + "step": 49 + }, + { + "epoch": 0.12004801920768307, + "grad_norm": 0.9723742502152596, + "learning_rate": 9.99681412828496e-06, + "loss": 0.7895, + "step": 50 + }, + { + "epoch": 0.12244897959183673, + "grad_norm": 1.2807089303778385, + "learning_rate": 9.996066923030484e-06, + "loss": 0.8543, + "step": 51 + }, + { + "epoch": 0.12484993997599039, + "grad_norm": 1.27637006864039, + "learning_rate": 9.99524110790929e-06, + "loss": 0.8535, + "step": 52 + }, + { + "epoch": 0.12725090036014405, + "grad_norm": 0.9583150465246295, + "learning_rate": 9.994336695915041e-06, + "loss": 0.7819, + "step": 53 + }, + { + "epoch": 0.12965186074429771, + "grad_norm": 1.066121996350333, + "learning_rate": 9.993353701278072e-06, + "loss": 0.7962, + "step": 54 + }, + { + "epoch": 0.13205282112845138, + "grad_norm": 1.0321017551908462, + "learning_rate": 9.992292139465166e-06, + "loss": 0.794, + "step": 55 + }, + { + "epoch": 0.13445378151260504, + "grad_norm": 1.1056060968463626, + "learning_rate": 9.991152027179307e-06, + "loss": 0.8431, + "step": 56 + }, + { + "epoch": 0.1368547418967587, + "grad_norm": 1.6782765780764348, + "learning_rate": 9.989933382359423e-06, + "loss": 0.8303, + "step": 57 + }, + { + "epoch": 0.13925570228091236, + "grad_norm": 1.1967331617073076, + "learning_rate": 9.988636224180097e-06, + "loss": 0.7792, + "step": 58 + }, + { + "epoch": 0.14165666266506602, + "grad_norm": 1.8049549266124272, + "learning_rate": 9.987260573051268e-06, + "loss": 0.7988, + "step": 59 + }, + { + "epoch": 0.14405762304921968, + "grad_norm": 1.046695584455822, + "learning_rate": 9.985806450617916e-06, + "loss": 0.8037, + "step": 60 + }, + { + "epoch": 0.14645858343337334, + "grad_norm": 1.2121335142230327, + "learning_rate": 9.984273879759713e-06, + "loss": 0.8515, + "step": 61 + }, + { + "epoch": 0.148859543817527, + "grad_norm": 1.3076719515024817, + "learning_rate": 9.982662884590662e-06, + "loss": 0.8178, + "step": 62 + }, + { + "epoch": 0.15126050420168066, + "grad_norm": 1.2564921607093686, + "learning_rate": 9.980973490458728e-06, + "loss": 0.8399, + "step": 63 + }, + { + "epoch": 0.15366146458583432, + "grad_norm": 1.0641383621124019, + "learning_rate": 9.97920572394543e-06, + "loss": 0.7677, + "step": 64 + }, + { + "epoch": 0.15606242496998798, + "grad_norm": 1.1281926663515902, + "learning_rate": 9.977359612865424e-06, + "loss": 0.7974, + "step": 65 + }, + { + "epoch": 0.15846338535414164, + "grad_norm": 1.0094739962757735, + "learning_rate": 9.975435186266069e-06, + "loss": 0.7676, + "step": 66 + }, + { + "epoch": 0.1608643457382953, + "grad_norm": 1.3602443348603426, + "learning_rate": 9.973432474426968e-06, + "loss": 0.8171, + "step": 67 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 0.9792328660506742, + "learning_rate": 9.971351508859488e-06, + "loss": 0.7561, + "step": 68 + }, + { + "epoch": 0.16566626650660263, + "grad_norm": 1.397240469672876, + "learning_rate": 9.969192322306271e-06, + "loss": 0.8365, + "step": 69 + }, + { + "epoch": 0.16806722689075632, + "grad_norm": 1.3993747618987225, + "learning_rate": 9.966954948740717e-06, + "loss": 0.7281, + "step": 70 + }, + { + "epoch": 0.17046818727490998, + "grad_norm": 1.0666295786391167, + "learning_rate": 9.964639423366442e-06, + "loss": 0.8563, + "step": 71 + }, + { + "epoch": 0.17286914765906364, + "grad_norm": 1.6098094871731998, + "learning_rate": 9.962245782616734e-06, + "loss": 0.8027, + "step": 72 + }, + { + "epoch": 0.1752701080432173, + "grad_norm": 1.0029300637530163, + "learning_rate": 9.959774064153977e-06, + "loss": 0.7768, + "step": 73 + }, + { + "epoch": 0.17767106842737096, + "grad_norm": 1.1557098402816255, + "learning_rate": 9.957224306869053e-06, + "loss": 0.7915, + "step": 74 + }, + { + "epoch": 0.18007202881152462, + "grad_norm": 1.1411436589531005, + "learning_rate": 9.954596550880735e-06, + "loss": 0.8207, + "step": 75 + }, + { + "epoch": 0.18247298919567828, + "grad_norm": 0.9898519262158086, + "learning_rate": 9.951890837535058e-06, + "loss": 0.8303, + "step": 76 + }, + { + "epoch": 0.18487394957983194, + "grad_norm": 1.015734040263864, + "learning_rate": 9.949107209404664e-06, + "loss": 0.7928, + "step": 77 + }, + { + "epoch": 0.1872749099639856, + "grad_norm": 1.0314783989677847, + "learning_rate": 9.946245710288132e-06, + "loss": 0.7407, + "step": 78 + }, + { + "epoch": 0.18967587034813926, + "grad_norm": 2.1401440917485943, + "learning_rate": 9.94330638520929e-06, + "loss": 0.7775, + "step": 79 + }, + { + "epoch": 0.19207683073229292, + "grad_norm": 0.9890405160925984, + "learning_rate": 9.940289280416509e-06, + "loss": 0.7566, + "step": 80 + }, + { + "epoch": 0.19447779111644659, + "grad_norm": 0.9739543182418596, + "learning_rate": 9.937194443381972e-06, + "loss": 0.8159, + "step": 81 + }, + { + "epoch": 0.19687875150060025, + "grad_norm": 1.1138048276749253, + "learning_rate": 9.934021922800931e-06, + "loss": 0.7939, + "step": 82 + }, + { + "epoch": 0.1992797118847539, + "grad_norm": 1.0939436400008165, + "learning_rate": 9.930771768590934e-06, + "loss": 0.8144, + "step": 83 + }, + { + "epoch": 0.20168067226890757, + "grad_norm": 1.1293648489823769, + "learning_rate": 9.927444031891044e-06, + "loss": 0.7755, + "step": 84 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 1.0430331448777437, + "learning_rate": 9.924038765061042e-06, + "loss": 0.8215, + "step": 85 + }, + { + "epoch": 0.2064825930372149, + "grad_norm": 1.9537322284504008, + "learning_rate": 9.92055602168058e-06, + "loss": 0.7563, + "step": 86 + }, + { + "epoch": 0.20888355342136855, + "grad_norm": 1.3079086606300951, + "learning_rate": 9.916995856548371e-06, + "loss": 0.8157, + "step": 87 + }, + { + "epoch": 0.2112845138055222, + "grad_norm": 0.9221688539776796, + "learning_rate": 9.913358325681292e-06, + "loss": 0.782, + "step": 88 + }, + { + "epoch": 0.21368547418967587, + "grad_norm": 1.1087405466326585, + "learning_rate": 9.909643486313533e-06, + "loss": 0.8061, + "step": 89 + }, + { + "epoch": 0.21608643457382953, + "grad_norm": 1.2632508204228434, + "learning_rate": 9.905851396895679e-06, + "loss": 0.7788, + "step": 90 + }, + { + "epoch": 0.2184873949579832, + "grad_norm": 1.2841221080352947, + "learning_rate": 9.901982117093786e-06, + "loss": 0.7707, + "step": 91 + }, + { + "epoch": 0.22088835534213686, + "grad_norm": 0.9416571120286465, + "learning_rate": 9.898035707788462e-06, + "loss": 0.7386, + "step": 92 + }, + { + "epoch": 0.22328931572629052, + "grad_norm": 1.0394686320396263, + "learning_rate": 9.894012231073895e-06, + "loss": 0.7729, + "step": 93 + }, + { + "epoch": 0.22569027611044418, + "grad_norm": 1.1642999740100775, + "learning_rate": 9.889911750256873e-06, + "loss": 0.7667, + "step": 94 + }, + { + "epoch": 0.22809123649459784, + "grad_norm": 1.0191501234681113, + "learning_rate": 9.885734329855798e-06, + "loss": 0.7415, + "step": 95 + }, + { + "epoch": 0.2304921968787515, + "grad_norm": 1.1396215768269955, + "learning_rate": 9.881480035599667e-06, + "loss": 0.846, + "step": 96 + }, + { + "epoch": 0.23289315726290516, + "grad_norm": 1.4770430873772629, + "learning_rate": 9.877148934427037e-06, + "loss": 0.8105, + "step": 97 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 1.1245875213463206, + "learning_rate": 9.872741094484965e-06, + "loss": 0.7857, + "step": 98 + }, + { + "epoch": 0.23769507803121248, + "grad_norm": 0.9225248681792048, + "learning_rate": 9.868256585127956e-06, + "loss": 0.7292, + "step": 99 + }, + { + "epoch": 0.24009603841536614, + "grad_norm": 1.0574440145567323, + "learning_rate": 9.863695476916846e-06, + "loss": 0.8077, + "step": 100 + }, + { + "epoch": 0.2424969987995198, + "grad_norm": 1.1365137152954194, + "learning_rate": 9.859057841617709e-06, + "loss": 0.8066, + "step": 101 + }, + { + "epoch": 0.24489795918367346, + "grad_norm": 1.1942946403505255, + "learning_rate": 9.854343752200725e-06, + "loss": 0.8139, + "step": 102 + }, + { + "epoch": 0.24729891956782712, + "grad_norm": 1.0543479220950105, + "learning_rate": 9.849553282839025e-06, + "loss": 0.7701, + "step": 103 + }, + { + "epoch": 0.24969987995198079, + "grad_norm": 0.926743129553519, + "learning_rate": 9.844686508907538e-06, + "loss": 0.7927, + "step": 104 + }, + { + "epoch": 0.25210084033613445, + "grad_norm": 1.011488885523952, + "learning_rate": 9.839743506981783e-06, + "loss": 0.7914, + "step": 105 + }, + { + "epoch": 0.2545018007202881, + "grad_norm": 1.0064857869367891, + "learning_rate": 9.834724354836684e-06, + "loss": 0.7701, + "step": 106 + }, + { + "epoch": 0.25690276110444177, + "grad_norm": 1.2884610797254652, + "learning_rate": 9.829629131445342e-06, + "loss": 0.7797, + "step": 107 + }, + { + "epoch": 0.25930372148859543, + "grad_norm": 1.022923038777103, + "learning_rate": 9.824457916977785e-06, + "loss": 0.7563, + "step": 108 + }, + { + "epoch": 0.2617046818727491, + "grad_norm": 1.3622638931252318, + "learning_rate": 9.819210792799711e-06, + "loss": 0.791, + "step": 109 + }, + { + "epoch": 0.26410564225690275, + "grad_norm": 1.6261815582055081, + "learning_rate": 9.81388784147121e-06, + "loss": 0.7791, + "step": 110 + }, + { + "epoch": 0.2665066026410564, + "grad_norm": 1.3875850558476444, + "learning_rate": 9.808489146745466e-06, + "loss": 0.8065, + "step": 111 + }, + { + "epoch": 0.2689075630252101, + "grad_norm": 1.0566640937075868, + "learning_rate": 9.803014793567429e-06, + "loss": 0.7662, + "step": 112 + }, + { + "epoch": 0.27130852340936373, + "grad_norm": 0.9297104835810871, + "learning_rate": 9.797464868072489e-06, + "loss": 0.7338, + "step": 113 + }, + { + "epoch": 0.2737094837935174, + "grad_norm": 0.9992084910627275, + "learning_rate": 9.791839457585118e-06, + "loss": 0.7773, + "step": 114 + }, + { + "epoch": 0.27611044417767105, + "grad_norm": 1.1581522702903588, + "learning_rate": 9.786138650617494e-06, + "loss": 0.7929, + "step": 115 + }, + { + "epoch": 0.2785114045618247, + "grad_norm": 1.259081791921377, + "learning_rate": 9.780362536868113e-06, + "loss": 0.7279, + "step": 116 + }, + { + "epoch": 0.2809123649459784, + "grad_norm": 1.1510251918441672, + "learning_rate": 9.774511207220369e-06, + "loss": 0.7412, + "step": 117 + }, + { + "epoch": 0.28331332533013204, + "grad_norm": 0.9943579023159135, + "learning_rate": 9.768584753741134e-06, + "loss": 0.7927, + "step": 118 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 1.061487526635317, + "learning_rate": 9.762583269679304e-06, + "loss": 0.8363, + "step": 119 + }, + { + "epoch": 0.28811524609843936, + "grad_norm": 1.1215382733800197, + "learning_rate": 9.756506849464327e-06, + "loss": 0.8032, + "step": 120 + }, + { + "epoch": 0.290516206482593, + "grad_norm": 1.0985810622772545, + "learning_rate": 9.750355588704728e-06, + "loss": 0.8062, + "step": 121 + }, + { + "epoch": 0.2929171668667467, + "grad_norm": 1.0397097160893556, + "learning_rate": 9.744129584186599e-06, + "loss": 0.7867, + "step": 122 + }, + { + "epoch": 0.29531812725090034, + "grad_norm": 1.0985983029789874, + "learning_rate": 9.737828933872076e-06, + "loss": 0.7645, + "step": 123 + }, + { + "epoch": 0.297719087635054, + "grad_norm": 1.0341542778256945, + "learning_rate": 9.731453736897796e-06, + "loss": 0.7682, + "step": 124 + }, + { + "epoch": 0.30012004801920766, + "grad_norm": 0.9637556163656881, + "learning_rate": 9.725004093573343e-06, + "loss": 0.7819, + "step": 125 + }, + { + "epoch": 0.3025210084033613, + "grad_norm": 0.9476489500974398, + "learning_rate": 9.718480105379663e-06, + "loss": 0.7875, + "step": 126 + }, + { + "epoch": 0.304921968787515, + "grad_norm": 1.3085520795980778, + "learning_rate": 9.711881874967471e-06, + "loss": 0.7908, + "step": 127 + }, + { + "epoch": 0.30732292917166865, + "grad_norm": 0.9414082676216773, + "learning_rate": 9.705209506155635e-06, + "loss": 0.7454, + "step": 128 + }, + { + "epoch": 0.3097238895558223, + "grad_norm": 1.2351287282053665, + "learning_rate": 9.698463103929542e-06, + "loss": 0.7813, + "step": 129 + }, + { + "epoch": 0.31212484993997597, + "grad_norm": 1.021583176125914, + "learning_rate": 9.69164277443945e-06, + "loss": 0.8417, + "step": 130 + }, + { + "epoch": 0.31452581032412963, + "grad_norm": 1.4089615187590279, + "learning_rate": 9.68474862499881e-06, + "loss": 0.7358, + "step": 131 + }, + { + "epoch": 0.3169267707082833, + "grad_norm": 0.9398428308548782, + "learning_rate": 9.677780764082583e-06, + "loss": 0.7637, + "step": 132 + }, + { + "epoch": 0.31932773109243695, + "grad_norm": 1.1136331841332863, + "learning_rate": 9.670739301325534e-06, + "loss": 0.7723, + "step": 133 + }, + { + "epoch": 0.3217286914765906, + "grad_norm": 1.1992806853517075, + "learning_rate": 9.663624347520506e-06, + "loss": 0.7788, + "step": 134 + }, + { + "epoch": 0.3241296518607443, + "grad_norm": 1.311509502714469, + "learning_rate": 9.65643601461667e-06, + "loss": 0.7696, + "step": 135 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 0.9623376564123519, + "learning_rate": 9.649174415717776e-06, + "loss": 0.7414, + "step": 136 + }, + { + "epoch": 0.3289315726290516, + "grad_norm": 1.0486840444117425, + "learning_rate": 9.641839665080363e-06, + "loss": 0.7909, + "step": 137 + }, + { + "epoch": 0.33133253301320525, + "grad_norm": 1.172648282688611, + "learning_rate": 9.634431878111969e-06, + "loss": 0.7684, + "step": 138 + }, + { + "epoch": 0.33373349339735897, + "grad_norm": 1.3883931408734287, + "learning_rate": 9.626951171369306e-06, + "loss": 0.7824, + "step": 139 + }, + { + "epoch": 0.33613445378151263, + "grad_norm": 1.0459492696464887, + "learning_rate": 9.619397662556434e-06, + "loss": 0.7729, + "step": 140 + }, + { + "epoch": 0.3385354141656663, + "grad_norm": 1.090681276032822, + "learning_rate": 9.611771470522908e-06, + "loss": 0.78, + "step": 141 + }, + { + "epoch": 0.34093637454981995, + "grad_norm": 1.077439313381409, + "learning_rate": 9.604072715261902e-06, + "loss": 0.7746, + "step": 142 + }, + { + "epoch": 0.3433373349339736, + "grad_norm": 1.0119948107455536, + "learning_rate": 9.596301517908329e-06, + "loss": 0.7725, + "step": 143 + }, + { + "epoch": 0.3457382953181273, + "grad_norm": 1.0261650566322091, + "learning_rate": 9.588458000736929e-06, + "loss": 0.8126, + "step": 144 + }, + { + "epoch": 0.34813925570228094, + "grad_norm": 3.2915968630848296, + "learning_rate": 9.580542287160348e-06, + "loss": 0.8087, + "step": 145 + }, + { + "epoch": 0.3505402160864346, + "grad_norm": 1.1260052143600965, + "learning_rate": 9.572554501727198e-06, + "loss": 0.7626, + "step": 146 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 1.1992270704321544, + "learning_rate": 9.564494770120089e-06, + "loss": 0.7942, + "step": 147 + }, + { + "epoch": 0.3553421368547419, + "grad_norm": 0.8934279508188496, + "learning_rate": 9.556363219153664e-06, + "loss": 0.7121, + "step": 148 + }, + { + "epoch": 0.3577430972388956, + "grad_norm": 1.627029857609285, + "learning_rate": 9.548159976772593e-06, + "loss": 0.8305, + "step": 149 + }, + { + "epoch": 0.36014405762304924, + "grad_norm": 1.2904112895903999, + "learning_rate": 9.539885172049563e-06, + "loss": 0.7903, + "step": 150 + }, + { + "epoch": 0.3625450180072029, + "grad_norm": 0.9013735933905231, + "learning_rate": 9.531538935183252e-06, + "loss": 0.7532, + "step": 151 + }, + { + "epoch": 0.36494597839135656, + "grad_norm": 1.1957280125341616, + "learning_rate": 9.52312139749627e-06, + "loss": 0.7654, + "step": 152 + }, + { + "epoch": 0.3673469387755102, + "grad_norm": 1.1864890770385386, + "learning_rate": 9.514632691433108e-06, + "loss": 0.7774, + "step": 153 + }, + { + "epoch": 0.3697478991596639, + "grad_norm": 1.2602126687460933, + "learning_rate": 9.506072950558036e-06, + "loss": 0.7308, + "step": 154 + }, + { + "epoch": 0.37214885954381755, + "grad_norm": 0.9838354835983466, + "learning_rate": 9.497442309553017e-06, + "loss": 0.8117, + "step": 155 + }, + { + "epoch": 0.3745498199279712, + "grad_norm": 1.0381143846448244, + "learning_rate": 9.488740904215578e-06, + "loss": 0.7949, + "step": 156 + }, + { + "epoch": 0.37695078031212487, + "grad_norm": 1.0974151220229578, + "learning_rate": 9.47996887145668e-06, + "loss": 0.8096, + "step": 157 + }, + { + "epoch": 0.3793517406962785, + "grad_norm": 1.0644973845655654, + "learning_rate": 9.471126349298557e-06, + "loss": 0.769, + "step": 158 + }, + { + "epoch": 0.3817527010804322, + "grad_norm": 1.0805369220592416, + "learning_rate": 9.46221347687255e-06, + "loss": 0.7528, + "step": 159 + }, + { + "epoch": 0.38415366146458585, + "grad_norm": 1.0244991912422095, + "learning_rate": 9.453230394416914e-06, + "loss": 0.7883, + "step": 160 + }, + { + "epoch": 0.3865546218487395, + "grad_norm": 0.9661371626174572, + "learning_rate": 9.444177243274619e-06, + "loss": 0.784, + "step": 161 + }, + { + "epoch": 0.38895558223289317, + "grad_norm": 1.3148287207686615, + "learning_rate": 9.43505416589111e-06, + "loss": 0.7656, + "step": 162 + }, + { + "epoch": 0.39135654261704683, + "grad_norm": 1.1619055283617015, + "learning_rate": 9.425861305812083e-06, + "loss": 0.7824, + "step": 163 + }, + { + "epoch": 0.3937575030012005, + "grad_norm": 4.651513081191553, + "learning_rate": 9.416598807681221e-06, + "loss": 0.8066, + "step": 164 + }, + { + "epoch": 0.39615846338535415, + "grad_norm": 0.9479957292008633, + "learning_rate": 9.40726681723791e-06, + "loss": 0.7651, + "step": 165 + }, + { + "epoch": 0.3985594237695078, + "grad_norm": 1.1703185690222349, + "learning_rate": 9.397865481314959e-06, + "loss": 0.7656, + "step": 166 + }, + { + "epoch": 0.4009603841536615, + "grad_norm": 1.0368151311067004, + "learning_rate": 9.388394947836278e-06, + "loss": 0.7917, + "step": 167 + }, + { + "epoch": 0.40336134453781514, + "grad_norm": 2.298562774870953, + "learning_rate": 9.37885536581456e-06, + "loss": 0.762, + "step": 168 + }, + { + "epoch": 0.4057623049219688, + "grad_norm": 1.1062217919442925, + "learning_rate": 9.369246885348926e-06, + "loss": 0.7585, + "step": 169 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 1.173156390813293, + "learning_rate": 9.359569657622573e-06, + "loss": 0.7797, + "step": 170 + }, + { + "epoch": 0.4105642256902761, + "grad_norm": 1.109809941134885, + "learning_rate": 9.349823834900396e-06, + "loss": 0.7588, + "step": 171 + }, + { + "epoch": 0.4129651860744298, + "grad_norm": 1.2813177351957437, + "learning_rate": 9.340009570526578e-06, + "loss": 0.7616, + "step": 172 + }, + { + "epoch": 0.41536614645858344, + "grad_norm": 1.0591641661573232, + "learning_rate": 9.330127018922195e-06, + "loss": 0.7564, + "step": 173 + }, + { + "epoch": 0.4177671068427371, + "grad_norm": 2.376914868004014, + "learning_rate": 9.320176335582772e-06, + "loss": 0.7366, + "step": 174 + }, + { + "epoch": 0.42016806722689076, + "grad_norm": 0.9902616640042359, + "learning_rate": 9.310157677075847e-06, + "loss": 0.7429, + "step": 175 + }, + { + "epoch": 0.4225690276110444, + "grad_norm": 1.6847159971307049, + "learning_rate": 9.300071201038503e-06, + "loss": 0.7825, + "step": 176 + }, + { + "epoch": 0.4249699879951981, + "grad_norm": 0.9942879778667437, + "learning_rate": 9.289917066174887e-06, + "loss": 0.7318, + "step": 177 + }, + { + "epoch": 0.42737094837935174, + "grad_norm": 1.1742665576935563, + "learning_rate": 9.27969543225371e-06, + "loss": 0.7623, + "step": 178 + }, + { + "epoch": 0.4297719087635054, + "grad_norm": 1.2676535006378862, + "learning_rate": 9.269406460105742e-06, + "loss": 0.7583, + "step": 179 + }, + { + "epoch": 0.43217286914765907, + "grad_norm": 1.678798458940733, + "learning_rate": 9.259050311621274e-06, + "loss": 0.7881, + "step": 180 + }, + { + "epoch": 0.4345738295318127, + "grad_norm": 1.1441951682044669, + "learning_rate": 9.248627149747573e-06, + "loss": 0.7455, + "step": 181 + }, + { + "epoch": 0.4369747899159664, + "grad_norm": 1.0055990609873005, + "learning_rate": 9.238137138486318e-06, + "loss": 0.7613, + "step": 182 + }, + { + "epoch": 0.43937575030012005, + "grad_norm": 1.0631388873420178, + "learning_rate": 9.227580442891022e-06, + "loss": 0.7528, + "step": 183 + }, + { + "epoch": 0.4417767106842737, + "grad_norm": 1.0811446737171562, + "learning_rate": 9.21695722906443e-06, + "loss": 0.7988, + "step": 184 + }, + { + "epoch": 0.44417767106842737, + "grad_norm": 3.77472894386223, + "learning_rate": 9.206267664155906e-06, + "loss": 0.7669, + "step": 185 + }, + { + "epoch": 0.44657863145258103, + "grad_norm": 2.5440967403332833, + "learning_rate": 9.195511916358813e-06, + "loss": 0.7478, + "step": 186 + }, + { + "epoch": 0.4489795918367347, + "grad_norm": 1.1995260606009623, + "learning_rate": 9.18469015490785e-06, + "loss": 0.7637, + "step": 187 + }, + { + "epoch": 0.45138055222088835, + "grad_norm": 1.087601696720053, + "learning_rate": 9.173802550076402e-06, + "loss": 0.7122, + "step": 188 + }, + { + "epoch": 0.453781512605042, + "grad_norm": 0.859793070085179, + "learning_rate": 9.162849273173857e-06, + "loss": 0.7237, + "step": 189 + }, + { + "epoch": 0.4561824729891957, + "grad_norm": 0.909405499890903, + "learning_rate": 9.151830496542912e-06, + "loss": 0.7413, + "step": 190 + }, + { + "epoch": 0.45858343337334934, + "grad_norm": 1.1698481579798348, + "learning_rate": 9.140746393556853e-06, + "loss": 0.7686, + "step": 191 + }, + { + "epoch": 0.460984393757503, + "grad_norm": 1.4005910755153028, + "learning_rate": 9.129597138616845e-06, + "loss": 0.7301, + "step": 192 + }, + { + "epoch": 0.46338535414165666, + "grad_norm": 6.016086016376545, + "learning_rate": 9.118382907149164e-06, + "loss": 0.7871, + "step": 193 + }, + { + "epoch": 0.4657863145258103, + "grad_norm": 0.9147643453009918, + "learning_rate": 9.107103875602458e-06, + "loss": 0.7753, + "step": 194 + }, + { + "epoch": 0.468187274909964, + "grad_norm": 1.1762754554445651, + "learning_rate": 9.09576022144496e-06, + "loss": 0.7764, + "step": 195 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 1.175295130468933, + "learning_rate": 9.084352123161695e-06, + "loss": 0.7912, + "step": 196 + }, + { + "epoch": 0.4729891956782713, + "grad_norm": 1.0739318228013277, + "learning_rate": 9.07287976025168e-06, + "loss": 0.762, + "step": 197 + }, + { + "epoch": 0.47539015606242496, + "grad_norm": 0.9934440139468771, + "learning_rate": 9.061343313225088e-06, + "loss": 0.8003, + "step": 198 + }, + { + "epoch": 0.4777911164465786, + "grad_norm": 1.0768642208250674, + "learning_rate": 9.04974296360042e-06, + "loss": 0.7659, + "step": 199 + }, + { + "epoch": 0.4801920768307323, + "grad_norm": 1.1273977823886203, + "learning_rate": 9.038078893901634e-06, + "loss": 0.7992, + "step": 200 + }, + { + "epoch": 0.48259303721488594, + "grad_norm": 1.1987531653944965, + "learning_rate": 9.026351287655294e-06, + "loss": 0.7932, + "step": 201 + }, + { + "epoch": 0.4849939975990396, + "grad_norm": 1.4257297426073774, + "learning_rate": 9.014560329387661e-06, + "loss": 0.7372, + "step": 202 + }, + { + "epoch": 0.48739495798319327, + "grad_norm": 0.8935161112029553, + "learning_rate": 9.002706204621802e-06, + "loss": 0.7701, + "step": 203 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 1.1445504107150832, + "learning_rate": 8.99078909987467e-06, + "loss": 0.7937, + "step": 204 + }, + { + "epoch": 0.4921968787515006, + "grad_norm": 0.9975014914511301, + "learning_rate": 8.978809202654161e-06, + "loss": 0.7045, + "step": 205 + }, + { + "epoch": 0.49459783913565425, + "grad_norm": 1.566224538850411, + "learning_rate": 8.966766701456177e-06, + "loss": 0.7665, + "step": 206 + }, + { + "epoch": 0.4969987995198079, + "grad_norm": 1.0336200728044336, + "learning_rate": 8.954661785761648e-06, + "loss": 0.7635, + "step": 207 + }, + { + "epoch": 0.49939975990396157, + "grad_norm": 1.2720894224504704, + "learning_rate": 8.942494646033555e-06, + "loss": 0.7546, + "step": 208 + }, + { + "epoch": 0.5018007202881153, + "grad_norm": 1.1200134220326137, + "learning_rate": 8.930265473713939e-06, + "loss": 0.7826, + "step": 209 + }, + { + "epoch": 0.5042016806722689, + "grad_norm": 1.2191868527466778, + "learning_rate": 8.917974461220877e-06, + "loss": 0.761, + "step": 210 + }, + { + "epoch": 0.5066026410564226, + "grad_norm": 0.9495968196976812, + "learning_rate": 8.905621801945467e-06, + "loss": 0.7817, + "step": 211 + }, + { + "epoch": 0.5090036014405762, + "grad_norm": 1.2513351813283156, + "learning_rate": 8.893207690248776e-06, + "loss": 0.812, + "step": 212 + }, + { + "epoch": 0.5114045618247299, + "grad_norm": 0.9020644463847227, + "learning_rate": 8.880732321458785e-06, + "loss": 0.7583, + "step": 213 + }, + { + "epoch": 0.5138055222088835, + "grad_norm": 0.8962757996224714, + "learning_rate": 8.868195891867315e-06, + "loss": 0.7437, + "step": 214 + }, + { + "epoch": 0.5162064825930373, + "grad_norm": 1.5860563857675605, + "learning_rate": 8.85559859872694e-06, + "loss": 0.7707, + "step": 215 + }, + { + "epoch": 0.5186074429771909, + "grad_norm": 0.9207750696190705, + "learning_rate": 8.84294064024788e-06, + "loss": 0.7405, + "step": 216 + }, + { + "epoch": 0.5210084033613446, + "grad_norm": 1.3824162672219853, + "learning_rate": 8.83022221559489e-06, + "loss": 0.7397, + "step": 217 + }, + { + "epoch": 0.5234093637454982, + "grad_norm": 1.0847854025433705, + "learning_rate": 8.817443524884119e-06, + "loss": 0.7825, + "step": 218 + }, + { + "epoch": 0.5258103241296519, + "grad_norm": 0.9427904526881725, + "learning_rate": 8.804604769179958e-06, + "loss": 0.7744, + "step": 219 + }, + { + "epoch": 0.5282112845138055, + "grad_norm": 1.2102888029571814, + "learning_rate": 8.791706150491887e-06, + "loss": 0.7461, + "step": 220 + }, + { + "epoch": 0.5306122448979592, + "grad_norm": 2.09508764328467, + "learning_rate": 8.778747871771293e-06, + "loss": 0.7135, + "step": 221 + }, + { + "epoch": 0.5330132052821128, + "grad_norm": 0.8498652434113725, + "learning_rate": 8.765730136908266e-06, + "loss": 0.7221, + "step": 222 + }, + { + "epoch": 0.5354141656662665, + "grad_norm": 1.12017910455628, + "learning_rate": 8.752653150728412e-06, + "loss": 0.7779, + "step": 223 + }, + { + "epoch": 0.5378151260504201, + "grad_norm": 1.0655695849783837, + "learning_rate": 8.739517118989606e-06, + "loss": 0.7488, + "step": 224 + }, + { + "epoch": 0.5402160864345739, + "grad_norm": 1.119471558248057, + "learning_rate": 8.726322248378775e-06, + "loss": 0.776, + "step": 225 + }, + { + "epoch": 0.5426170468187275, + "grad_norm": 1.0220100387555293, + "learning_rate": 8.713068746508633e-06, + "loss": 0.7736, + "step": 226 + }, + { + "epoch": 0.5450180072028812, + "grad_norm": 0.8814645039349688, + "learning_rate": 8.69975682191442e-06, + "loss": 0.7324, + "step": 227 + }, + { + "epoch": 0.5474189675870348, + "grad_norm": 0.8916406208469574, + "learning_rate": 8.68638668405062e-06, + "loss": 0.779, + "step": 228 + }, + { + "epoch": 0.5498199279711885, + "grad_norm": 7.661689497312271, + "learning_rate": 8.672958543287666e-06, + "loss": 0.8168, + "step": 229 + }, + { + "epoch": 0.5522208883553421, + "grad_norm": 1.2594345178659945, + "learning_rate": 8.659472610908628e-06, + "loss": 0.7909, + "step": 230 + }, + { + "epoch": 0.5546218487394958, + "grad_norm": 1.8799499195734866, + "learning_rate": 8.645929099105886e-06, + "loss": 0.7636, + "step": 231 + }, + { + "epoch": 0.5570228091236494, + "grad_norm": 2.1509512791119114, + "learning_rate": 8.632328220977801e-06, + "loss": 0.7813, + "step": 232 + }, + { + "epoch": 0.5594237695078031, + "grad_norm": 0.9752706676825262, + "learning_rate": 8.61867019052535e-06, + "loss": 0.7061, + "step": 233 + }, + { + "epoch": 0.5618247298919568, + "grad_norm": 0.9454915799985201, + "learning_rate": 8.604955222648772e-06, + "loss": 0.7505, + "step": 234 + }, + { + "epoch": 0.5642256902761105, + "grad_norm": 1.000828182837817, + "learning_rate": 8.591183533144172e-06, + "loss": 0.7546, + "step": 235 + }, + { + "epoch": 0.5666266506602641, + "grad_norm": 0.9228939989072429, + "learning_rate": 8.577355338700133e-06, + "loss": 0.7675, + "step": 236 + }, + { + "epoch": 0.5690276110444178, + "grad_norm": 1.184922039267135, + "learning_rate": 8.563470856894316e-06, + "loss": 0.7254, + "step": 237 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.8820446944758044, + "learning_rate": 8.549530306190015e-06, + "loss": 0.718, + "step": 238 + }, + { + "epoch": 0.5738295318127251, + "grad_norm": 1.1040228215856747, + "learning_rate": 8.535533905932739e-06, + "loss": 0.7741, + "step": 239 + }, + { + "epoch": 0.5762304921968787, + "grad_norm": 0.8713050147519229, + "learning_rate": 8.521481876346751e-06, + "loss": 0.7724, + "step": 240 + }, + { + "epoch": 0.5786314525810324, + "grad_norm": 1.261048501283676, + "learning_rate": 8.507374438531606e-06, + "loss": 0.7346, + "step": 241 + }, + { + "epoch": 0.581032412965186, + "grad_norm": 1.025441969049152, + "learning_rate": 8.493211814458674e-06, + "loss": 0.7434, + "step": 242 + }, + { + "epoch": 0.5834333733493398, + "grad_norm": 0.9922731431688747, + "learning_rate": 8.478994226967638e-06, + "loss": 0.7537, + "step": 243 + }, + { + "epoch": 0.5858343337334934, + "grad_norm": 0.9503319984653047, + "learning_rate": 8.464721899763003e-06, + "loss": 0.7845, + "step": 244 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 1.1876236123837647, + "learning_rate": 8.450395057410561e-06, + "loss": 0.7469, + "step": 245 + }, + { + "epoch": 0.5906362545018007, + "grad_norm": 1.7397436732241518, + "learning_rate": 8.436013925333868e-06, + "loss": 0.7519, + "step": 246 + }, + { + "epoch": 0.5930372148859544, + "grad_norm": 3.561635437191525, + "learning_rate": 8.421578729810693e-06, + "loss": 0.7558, + "step": 247 + }, + { + "epoch": 0.595438175270108, + "grad_norm": 0.8957487481226041, + "learning_rate": 8.407089697969458e-06, + "loss": 0.7337, + "step": 248 + }, + { + "epoch": 0.5978391356542617, + "grad_norm": 0.90662629233352, + "learning_rate": 8.392547057785662e-06, + "loss": 0.7312, + "step": 249 + }, + { + "epoch": 0.6002400960384153, + "grad_norm": 1.2695041036651842, + "learning_rate": 8.377951038078303e-06, + "loss": 0.7751, + "step": 250 + }, + { + "epoch": 0.602641056422569, + "grad_norm": 1.345157120613888, + "learning_rate": 8.363301868506264e-06, + "loss": 0.7401, + "step": 251 + }, + { + "epoch": 0.6050420168067226, + "grad_norm": 1.1702437582981216, + "learning_rate": 8.34859977956471e-06, + "loss": 0.7508, + "step": 252 + }, + { + "epoch": 0.6074429771908764, + "grad_norm": 1.0489855923078621, + "learning_rate": 8.33384500258146e-06, + "loss": 0.7578, + "step": 253 + }, + { + "epoch": 0.60984393757503, + "grad_norm": 0.9685408876120875, + "learning_rate": 8.319037769713338e-06, + "loss": 0.728, + "step": 254 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 1.3885104010614975, + "learning_rate": 8.304178313942536e-06, + "loss": 0.7346, + "step": 255 + }, + { + "epoch": 0.6146458583433373, + "grad_norm": 1.1866044878022257, + "learning_rate": 8.289266869072933e-06, + "loss": 0.7666, + "step": 256 + }, + { + "epoch": 0.617046818727491, + "grad_norm": 1.0305750345887703, + "learning_rate": 8.274303669726427e-06, + "loss": 0.8051, + "step": 257 + }, + { + "epoch": 0.6194477791116446, + "grad_norm": 1.8492085071792421, + "learning_rate": 8.259288951339233e-06, + "loss": 0.7967, + "step": 258 + }, + { + "epoch": 0.6218487394957983, + "grad_norm": 1.1664540722581158, + "learning_rate": 8.244222950158194e-06, + "loss": 0.7754, + "step": 259 + }, + { + "epoch": 0.6242496998799519, + "grad_norm": 1.1540766467381085, + "learning_rate": 8.229105903237045e-06, + "loss": 0.7439, + "step": 260 + }, + { + "epoch": 0.6266506602641057, + "grad_norm": 1.1934705441556062, + "learning_rate": 8.213938048432697e-06, + "loss": 0.7563, + "step": 261 + }, + { + "epoch": 0.6290516206482593, + "grad_norm": 1.146803985370755, + "learning_rate": 8.198719624401493e-06, + "loss": 0.7798, + "step": 262 + }, + { + "epoch": 0.631452581032413, + "grad_norm": 1.0165065570676493, + "learning_rate": 8.183450870595443e-06, + "loss": 0.7413, + "step": 263 + }, + { + "epoch": 0.6338535414165666, + "grad_norm": 1.5262803590995224, + "learning_rate": 8.168132027258467e-06, + "loss": 0.7941, + "step": 264 + }, + { + "epoch": 0.6362545018007203, + "grad_norm": 1.0571414793861305, + "learning_rate": 8.152763335422612e-06, + "loss": 0.7839, + "step": 265 + }, + { + "epoch": 0.6386554621848739, + "grad_norm": 1.0454099741877119, + "learning_rate": 8.13734503690426e-06, + "loss": 0.7884, + "step": 266 + }, + { + "epoch": 0.6410564225690276, + "grad_norm": 1.3245270490543846, + "learning_rate": 8.121877374300318e-06, + "loss": 0.7618, + "step": 267 + }, + { + "epoch": 0.6434573829531812, + "grad_norm": 0.9697059008882367, + "learning_rate": 8.106360590984406e-06, + "loss": 0.7538, + "step": 268 + }, + { + "epoch": 0.6458583433373349, + "grad_norm": 0.9819086681621175, + "learning_rate": 8.090794931103026e-06, + "loss": 0.802, + "step": 269 + }, + { + "epoch": 0.6482593037214885, + "grad_norm": 1.0499699628185923, + "learning_rate": 8.075180639571726e-06, + "loss": 0.7553, + "step": 270 + }, + { + "epoch": 0.6506602641056423, + "grad_norm": 1.1150101137484323, + "learning_rate": 8.059517962071234e-06, + "loss": 0.7943, + "step": 271 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 1.2142405538485563, + "learning_rate": 8.043807145043604e-06, + "loss": 0.7731, + "step": 272 + }, + { + "epoch": 0.6554621848739496, + "grad_norm": 1.6154134337294808, + "learning_rate": 8.028048435688333e-06, + "loss": 0.7863, + "step": 273 + }, + { + "epoch": 0.6578631452581032, + "grad_norm": 0.9839829711711955, + "learning_rate": 8.012242081958477e-06, + "loss": 0.7724, + "step": 274 + }, + { + "epoch": 0.6602641056422569, + "grad_norm": 1.2594110387151705, + "learning_rate": 7.996388332556735e-06, + "loss": 0.7519, + "step": 275 + }, + { + "epoch": 0.6626650660264105, + "grad_norm": 1.0696851214819196, + "learning_rate": 7.980487436931558e-06, + "loss": 0.7736, + "step": 276 + }, + { + "epoch": 0.6650660264105642, + "grad_norm": 1.007391516874581, + "learning_rate": 7.964539645273204e-06, + "loss": 0.7803, + "step": 277 + }, + { + "epoch": 0.6674669867947179, + "grad_norm": 1.1895821378331923, + "learning_rate": 7.948545208509811e-06, + "loss": 0.7286, + "step": 278 + }, + { + "epoch": 0.6698679471788715, + "grad_norm": 1.1781445386849876, + "learning_rate": 7.932504378303452e-06, + "loss": 0.7451, + "step": 279 + }, + { + "epoch": 0.6722689075630253, + "grad_norm": 0.9004446486986322, + "learning_rate": 7.916417407046166e-06, + "loss": 0.7744, + "step": 280 + }, + { + "epoch": 0.6746698679471789, + "grad_norm": 1.0504636199086463, + "learning_rate": 7.900284547855992e-06, + "loss": 0.7189, + "step": 281 + }, + { + "epoch": 0.6770708283313326, + "grad_norm": 0.9851224240510926, + "learning_rate": 7.884106054572987e-06, + "loss": 0.7228, + "step": 282 + }, + { + "epoch": 0.6794717887154862, + "grad_norm": 0.882730672015069, + "learning_rate": 7.86788218175523e-06, + "loss": 0.7606, + "step": 283 + }, + { + "epoch": 0.6818727490996399, + "grad_norm": 1.134738248618347, + "learning_rate": 7.851613184674821e-06, + "loss": 0.7453, + "step": 284 + }, + { + "epoch": 0.6842737094837935, + "grad_norm": 1.3550122549643175, + "learning_rate": 7.835299319313854e-06, + "loss": 0.7346, + "step": 285 + }, + { + "epoch": 0.6866746698679472, + "grad_norm": 1.1551976950212515, + "learning_rate": 7.818940842360404e-06, + "loss": 0.7225, + "step": 286 + }, + { + "epoch": 0.6890756302521008, + "grad_norm": 1.0596588721021893, + "learning_rate": 7.80253801120447e-06, + "loss": 0.7729, + "step": 287 + }, + { + "epoch": 0.6914765906362546, + "grad_norm": 2.0352142132248483, + "learning_rate": 7.78609108393395e-06, + "loss": 0.7651, + "step": 288 + }, + { + "epoch": 0.6938775510204082, + "grad_norm": 1.0580145696903516, + "learning_rate": 7.769600319330553e-06, + "loss": 0.7577, + "step": 289 + }, + { + "epoch": 0.6962785114045619, + "grad_norm": 0.9327611537310231, + "learning_rate": 7.753065976865745e-06, + "loss": 0.7345, + "step": 290 + }, + { + "epoch": 0.6986794717887155, + "grad_norm": 0.9177655371636081, + "learning_rate": 7.736488316696663e-06, + "loss": 0.7542, + "step": 291 + }, + { + "epoch": 0.7010804321728692, + "grad_norm": 0.9186582304267394, + "learning_rate": 7.719867599662017e-06, + "loss": 0.7067, + "step": 292 + }, + { + "epoch": 0.7034813925570228, + "grad_norm": 1.4851894689768277, + "learning_rate": 7.703204087277989e-06, + "loss": 0.7895, + "step": 293 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 1.3443334645052298, + "learning_rate": 7.686498041734121e-06, + "loss": 0.7201, + "step": 294 + }, + { + "epoch": 0.7082833133253301, + "grad_norm": 0.9541971519404381, + "learning_rate": 7.669749725889182e-06, + "loss": 0.7078, + "step": 295 + }, + { + "epoch": 0.7106842737094838, + "grad_norm": 0.9343509496893679, + "learning_rate": 7.65295940326704e-06, + "loss": 0.7449, + "step": 296 + }, + { + "epoch": 0.7130852340936374, + "grad_norm": 1.0135831282736736, + "learning_rate": 7.636127338052513e-06, + "loss": 0.7561, + "step": 297 + }, + { + "epoch": 0.7154861944777912, + "grad_norm": 1.2913595649693785, + "learning_rate": 7.619253795087209e-06, + "loss": 0.7582, + "step": 298 + }, + { + "epoch": 0.7178871548619448, + "grad_norm": 0.98172249006896, + "learning_rate": 7.602339039865362e-06, + "loss": 0.7616, + "step": 299 + }, + { + "epoch": 0.7202881152460985, + "grad_norm": 0.9737413536629318, + "learning_rate": 7.5853833385296545e-06, + "loss": 0.767, + "step": 300 + }, + { + "epoch": 0.7226890756302521, + "grad_norm": 0.9292017890894759, + "learning_rate": 7.568386957867033e-06, + "loss": 0.7535, + "step": 301 + }, + { + "epoch": 0.7250900360144058, + "grad_norm": 0.8482575432449314, + "learning_rate": 7.5513501653045e-06, + "loss": 0.7573, + "step": 302 + }, + { + "epoch": 0.7274909963985594, + "grad_norm": 0.9964160968006177, + "learning_rate": 7.534273228904916e-06, + "loss": 0.7487, + "step": 303 + }, + { + "epoch": 0.7298919567827131, + "grad_norm": 1.0416855644496446, + "learning_rate": 7.5171564173627795e-06, + "loss": 0.7472, + "step": 304 + }, + { + "epoch": 0.7322929171668667, + "grad_norm": 1.0226595073884834, + "learning_rate": 7.500000000000001e-06, + "loss": 0.7682, + "step": 305 + }, + { + "epoch": 0.7346938775510204, + "grad_norm": 1.0250925918628377, + "learning_rate": 7.482804246761657e-06, + "loss": 0.819, + "step": 306 + }, + { + "epoch": 0.737094837935174, + "grad_norm": 1.0101595190279031, + "learning_rate": 7.465569428211752e-06, + "loss": 0.7815, + "step": 307 + }, + { + "epoch": 0.7394957983193278, + "grad_norm": 1.0375551725741046, + "learning_rate": 7.448295815528956e-06, + "loss": 0.7546, + "step": 308 + }, + { + "epoch": 0.7418967587034814, + "grad_norm": 0.9524297093471641, + "learning_rate": 7.430983680502344e-06, + "loss": 0.7651, + "step": 309 + }, + { + "epoch": 0.7442977190876351, + "grad_norm": 1.0192202418334808, + "learning_rate": 7.413633295527109e-06, + "loss": 0.7386, + "step": 310 + }, + { + "epoch": 0.7466986794717887, + "grad_norm": 0.9486780543314182, + "learning_rate": 7.396244933600285e-06, + "loss": 0.768, + "step": 311 + }, + { + "epoch": 0.7490996398559424, + "grad_norm": 0.9813092444439941, + "learning_rate": 7.378818868316449e-06, + "loss": 0.7531, + "step": 312 + }, + { + "epoch": 0.751500600240096, + "grad_norm": 0.9112662872131286, + "learning_rate": 7.361355373863415e-06, + "loss": 0.7513, + "step": 313 + }, + { + "epoch": 0.7539015606242497, + "grad_norm": 1.058071811604099, + "learning_rate": 7.343854725017919e-06, + "loss": 0.7757, + "step": 314 + }, + { + "epoch": 0.7563025210084033, + "grad_norm": 0.9009387483744434, + "learning_rate": 7.326317197141304e-06, + "loss": 0.7458, + "step": 315 + }, + { + "epoch": 0.758703481392557, + "grad_norm": 1.020248201280368, + "learning_rate": 7.308743066175172e-06, + "loss": 0.7707, + "step": 316 + }, + { + "epoch": 0.7611044417767107, + "grad_norm": 1.0146874125529126, + "learning_rate": 7.291132608637053e-06, + "loss": 0.7426, + "step": 317 + }, + { + "epoch": 0.7635054021608644, + "grad_norm": 1.0743624752190135, + "learning_rate": 7.273486101616057e-06, + "loss": 0.7998, + "step": 318 + }, + { + "epoch": 0.765906362545018, + "grad_norm": 1.7707907807158567, + "learning_rate": 7.255803822768504e-06, + "loss": 0.7722, + "step": 319 + }, + { + "epoch": 0.7683073229291717, + "grad_norm": 1.0121350667327902, + "learning_rate": 7.238086050313563e-06, + "loss": 0.7313, + "step": 320 + }, + { + "epoch": 0.7707082833133253, + "grad_norm": 1.0163182881965183, + "learning_rate": 7.2203330630288714e-06, + "loss": 0.7435, + "step": 321 + }, + { + "epoch": 0.773109243697479, + "grad_norm": 0.9268275302035212, + "learning_rate": 7.202545140246148e-06, + "loss": 0.7664, + "step": 322 + }, + { + "epoch": 0.7755102040816326, + "grad_norm": 1.2563485488363568, + "learning_rate": 7.1847225618467975e-06, + "loss": 0.7801, + "step": 323 + }, + { + "epoch": 0.7779111644657863, + "grad_norm": 1.037881541942287, + "learning_rate": 7.166865608257515e-06, + "loss": 0.7652, + "step": 324 + }, + { + "epoch": 0.78031212484994, + "grad_norm": 1.3383350176718172, + "learning_rate": 7.148974560445859e-06, + "loss": 0.7226, + "step": 325 + }, + { + "epoch": 0.7827130852340937, + "grad_norm": 1.0578449779959211, + "learning_rate": 7.131049699915842e-06, + "loss": 0.7552, + "step": 326 + }, + { + "epoch": 0.7851140456182473, + "grad_norm": 1.0137416729283089, + "learning_rate": 7.113091308703498e-06, + "loss": 0.7536, + "step": 327 + }, + { + "epoch": 0.787515006002401, + "grad_norm": 1.1642800214358668, + "learning_rate": 7.095099669372444e-06, + "loss": 0.7488, + "step": 328 + }, + { + "epoch": 0.7899159663865546, + "grad_norm": 1.057986883568984, + "learning_rate": 7.0770750650094335e-06, + "loss": 0.7988, + "step": 329 + }, + { + "epoch": 0.7923169267707083, + "grad_norm": 1.0658321245710127, + "learning_rate": 7.059017779219904e-06, + "loss": 0.7202, + "step": 330 + }, + { + "epoch": 0.7947178871548619, + "grad_norm": 1.2955485154946214, + "learning_rate": 7.040928096123516e-06, + "loss": 0.7656, + "step": 331 + }, + { + "epoch": 0.7971188475390156, + "grad_norm": 0.8925229800971378, + "learning_rate": 7.022806300349676e-06, + "loss": 0.716, + "step": 332 + }, + { + "epoch": 0.7995198079231692, + "grad_norm": 1.0187499110626825, + "learning_rate": 7.004652677033069e-06, + "loss": 0.7473, + "step": 333 + }, + { + "epoch": 0.801920768307323, + "grad_norm": 1.5751305697768676, + "learning_rate": 6.98646751180916e-06, + "loss": 0.7509, + "step": 334 + }, + { + "epoch": 0.8043217286914766, + "grad_norm": 1.3546230921711813, + "learning_rate": 6.968251090809708e-06, + "loss": 0.7695, + "step": 335 + }, + { + "epoch": 0.8067226890756303, + "grad_norm": 1.069845095442519, + "learning_rate": 6.95000370065826e-06, + "loss": 0.7435, + "step": 336 + }, + { + "epoch": 0.8091236494597839, + "grad_norm": 0.9844453826512336, + "learning_rate": 6.931725628465643e-06, + "loss": 0.7542, + "step": 337 + }, + { + "epoch": 0.8115246098439376, + "grad_norm": 1.247669840020457, + "learning_rate": 6.913417161825449e-06, + "loss": 0.7366, + "step": 338 + }, + { + "epoch": 0.8139255702280912, + "grad_norm": 0.9839530814071178, + "learning_rate": 6.895078588809503e-06, + "loss": 0.7235, + "step": 339 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.9114990748369234, + "learning_rate": 6.876710197963334e-06, + "loss": 0.7584, + "step": 340 + }, + { + "epoch": 0.8187274909963985, + "grad_norm": 0.8893953013528672, + "learning_rate": 6.858312278301638e-06, + "loss": 0.7466, + "step": 341 + }, + { + "epoch": 0.8211284513805522, + "grad_norm": 1.3095523002713856, + "learning_rate": 6.839885119303726e-06, + "loss": 0.8087, + "step": 342 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.9225661035820061, + "learning_rate": 6.821429010908972e-06, + "loss": 0.7372, + "step": 343 + }, + { + "epoch": 0.8259303721488596, + "grad_norm": 0.9913169132532897, + "learning_rate": 6.802944243512249e-06, + "loss": 0.7756, + "step": 344 + }, + { + "epoch": 0.8283313325330132, + "grad_norm": 0.9851272669238657, + "learning_rate": 6.78443110795936e-06, + "loss": 0.7104, + "step": 345 + }, + { + "epoch": 0.8307322929171669, + "grad_norm": 0.8690268038644338, + "learning_rate": 6.765889895542465e-06, + "loss": 0.771, + "step": 346 + }, + { + "epoch": 0.8331332533013205, + "grad_norm": 1.0136014295946223, + "learning_rate": 6.747320897995493e-06, + "loss": 0.7672, + "step": 347 + }, + { + "epoch": 0.8355342136854742, + "grad_norm": 0.9009692165583426, + "learning_rate": 6.728724407489554e-06, + "loss": 0.7566, + "step": 348 + }, + { + "epoch": 0.8379351740696278, + "grad_norm": 3.3801588169217225, + "learning_rate": 6.710100716628345e-06, + "loss": 0.7636, + "step": 349 + }, + { + "epoch": 0.8403361344537815, + "grad_norm": 0.8939669513482729, + "learning_rate": 6.691450118443538e-06, + "loss": 0.7323, + "step": 350 + }, + { + "epoch": 0.8427370948379351, + "grad_norm": 1.256771645476368, + "learning_rate": 6.672772906390177e-06, + "loss": 0.7878, + "step": 351 + }, + { + "epoch": 0.8451380552220888, + "grad_norm": 0.9716979576540683, + "learning_rate": 6.65406937434206e-06, + "loss": 0.746, + "step": 352 + }, + { + "epoch": 0.8475390156062425, + "grad_norm": 1.17294825500255, + "learning_rate": 6.635339816587109e-06, + "loss": 0.8029, + "step": 353 + }, + { + "epoch": 0.8499399759903962, + "grad_norm": 1.1320782160419252, + "learning_rate": 6.616584527822745e-06, + "loss": 0.7398, + "step": 354 + }, + { + "epoch": 0.8523409363745498, + "grad_norm": 0.8285295612933271, + "learning_rate": 6.5978038031512505e-06, + "loss": 0.7413, + "step": 355 + }, + { + "epoch": 0.8547418967587035, + "grad_norm": 1.1319688174607296, + "learning_rate": 6.578997938075126e-06, + "loss": 0.7815, + "step": 356 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.8783242447132844, + "learning_rate": 6.560167228492436e-06, + "loss": 0.734, + "step": 357 + }, + { + "epoch": 0.8595438175270108, + "grad_norm": 1.061138411536176, + "learning_rate": 6.5413119706921635e-06, + "loss": 0.7205, + "step": 358 + }, + { + "epoch": 0.8619447779111644, + "grad_norm": 1.9255346896224161, + "learning_rate": 6.522432461349536e-06, + "loss": 0.7528, + "step": 359 + }, + { + "epoch": 0.8643457382953181, + "grad_norm": 0.9835784047836312, + "learning_rate": 6.503528997521365e-06, + "loss": 0.7188, + "step": 360 + }, + { + "epoch": 0.8667466986794717, + "grad_norm": 1.0231868671892967, + "learning_rate": 6.484601876641375e-06, + "loss": 0.7469, + "step": 361 + }, + { + "epoch": 0.8691476590636255, + "grad_norm": 0.9166994445850244, + "learning_rate": 6.465651396515511e-06, + "loss": 0.7513, + "step": 362 + }, + { + "epoch": 0.8715486194477791, + "grad_norm": 0.9766913854228723, + "learning_rate": 6.446677855317265e-06, + "loss": 0.7955, + "step": 363 + }, + { + "epoch": 0.8739495798319328, + "grad_norm": 1.3198005619305515, + "learning_rate": 6.427681551582978e-06, + "loss": 0.7519, + "step": 364 + }, + { + "epoch": 0.8763505402160864, + "grad_norm": 0.9712835770794745, + "learning_rate": 6.408662784207149e-06, + "loss": 0.7459, + "step": 365 + }, + { + "epoch": 0.8787515006002401, + "grad_norm": 1.0163919411898112, + "learning_rate": 6.389621852437723e-06, + "loss": 0.7759, + "step": 366 + }, + { + "epoch": 0.8811524609843937, + "grad_norm": 0.950248653944803, + "learning_rate": 6.370559055871389e-06, + "loss": 0.7378, + "step": 367 + }, + { + "epoch": 0.8835534213685474, + "grad_norm": 1.1181926812558955, + "learning_rate": 6.351474694448865e-06, + "loss": 0.746, + "step": 368 + }, + { + "epoch": 0.885954381752701, + "grad_norm": 1.037686986955754, + "learning_rate": 6.332369068450175e-06, + "loss": 0.7656, + "step": 369 + }, + { + "epoch": 0.8883553421368547, + "grad_norm": 1.072450361306469, + "learning_rate": 6.313242478489934e-06, + "loss": 0.7104, + "step": 370 + }, + { + "epoch": 0.8907563025210085, + "grad_norm": 1.108167091391112, + "learning_rate": 6.294095225512604e-06, + "loss": 0.8022, + "step": 371 + }, + { + "epoch": 0.8931572629051621, + "grad_norm": 1.722491884002692, + "learning_rate": 6.274927610787771e-06, + "loss": 0.7242, + "step": 372 + }, + { + "epoch": 0.8955582232893158, + "grad_norm": 1.0457660283102586, + "learning_rate": 6.255739935905396e-06, + "loss": 0.7866, + "step": 373 + }, + { + "epoch": 0.8979591836734694, + "grad_norm": 1.6184876219177928, + "learning_rate": 6.236532502771078e-06, + "loss": 0.7439, + "step": 374 + }, + { + "epoch": 0.9003601440576231, + "grad_norm": 1.0062644766522555, + "learning_rate": 6.217305613601296e-06, + "loss": 0.7334, + "step": 375 + }, + { + "epoch": 0.9027611044417767, + "grad_norm": 0.9352218251328956, + "learning_rate": 6.198059570918656e-06, + "loss": 0.7451, + "step": 376 + }, + { + "epoch": 0.9051620648259304, + "grad_norm": 0.9116887661426208, + "learning_rate": 6.178794677547138e-06, + "loss": 0.7324, + "step": 377 + }, + { + "epoch": 0.907563025210084, + "grad_norm": 1.183777769058435, + "learning_rate": 6.1595112366073164e-06, + "loss": 0.7257, + "step": 378 + }, + { + "epoch": 0.9099639855942377, + "grad_norm": 0.9545364548686734, + "learning_rate": 6.140209551511609e-06, + "loss": 0.708, + "step": 379 + }, + { + "epoch": 0.9123649459783914, + "grad_norm": 0.9266052219702813, + "learning_rate": 6.120889925959486e-06, + "loss": 0.7892, + "step": 380 + }, + { + "epoch": 0.9147659063625451, + "grad_norm": 0.9080830124340115, + "learning_rate": 6.101552663932704e-06, + "loss": 0.7016, + "step": 381 + }, + { + "epoch": 0.9171668667466987, + "grad_norm": 1.1299134877349652, + "learning_rate": 6.0821980696905145e-06, + "loss": 0.7674, + "step": 382 + }, + { + "epoch": 0.9195678271308524, + "grad_norm": 0.8451340918186704, + "learning_rate": 6.062826447764883e-06, + "loss": 0.698, + "step": 383 + }, + { + "epoch": 0.921968787515006, + "grad_norm": 0.7924589314809064, + "learning_rate": 6.0434381029556945e-06, + "loss": 0.7397, + "step": 384 + }, + { + "epoch": 0.9243697478991597, + "grad_norm": 0.9045434583742306, + "learning_rate": 6.024033340325954e-06, + "loss": 0.7425, + "step": 385 + }, + { + "epoch": 0.9267707082833133, + "grad_norm": 0.9016007393146936, + "learning_rate": 6.004612465196994e-06, + "loss": 0.7258, + "step": 386 + }, + { + "epoch": 0.929171668667467, + "grad_norm": 1.0646113635207004, + "learning_rate": 5.985175783143666e-06, + "loss": 0.7614, + "step": 387 + }, + { + "epoch": 0.9315726290516206, + "grad_norm": 0.9444488037926984, + "learning_rate": 5.965723599989529e-06, + "loss": 0.7225, + "step": 388 + }, + { + "epoch": 0.9339735894357744, + "grad_norm": 1.1421403843804732, + "learning_rate": 5.946256221802052e-06, + "loss": 0.7859, + "step": 389 + }, + { + "epoch": 0.936374549819928, + "grad_norm": 1.083603518417246, + "learning_rate": 5.926773954887777e-06, + "loss": 0.778, + "step": 390 + }, + { + "epoch": 0.9387755102040817, + "grad_norm": 1.0818704239803636, + "learning_rate": 5.907277105787513e-06, + "loss": 0.7646, + "step": 391 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 1.0705981939700056, + "learning_rate": 5.887765981271518e-06, + "loss": 0.8131, + "step": 392 + }, + { + "epoch": 0.943577430972389, + "grad_norm": 2.2203638089853044, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.7368, + "step": 393 + }, + { + "epoch": 0.9459783913565426, + "grad_norm": 1.0437303875045223, + "learning_rate": 5.848702134191571e-06, + "loss": 0.7742, + "step": 394 + }, + { + "epoch": 0.9483793517406963, + "grad_norm": 1.210376504898421, + "learning_rate": 5.829150026271871e-06, + "loss": 0.7314, + "step": 395 + }, + { + "epoch": 0.9507803121248499, + "grad_norm": 1.1286122534398462, + "learning_rate": 5.80958487221527e-06, + "loss": 0.7788, + "step": 396 + }, + { + "epoch": 0.9531812725090036, + "grad_norm": 1.3543601065822313, + "learning_rate": 5.79000697986675e-06, + "loss": 0.7606, + "step": 397 + }, + { + "epoch": 0.9555822328931572, + "grad_norm": 0.9033985285987017, + "learning_rate": 5.7704166572717295e-06, + "loss": 0.7275, + "step": 398 + }, + { + "epoch": 0.957983193277311, + "grad_norm": 1.6202620989820378, + "learning_rate": 5.750814212671202e-06, + "loss": 0.7501, + "step": 399 + }, + { + "epoch": 0.9603841536614646, + "grad_norm": 1.0808376641943391, + "learning_rate": 5.731199954496899e-06, + "loss": 0.7872, + "step": 400 + }, + { + "epoch": 0.9627851140456183, + "grad_norm": 1.2033579507042218, + "learning_rate": 5.711574191366427e-06, + "loss": 0.7743, + "step": 401 + }, + { + "epoch": 0.9651860744297719, + "grad_norm": 0.9090277632343909, + "learning_rate": 5.691937232078415e-06, + "loss": 0.7538, + "step": 402 + }, + { + "epoch": 0.9675870348139256, + "grad_norm": 0.8618200490078318, + "learning_rate": 5.6722893856076596e-06, + "loss": 0.7009, + "step": 403 + }, + { + "epoch": 0.9699879951980792, + "grad_norm": 0.9175641207469445, + "learning_rate": 5.65263096110026e-06, + "loss": 0.7577, + "step": 404 + }, + { + "epoch": 0.9723889555822329, + "grad_norm": 0.8794915802010296, + "learning_rate": 5.632962267868747e-06, + "loss": 0.7103, + "step": 405 + }, + { + "epoch": 0.9747899159663865, + "grad_norm": 1.0573779393835883, + "learning_rate": 5.6132836153872335e-06, + "loss": 0.7752, + "step": 406 + }, + { + "epoch": 0.9771908763505402, + "grad_norm": 1.0016281052861966, + "learning_rate": 5.593595313286526e-06, + "loss": 0.7634, + "step": 407 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 1.307171807596152, + "learning_rate": 5.573897671349269e-06, + "loss": 0.7534, + "step": 408 + }, + { + "epoch": 0.9819927971188476, + "grad_norm": 1.0143411203734125, + "learning_rate": 5.5541909995050554e-06, + "loss": 0.7303, + "step": 409 + }, + { + "epoch": 0.9843937575030012, + "grad_norm": 1.010734624581804, + "learning_rate": 5.534475607825566e-06, + "loss": 0.764, + "step": 410 + }, + { + "epoch": 0.9867947178871549, + "grad_norm": 1.2154276169872473, + "learning_rate": 5.514751806519673e-06, + "loss": 0.7378, + "step": 411 + }, + { + "epoch": 0.9891956782713085, + "grad_norm": 0.9686457443348316, + "learning_rate": 5.495019905928578e-06, + "loss": 0.7392, + "step": 412 + }, + { + "epoch": 0.9915966386554622, + "grad_norm": 1.0745017643744474, + "learning_rate": 5.475280216520913e-06, + "loss": 0.7819, + "step": 413 + }, + { + "epoch": 0.9939975990396158, + "grad_norm": 0.999983849780639, + "learning_rate": 5.455533048887868e-06, + "loss": 0.7569, + "step": 414 + }, + { + "epoch": 0.9963985594237695, + "grad_norm": 1.02702252315643, + "learning_rate": 5.435778713738292e-06, + "loss": 0.7627, + "step": 415 + }, + { + "epoch": 0.9987995198079231, + "grad_norm": 1.0724028498080733, + "learning_rate": 5.416017521893813e-06, + "loss": 0.7009, + "step": 416 + }, + { + "epoch": 1.0, + "grad_norm": 1.0724028498080733, + "learning_rate": 5.396249784283943e-06, + "loss": 0.8865, + "step": 417 + }, + { + "epoch": 1.0024009603841537, + "grad_norm": 2.101877070085442, + "learning_rate": 5.3764758119411905e-06, + "loss": 0.7009, + "step": 418 + }, + { + "epoch": 1.0048019207683074, + "grad_norm": 1.118846557228715, + "learning_rate": 5.356695915996162e-06, + "loss": 0.7542, + "step": 419 + }, + { + "epoch": 1.007202881152461, + "grad_norm": 1.156865195159883, + "learning_rate": 5.336910407672668e-06, + "loss": 0.6836, + "step": 420 + }, + { + "epoch": 1.0096038415366146, + "grad_norm": 0.9278578282699763, + "learning_rate": 5.317119598282823e-06, + "loss": 0.7363, + "step": 421 + }, + { + "epoch": 1.0120048019207684, + "grad_norm": 0.995653093708434, + "learning_rate": 5.297323799222156e-06, + "loss": 0.7663, + "step": 422 + }, + { + "epoch": 1.014405762304922, + "grad_norm": 1.1905262761143818, + "learning_rate": 5.277523321964701e-06, + "loss": 0.7088, + "step": 423 + }, + { + "epoch": 1.0168067226890756, + "grad_norm": 0.8602717681140455, + "learning_rate": 5.257718478058105e-06, + "loss": 0.7622, + "step": 424 + }, + { + "epoch": 1.0192076830732293, + "grad_norm": 1.380911494413432, + "learning_rate": 5.237909579118713e-06, + "loss": 0.7616, + "step": 425 + }, + { + "epoch": 1.021608643457383, + "grad_norm": 8.665839441934292, + "learning_rate": 5.218096936826681e-06, + "loss": 0.7419, + "step": 426 + }, + { + "epoch": 1.0240096038415367, + "grad_norm": 0.9358352128566729, + "learning_rate": 5.198280862921062e-06, + "loss": 0.7515, + "step": 427 + }, + { + "epoch": 1.0264105642256902, + "grad_norm": 1.3142767223698053, + "learning_rate": 5.178461669194903e-06, + "loss": 0.7266, + "step": 428 + }, + { + "epoch": 1.028811524609844, + "grad_norm": 1.050719187044544, + "learning_rate": 5.15863966749034e-06, + "loss": 0.7296, + "step": 429 + }, + { + "epoch": 1.0312124849939976, + "grad_norm": 1.302115134296509, + "learning_rate": 5.138815169693687e-06, + "loss": 0.6843, + "step": 430 + }, + { + "epoch": 1.0336134453781514, + "grad_norm": 0.847885176823766, + "learning_rate": 5.118988487730537e-06, + "loss": 0.7294, + "step": 431 + }, + { + "epoch": 1.0360144057623049, + "grad_norm": 0.9180649724528399, + "learning_rate": 5.099159933560851e-06, + "loss": 0.6877, + "step": 432 + }, + { + "epoch": 1.0384153661464586, + "grad_norm": 0.9090661115202275, + "learning_rate": 5.07932981917404e-06, + "loss": 0.6987, + "step": 433 + }, + { + "epoch": 1.0408163265306123, + "grad_norm": 0.8573995996185877, + "learning_rate": 5.059498456584072e-06, + "loss": 0.6693, + "step": 434 + }, + { + "epoch": 1.043217286914766, + "grad_norm": 0.8185459787831719, + "learning_rate": 5.039666157824549e-06, + "loss": 0.6714, + "step": 435 + }, + { + "epoch": 1.0456182472989195, + "grad_norm": 0.8618136173846955, + "learning_rate": 5.019833234943806e-06, + "loss": 0.743, + "step": 436 + }, + { + "epoch": 1.0480192076830732, + "grad_norm": 1.435576312926133, + "learning_rate": 5e-06, + "loss": 0.7218, + "step": 437 + }, + { + "epoch": 1.050420168067227, + "grad_norm": 0.7341581025321086, + "learning_rate": 4.980166765056194e-06, + "loss": 0.7147, + "step": 438 + }, + { + "epoch": 1.0528211284513807, + "grad_norm": 1.081533239673927, + "learning_rate": 4.960333842175453e-06, + "loss": 0.6944, + "step": 439 + }, + { + "epoch": 1.0552220888355341, + "grad_norm": 0.9479569729309318, + "learning_rate": 4.940501543415929e-06, + "loss": 0.7085, + "step": 440 + }, + { + "epoch": 1.0576230492196879, + "grad_norm": 0.8342266084401958, + "learning_rate": 4.9206701808259605e-06, + "loss": 0.6474, + "step": 441 + }, + { + "epoch": 1.0600240096038416, + "grad_norm": 0.9085685056051219, + "learning_rate": 4.900840066439151e-06, + "loss": 0.6667, + "step": 442 + }, + { + "epoch": 1.0624249699879953, + "grad_norm": 0.8889802086577271, + "learning_rate": 4.881011512269464e-06, + "loss": 0.6867, + "step": 443 + }, + { + "epoch": 1.0648259303721488, + "grad_norm": 0.9332804081329865, + "learning_rate": 4.861184830306314e-06, + "loss": 0.6801, + "step": 444 + }, + { + "epoch": 1.0672268907563025, + "grad_norm": 0.9868828565183504, + "learning_rate": 4.841360332509663e-06, + "loss": 0.7641, + "step": 445 + }, + { + "epoch": 1.0696278511404562, + "grad_norm": 1.2155044715175538, + "learning_rate": 4.821538330805098e-06, + "loss": 0.6743, + "step": 446 + }, + { + "epoch": 1.07202881152461, + "grad_norm": 0.9925350547990021, + "learning_rate": 4.801719137078939e-06, + "loss": 0.6516, + "step": 447 + }, + { + "epoch": 1.0744297719087634, + "grad_norm": 1.0348738488040936, + "learning_rate": 4.781903063173321e-06, + "loss": 0.6509, + "step": 448 + }, + { + "epoch": 1.0768307322929171, + "grad_norm": 0.836392400812354, + "learning_rate": 4.762090420881289e-06, + "loss": 0.6267, + "step": 449 + }, + { + "epoch": 1.0792316926770709, + "grad_norm": 0.9188934737658707, + "learning_rate": 4.742281521941897e-06, + "loss": 0.6933, + "step": 450 + }, + { + "epoch": 1.0816326530612246, + "grad_norm": 1.2483588900096583, + "learning_rate": 4.7224766780353005e-06, + "loss": 0.701, + "step": 451 + }, + { + "epoch": 1.084033613445378, + "grad_norm": 1.099152868995137, + "learning_rate": 4.7026762007778455e-06, + "loss": 0.6968, + "step": 452 + }, + { + "epoch": 1.0864345738295318, + "grad_norm": 0.8641730346092144, + "learning_rate": 4.682880401717178e-06, + "loss": 0.6913, + "step": 453 + }, + { + "epoch": 1.0888355342136855, + "grad_norm": 1.0362371171789713, + "learning_rate": 4.663089592327333e-06, + "loss": 0.663, + "step": 454 + }, + { + "epoch": 1.0912364945978392, + "grad_norm": 1.1665659853493717, + "learning_rate": 4.643304084003839e-06, + "loss": 0.6402, + "step": 455 + }, + { + "epoch": 1.0936374549819927, + "grad_norm": 1.042312121994228, + "learning_rate": 4.62352418805881e-06, + "loss": 0.6993, + "step": 456 + }, + { + "epoch": 1.0960384153661464, + "grad_norm": 1.0156806331552348, + "learning_rate": 4.603750215716057e-06, + "loss": 0.6469, + "step": 457 + }, + { + "epoch": 1.0984393757503002, + "grad_norm": 1.0037418962257996, + "learning_rate": 4.583982478106189e-06, + "loss": 0.6736, + "step": 458 + }, + { + "epoch": 1.1008403361344539, + "grad_norm": 1.0639389320851615, + "learning_rate": 4.564221286261709e-06, + "loss": 0.6973, + "step": 459 + }, + { + "epoch": 1.1032412965186074, + "grad_norm": 6.957024323901044, + "learning_rate": 4.544466951112132e-06, + "loss": 0.6831, + "step": 460 + }, + { + "epoch": 1.105642256902761, + "grad_norm": 0.882371596489962, + "learning_rate": 4.524719783479088e-06, + "loss": 0.6545, + "step": 461 + }, + { + "epoch": 1.1080432172869148, + "grad_norm": 1.0995617865014164, + "learning_rate": 4.504980094071424e-06, + "loss": 0.675, + "step": 462 + }, + { + "epoch": 1.1104441776710685, + "grad_norm": 0.8925989987844686, + "learning_rate": 4.485248193480328e-06, + "loss": 0.6371, + "step": 463 + }, + { + "epoch": 1.112845138055222, + "grad_norm": 1.000513181879756, + "learning_rate": 4.465524392174437e-06, + "loss": 0.6369, + "step": 464 + }, + { + "epoch": 1.1152460984393757, + "grad_norm": 2.259472809190549, + "learning_rate": 4.445809000494945e-06, + "loss": 0.6465, + "step": 465 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 0.959502156080397, + "learning_rate": 4.426102328650732e-06, + "loss": 0.6307, + "step": 466 + }, + { + "epoch": 1.1200480192076832, + "grad_norm": 1.0884936132184861, + "learning_rate": 4.4064046867134755e-06, + "loss": 0.6523, + "step": 467 + }, + { + "epoch": 1.1224489795918366, + "grad_norm": 1.2362423608257147, + "learning_rate": 4.386716384612768e-06, + "loss": 0.7073, + "step": 468 + }, + { + "epoch": 1.1248499399759904, + "grad_norm": 1.154930197028087, + "learning_rate": 4.367037732131254e-06, + "loss": 0.6702, + "step": 469 + }, + { + "epoch": 1.127250900360144, + "grad_norm": 0.9675716457470993, + "learning_rate": 4.347369038899744e-06, + "loss": 0.6306, + "step": 470 + }, + { + "epoch": 1.1296518607442978, + "grad_norm": 1.2123183177500714, + "learning_rate": 4.327710614392341e-06, + "loss": 0.6422, + "step": 471 + }, + { + "epoch": 1.1320528211284513, + "grad_norm": 0.8783262340992644, + "learning_rate": 4.308062767921586e-06, + "loss": 0.6408, + "step": 472 + }, + { + "epoch": 1.134453781512605, + "grad_norm": 1.0020692197104342, + "learning_rate": 4.2884258086335755e-06, + "loss": 0.6726, + "step": 473 + }, + { + "epoch": 1.1368547418967587, + "grad_norm": 0.9984867727924627, + "learning_rate": 4.268800045503102e-06, + "loss": 0.6972, + "step": 474 + }, + { + "epoch": 1.1392557022809124, + "grad_norm": 0.9177095305540808, + "learning_rate": 4.249185787328798e-06, + "loss": 0.6482, + "step": 475 + }, + { + "epoch": 1.141656662665066, + "grad_norm": 0.9030081661554377, + "learning_rate": 4.229583342728273e-06, + "loss": 0.6777, + "step": 476 + }, + { + "epoch": 1.1440576230492197, + "grad_norm": 0.8540231006160938, + "learning_rate": 4.209993020133251e-06, + "loss": 0.6577, + "step": 477 + }, + { + "epoch": 1.1464585834333734, + "grad_norm": 0.8097479049668175, + "learning_rate": 4.190415127784731e-06, + "loss": 0.7251, + "step": 478 + }, + { + "epoch": 1.148859543817527, + "grad_norm": 0.8970081520393531, + "learning_rate": 4.1708499737281305e-06, + "loss": 0.6796, + "step": 479 + }, + { + "epoch": 1.1512605042016806, + "grad_norm": 0.8654962875772526, + "learning_rate": 4.151297865808432e-06, + "loss": 0.7018, + "step": 480 + }, + { + "epoch": 1.1536614645858343, + "grad_norm": 0.9548312252503125, + "learning_rate": 4.131759111665349e-06, + "loss": 0.6414, + "step": 481 + }, + { + "epoch": 1.156062424969988, + "grad_norm": 0.9134696774056807, + "learning_rate": 4.1122340187284845e-06, + "loss": 0.6587, + "step": 482 + }, + { + "epoch": 1.1584633853541417, + "grad_norm": 1.047296786406303, + "learning_rate": 4.092722894212488e-06, + "loss": 0.6275, + "step": 483 + }, + { + "epoch": 1.1608643457382952, + "grad_norm": 1.3504571393502145, + "learning_rate": 4.073226045112225e-06, + "loss": 0.6679, + "step": 484 + }, + { + "epoch": 1.163265306122449, + "grad_norm": 0.8152705410339258, + "learning_rate": 4.053743778197951e-06, + "loss": 0.6142, + "step": 485 + }, + { + "epoch": 1.1656662665066027, + "grad_norm": 1.0444369910181297, + "learning_rate": 4.0342764000104715e-06, + "loss": 0.6897, + "step": 486 + }, + { + "epoch": 1.1680672268907564, + "grad_norm": 1.2835376928592126, + "learning_rate": 4.014824216856336e-06, + "loss": 0.6095, + "step": 487 + }, + { + "epoch": 1.1704681872749099, + "grad_norm": 0.9870823918768562, + "learning_rate": 3.995387534803006e-06, + "loss": 0.7001, + "step": 488 + }, + { + "epoch": 1.1728691476590636, + "grad_norm": 0.9161419881683542, + "learning_rate": 3.975966659674048e-06, + "loss": 0.6836, + "step": 489 + }, + { + "epoch": 1.1752701080432173, + "grad_norm": 0.8363382965648142, + "learning_rate": 3.956561897044306e-06, + "loss": 0.6305, + "step": 490 + }, + { + "epoch": 1.177671068427371, + "grad_norm": 0.9517408930013426, + "learning_rate": 3.937173552235117e-06, + "loss": 0.6359, + "step": 491 + }, + { + "epoch": 1.1800720288115247, + "grad_norm": 0.960064443656299, + "learning_rate": 3.917801930309486e-06, + "loss": 0.677, + "step": 492 + }, + { + "epoch": 1.1824729891956782, + "grad_norm": 1.0082757605439017, + "learning_rate": 3.898447336067297e-06, + "loss": 0.6813, + "step": 493 + }, + { + "epoch": 1.184873949579832, + "grad_norm": 1.270622611887631, + "learning_rate": 3.879110074040515e-06, + "loss": 0.6447, + "step": 494 + }, + { + "epoch": 1.1872749099639857, + "grad_norm": 0.9018388456061602, + "learning_rate": 3.859790448488394e-06, + "loss": 0.6143, + "step": 495 + }, + { + "epoch": 1.1896758703481392, + "grad_norm": 0.8687733757060934, + "learning_rate": 3.840488763392685e-06, + "loss": 0.6744, + "step": 496 + }, + { + "epoch": 1.1920768307322929, + "grad_norm": 1.0303761890291854, + "learning_rate": 3.821205322452863e-06, + "loss": 0.6109, + "step": 497 + }, + { + "epoch": 1.1944777911164466, + "grad_norm": 1.0063806235226265, + "learning_rate": 3.8019404290813456e-06, + "loss": 0.6678, + "step": 498 + }, + { + "epoch": 1.1968787515006003, + "grad_norm": 0.8611797107872082, + "learning_rate": 3.782694386398706e-06, + "loss": 0.6521, + "step": 499 + }, + { + "epoch": 1.199279711884754, + "grad_norm": 0.8940169915802657, + "learning_rate": 3.7634674972289227e-06, + "loss": 0.6709, + "step": 500 + }, + { + "epoch": 1.2016806722689075, + "grad_norm": 0.9657961801617352, + "learning_rate": 3.7442600640946045e-06, + "loss": 0.6212, + "step": 501 + }, + { + "epoch": 1.2040816326530612, + "grad_norm": 1.1496491496114696, + "learning_rate": 3.72507238921223e-06, + "loss": 0.6767, + "step": 502 + }, + { + "epoch": 1.206482593037215, + "grad_norm": 1.0842958983301503, + "learning_rate": 3.705904774487396e-06, + "loss": 0.6599, + "step": 503 + }, + { + "epoch": 1.2088835534213684, + "grad_norm": 1.0318159594309126, + "learning_rate": 3.686757521510068e-06, + "loss": 0.6881, + "step": 504 + }, + { + "epoch": 1.2112845138055222, + "grad_norm": 0.8003629139237494, + "learning_rate": 3.667630931549826e-06, + "loss": 0.6486, + "step": 505 + }, + { + "epoch": 1.2136854741896759, + "grad_norm": 1.0293538346323636, + "learning_rate": 3.648525305551136e-06, + "loss": 0.6552, + "step": 506 + }, + { + "epoch": 1.2160864345738296, + "grad_norm": 1.1816027470046149, + "learning_rate": 3.6294409441286133e-06, + "loss": 0.6239, + "step": 507 + }, + { + "epoch": 1.2184873949579833, + "grad_norm": 1.2274500784176232, + "learning_rate": 3.610378147562279e-06, + "loss": 0.6419, + "step": 508 + }, + { + "epoch": 1.2208883553421368, + "grad_norm": 0.879327609945451, + "learning_rate": 3.5913372157928515e-06, + "loss": 0.5977, + "step": 509 + }, + { + "epoch": 1.2232893157262905, + "grad_norm": 1.1499847415517088, + "learning_rate": 3.572318448417023e-06, + "loss": 0.6294, + "step": 510 + }, + { + "epoch": 1.2256902761104442, + "grad_norm": 0.9738966730369417, + "learning_rate": 3.553322144682737e-06, + "loss": 0.6488, + "step": 511 + }, + { + "epoch": 1.2280912364945977, + "grad_norm": 0.7975428195744296, + "learning_rate": 3.5343486034844897e-06, + "loss": 0.612, + "step": 512 + }, + { + "epoch": 1.2304921968787514, + "grad_norm": 0.8715338954561731, + "learning_rate": 3.5153981233586277e-06, + "loss": 0.6948, + "step": 513 + }, + { + "epoch": 1.2328931572629052, + "grad_norm": 0.999917456418209, + "learning_rate": 3.4964710024786354e-06, + "loss": 0.6684, + "step": 514 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.8967614161402051, + "learning_rate": 3.477567538650466e-06, + "loss": 0.6531, + "step": 515 + }, + { + "epoch": 1.2376950780312126, + "grad_norm": 1.2866563272030653, + "learning_rate": 3.458688029307839e-06, + "loss": 0.6038, + "step": 516 + }, + { + "epoch": 1.240096038415366, + "grad_norm": 1.2858647511107744, + "learning_rate": 3.439832771507565e-06, + "loss": 0.6571, + "step": 517 + }, + { + "epoch": 1.2424969987995198, + "grad_norm": 1.521910339081235, + "learning_rate": 3.4210020619248762e-06, + "loss": 0.6566, + "step": 518 + }, + { + "epoch": 1.2448979591836735, + "grad_norm": 8.941545760275048, + "learning_rate": 3.402196196848751e-06, + "loss": 0.6737, + "step": 519 + }, + { + "epoch": 1.247298919567827, + "grad_norm": 0.9956406367743204, + "learning_rate": 3.383415472177256e-06, + "loss": 0.6264, + "step": 520 + }, + { + "epoch": 1.2496998799519807, + "grad_norm": 0.8862696839179252, + "learning_rate": 3.3646601834128924e-06, + "loss": 0.6635, + "step": 521 + }, + { + "epoch": 1.2521008403361344, + "grad_norm": 0.9772838899127305, + "learning_rate": 3.3459306256579405e-06, + "loss": 0.6577, + "step": 522 + }, + { + "epoch": 1.2545018007202882, + "grad_norm": 0.9432621976883124, + "learning_rate": 3.3272270936098243e-06, + "loss": 0.6506, + "step": 523 + }, + { + "epoch": 1.2569027611044419, + "grad_norm": 1.9109659903891167, + "learning_rate": 3.3085498815564644e-06, + "loss": 0.6658, + "step": 524 + }, + { + "epoch": 1.2593037214885954, + "grad_norm": 0.8480535123546864, + "learning_rate": 3.289899283371657e-06, + "loss": 0.6229, + "step": 525 + }, + { + "epoch": 1.261704681872749, + "grad_norm": 0.9441953680609311, + "learning_rate": 3.2712755925104478e-06, + "loss": 0.6811, + "step": 526 + }, + { + "epoch": 1.2641056422569028, + "grad_norm": 0.8296489881948692, + "learning_rate": 3.252679102004509e-06, + "loss": 0.6777, + "step": 527 + }, + { + "epoch": 1.2665066026410563, + "grad_norm": 3.3056754185406376, + "learning_rate": 3.234110104457536e-06, + "loss": 0.6511, + "step": 528 + }, + { + "epoch": 1.26890756302521, + "grad_norm": 1.1292461057910061, + "learning_rate": 3.2155688920406415e-06, + "loss": 0.6316, + "step": 529 + }, + { + "epoch": 1.2713085234093637, + "grad_norm": 0.9500459227262518, + "learning_rate": 3.1970557564877524e-06, + "loss": 0.6055, + "step": 530 + }, + { + "epoch": 1.2737094837935174, + "grad_norm": 1.2673542521457937, + "learning_rate": 3.178570989091028e-06, + "loss": 0.6341, + "step": 531 + }, + { + "epoch": 1.2761104441776712, + "grad_norm": 0.9947728211445109, + "learning_rate": 3.1601148806962757e-06, + "loss": 0.6683, + "step": 532 + }, + { + "epoch": 1.2785114045618247, + "grad_norm": 0.9593776784610404, + "learning_rate": 3.141687721698363e-06, + "loss": 0.5981, + "step": 533 + }, + { + "epoch": 1.2809123649459784, + "grad_norm": 1.0033764100278546, + "learning_rate": 3.123289802036667e-06, + "loss": 0.6233, + "step": 534 + }, + { + "epoch": 1.283313325330132, + "grad_norm": 0.9047192571279472, + "learning_rate": 3.104921411190499e-06, + "loss": 0.6555, + "step": 535 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.9716764971679062, + "learning_rate": 3.0865828381745515e-06, + "loss": 0.709, + "step": 536 + }, + { + "epoch": 1.2881152460984393, + "grad_norm": 0.9776107500234306, + "learning_rate": 3.0682743715343565e-06, + "loss": 0.6639, + "step": 537 + }, + { + "epoch": 1.290516206482593, + "grad_norm": 1.2084524925872613, + "learning_rate": 3.049996299341742e-06, + "loss": 0.6664, + "step": 538 + }, + { + "epoch": 1.2929171668667467, + "grad_norm": 1.1629000749386234, + "learning_rate": 3.0317489091902936e-06, + "loss": 0.6515, + "step": 539 + }, + { + "epoch": 1.2953181272509005, + "grad_norm": 0.8738746819122408, + "learning_rate": 3.0135324881908405e-06, + "loss": 0.6378, + "step": 540 + }, + { + "epoch": 1.297719087635054, + "grad_norm": 0.9071468993169257, + "learning_rate": 2.995347322966933e-06, + "loss": 0.6271, + "step": 541 + }, + { + "epoch": 1.3001200480192077, + "grad_norm": 0.8527725942665851, + "learning_rate": 2.977193699650325e-06, + "loss": 0.6566, + "step": 542 + }, + { + "epoch": 1.3025210084033614, + "grad_norm": 1.821515983286086, + "learning_rate": 2.959071903876486e-06, + "loss": 0.6526, + "step": 543 + }, + { + "epoch": 1.3049219687875149, + "grad_norm": 2.6168941096292655, + "learning_rate": 2.9409822207800976e-06, + "loss": 0.6432, + "step": 544 + }, + { + "epoch": 1.3073229291716686, + "grad_norm": 1.1340068418135858, + "learning_rate": 2.9229249349905686e-06, + "loss": 0.6097, + "step": 545 + }, + { + "epoch": 1.3097238895558223, + "grad_norm": 1.065777330021333, + "learning_rate": 2.9049003306275575e-06, + "loss": 0.6513, + "step": 546 + }, + { + "epoch": 1.312124849939976, + "grad_norm": 0.9840631248620156, + "learning_rate": 2.886908691296504e-06, + "loss": 0.6957, + "step": 547 + }, + { + "epoch": 1.3145258103241297, + "grad_norm": 0.9275156432371783, + "learning_rate": 2.86895030008416e-06, + "loss": 0.6222, + "step": 548 + }, + { + "epoch": 1.3169267707082832, + "grad_norm": 1.0694369857718884, + "learning_rate": 2.851025439554142e-06, + "loss": 0.6292, + "step": 549 + }, + { + "epoch": 1.319327731092437, + "grad_norm": 0.9195088026501728, + "learning_rate": 2.8331343917424857e-06, + "loss": 0.6293, + "step": 550 + }, + { + "epoch": 1.3217286914765907, + "grad_norm": 1.0440622673055318, + "learning_rate": 2.8152774381532033e-06, + "loss": 0.6494, + "step": 551 + }, + { + "epoch": 1.3241296518607442, + "grad_norm": 0.9585224467319371, + "learning_rate": 2.797454859753853e-06, + "loss": 0.6377, + "step": 552 + }, + { + "epoch": 1.3265306122448979, + "grad_norm": 0.986356865601444, + "learning_rate": 2.7796669369711294e-06, + "loss": 0.6078, + "step": 553 + }, + { + "epoch": 1.3289315726290516, + "grad_norm": 0.8852031958630902, + "learning_rate": 2.761913949686438e-06, + "loss": 0.6663, + "step": 554 + }, + { + "epoch": 1.3313325330132053, + "grad_norm": 1.0194560710233276, + "learning_rate": 2.744196177231498e-06, + "loss": 0.6471, + "step": 555 + }, + { + "epoch": 1.333733493397359, + "grad_norm": 1.1092428908617034, + "learning_rate": 2.726513898383944e-06, + "loss": 0.6319, + "step": 556 + }, + { + "epoch": 1.3361344537815127, + "grad_norm": 1.046228165586162, + "learning_rate": 2.708867391362948e-06, + "loss": 0.6316, + "step": 557 + }, + { + "epoch": 1.3385354141656662, + "grad_norm": 0.9384421616577757, + "learning_rate": 2.6912569338248317e-06, + "loss": 0.6404, + "step": 558 + }, + { + "epoch": 1.34093637454982, + "grad_norm": 1.0979431829143493, + "learning_rate": 2.673682802858697e-06, + "loss": 0.6528, + "step": 559 + }, + { + "epoch": 1.3433373349339737, + "grad_norm": 0.8782235843379402, + "learning_rate": 2.656145274982081e-06, + "loss": 0.6455, + "step": 560 + }, + { + "epoch": 1.3457382953181272, + "grad_norm": 0.9821501158024878, + "learning_rate": 2.6386446261365874e-06, + "loss": 0.6829, + "step": 561 + }, + { + "epoch": 1.3481392557022809, + "grad_norm": 1.7793178730445558, + "learning_rate": 2.621181131683551e-06, + "loss": 0.7075, + "step": 562 + }, + { + "epoch": 1.3505402160864346, + "grad_norm": 1.1270971633767486, + "learning_rate": 2.603755066399718e-06, + "loss": 0.6341, + "step": 563 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 0.9286077170161989, + "learning_rate": 2.5863667044728924e-06, + "loss": 0.6825, + "step": 564 + }, + { + "epoch": 1.355342136854742, + "grad_norm": 0.7617565172187906, + "learning_rate": 2.5690163194976576e-06, + "loss": 0.5877, + "step": 565 + }, + { + "epoch": 1.3577430972388955, + "grad_norm": 0.9361260944487725, + "learning_rate": 2.5517041844710456e-06, + "loss": 0.6975, + "step": 566 + }, + { + "epoch": 1.3601440576230492, + "grad_norm": 1.0586142116171426, + "learning_rate": 2.5344305717882487e-06, + "loss": 0.6781, + "step": 567 + }, + { + "epoch": 1.362545018007203, + "grad_norm": 0.8572775669908461, + "learning_rate": 2.5171957532383453e-06, + "loss": 0.6484, + "step": 568 + }, + { + "epoch": 1.3649459783913565, + "grad_norm": 0.8072710536841113, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.6215, + "step": 569 + }, + { + "epoch": 1.3673469387755102, + "grad_norm": 1.1210751153626373, + "learning_rate": 2.4828435826372204e-06, + "loss": 0.6611, + "step": 570 + }, + { + "epoch": 1.3697478991596639, + "grad_norm": 0.9536882704959704, + "learning_rate": 2.465726771095086e-06, + "loss": 0.5999, + "step": 571 + }, + { + "epoch": 1.3721488595438176, + "grad_norm": 1.261934257875031, + "learning_rate": 2.448649834695503e-06, + "loss": 0.6723, + "step": 572 + }, + { + "epoch": 1.3745498199279713, + "grad_norm": 0.9371878075006924, + "learning_rate": 2.4316130421329696e-06, + "loss": 0.6731, + "step": 573 + }, + { + "epoch": 1.3769507803121248, + "grad_norm": 0.8608482332532424, + "learning_rate": 2.4146166614703454e-06, + "loss": 0.6823, + "step": 574 + }, + { + "epoch": 1.3793517406962785, + "grad_norm": 0.9664794593332436, + "learning_rate": 2.3976609601346395e-06, + "loss": 0.6453, + "step": 575 + }, + { + "epoch": 1.3817527010804322, + "grad_norm": 0.968355670405567, + "learning_rate": 2.3807462049127934e-06, + "loss": 0.6024, + "step": 576 + }, + { + "epoch": 1.3841536614645857, + "grad_norm": 2.444260082739265, + "learning_rate": 2.363872661947488e-06, + "loss": 0.6582, + "step": 577 + }, + { + "epoch": 1.3865546218487395, + "grad_norm": 0.8703597669538309, + "learning_rate": 2.3470405967329605e-06, + "loss": 0.6522, + "step": 578 + }, + { + "epoch": 1.3889555822328932, + "grad_norm": 0.9906523957085589, + "learning_rate": 2.33025027411082e-06, + "loss": 0.6417, + "step": 579 + }, + { + "epoch": 1.3913565426170469, + "grad_norm": 2.495843697923558, + "learning_rate": 2.3135019582658803e-06, + "loss": 0.6468, + "step": 580 + }, + { + "epoch": 1.3937575030012006, + "grad_norm": 1.062237654307874, + "learning_rate": 2.296795912722014e-06, + "loss": 0.7482, + "step": 581 + }, + { + "epoch": 1.396158463385354, + "grad_norm": 1.2408978869613996, + "learning_rate": 2.2801324003379854e-06, + "loss": 0.6637, + "step": 582 + }, + { + "epoch": 1.3985594237695078, + "grad_norm": 0.9095853615754681, + "learning_rate": 2.263511683303339e-06, + "loss": 0.6371, + "step": 583 + }, + { + "epoch": 1.4009603841536615, + "grad_norm": 0.9690759667373599, + "learning_rate": 2.246934023134257e-06, + "loss": 0.6622, + "step": 584 + }, + { + "epoch": 1.403361344537815, + "grad_norm": 0.9341036768906378, + "learning_rate": 2.230399680669449e-06, + "loss": 0.6833, + "step": 585 + }, + { + "epoch": 1.4057623049219687, + "grad_norm": 0.8871623641888806, + "learning_rate": 2.213908916066052e-06, + "loss": 0.6327, + "step": 586 + }, + { + "epoch": 1.4081632653061225, + "grad_norm": 0.8972465832232572, + "learning_rate": 2.1974619887955294e-06, + "loss": 0.6534, + "step": 587 + }, + { + "epoch": 1.4105642256902762, + "grad_norm": 1.9887286961869182, + "learning_rate": 2.1810591576395985e-06, + "loss": 0.6357, + "step": 588 + }, + { + "epoch": 1.41296518607443, + "grad_norm": 0.899752751900282, + "learning_rate": 2.1647006806861472e-06, + "loss": 0.6344, + "step": 589 + }, + { + "epoch": 1.4153661464585834, + "grad_norm": 2.3738841741244374, + "learning_rate": 2.148386815325179e-06, + "loss": 0.6297, + "step": 590 + }, + { + "epoch": 1.417767106842737, + "grad_norm": 1.1780835394591138, + "learning_rate": 2.132117818244771e-06, + "loss": 0.6695, + "step": 591 + }, + { + "epoch": 1.4201680672268908, + "grad_norm": 0.940256461882646, + "learning_rate": 2.1158939454270138e-06, + "loss": 0.6213, + "step": 592 + }, + { + "epoch": 1.4225690276110443, + "grad_norm": 1.1302731548409901, + "learning_rate": 2.09971545214401e-06, + "loss": 0.6848, + "step": 593 + }, + { + "epoch": 1.424969987995198, + "grad_norm": 0.8890669967126053, + "learning_rate": 2.083582592953837e-06, + "loss": 0.6168, + "step": 594 + }, + { + "epoch": 1.4273709483793517, + "grad_norm": 0.7964330127150799, + "learning_rate": 2.0674956216965484e-06, + "loss": 0.63, + "step": 595 + }, + { + "epoch": 1.4297719087635055, + "grad_norm": 1.0716409115935557, + "learning_rate": 2.05145479149019e-06, + "loss": 0.6524, + "step": 596 + }, + { + "epoch": 1.4321728691476592, + "grad_norm": 1.0124952323959022, + "learning_rate": 2.0354603547267985e-06, + "loss": 0.6721, + "step": 597 + }, + { + "epoch": 1.4345738295318127, + "grad_norm": 0.977022328573683, + "learning_rate": 2.019512563068443e-06, + "loss": 0.6292, + "step": 598 + }, + { + "epoch": 1.4369747899159664, + "grad_norm": 1.1217389949243721, + "learning_rate": 2.0036116674432653e-06, + "loss": 0.6326, + "step": 599 + }, + { + "epoch": 1.43937575030012, + "grad_norm": 0.8774778075409905, + "learning_rate": 1.9877579180415252e-06, + "loss": 0.6111, + "step": 600 + }, + { + "epoch": 1.4417767106842736, + "grad_norm": 0.9297967785330995, + "learning_rate": 1.971951564311668e-06, + "loss": 0.6636, + "step": 601 + }, + { + "epoch": 1.4441776710684273, + "grad_norm": 1.1183920012608857, + "learning_rate": 1.956192854956397e-06, + "loss": 0.7022, + "step": 602 + }, + { + "epoch": 1.446578631452581, + "grad_norm": 0.8658496834568494, + "learning_rate": 1.9404820379287677e-06, + "loss": 0.6807, + "step": 603 + }, + { + "epoch": 1.4489795918367347, + "grad_norm": 0.9111194634371569, + "learning_rate": 1.924819360428276e-06, + "loss": 0.6499, + "step": 604 + }, + { + "epoch": 1.4513805522208885, + "grad_norm": 0.8210926504989612, + "learning_rate": 1.9092050688969736e-06, + "loss": 0.5997, + "step": 605 + }, + { + "epoch": 1.453781512605042, + "grad_norm": 0.9508028705856567, + "learning_rate": 1.8936394090155952e-06, + "loss": 0.6234, + "step": 606 + }, + { + "epoch": 1.4561824729891957, + "grad_norm": 0.8253729688748952, + "learning_rate": 1.8781226256996838e-06, + "loss": 0.6343, + "step": 607 + }, + { + "epoch": 1.4585834333733494, + "grad_norm": 1.1301843863674312, + "learning_rate": 1.8626549630957397e-06, + "loss": 0.6384, + "step": 608 + }, + { + "epoch": 1.4609843937575029, + "grad_norm": 1.005290256520289, + "learning_rate": 1.8472366645773892e-06, + "loss": 0.6391, + "step": 609 + }, + { + "epoch": 1.4633853541416566, + "grad_norm": 1.1339132680403525, + "learning_rate": 1.831867972741534e-06, + "loss": 0.7299, + "step": 610 + }, + { + "epoch": 1.4657863145258103, + "grad_norm": 1.6091153520949204, + "learning_rate": 1.8165491294045596e-06, + "loss": 0.6607, + "step": 611 + }, + { + "epoch": 1.468187274909964, + "grad_norm": 0.8949044081385531, + "learning_rate": 1.8012803755985098e-06, + "loss": 0.6645, + "step": 612 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 1.0476157250288862, + "learning_rate": 1.7860619515673034e-06, + "loss": 0.6667, + "step": 613 + }, + { + "epoch": 1.4729891956782712, + "grad_norm": 1.0829061637937194, + "learning_rate": 1.7708940967629567e-06, + "loss": 0.6317, + "step": 614 + }, + { + "epoch": 1.475390156062425, + "grad_norm": 0.9370432461748385, + "learning_rate": 1.7557770498418087e-06, + "loss": 0.6737, + "step": 615 + }, + { + "epoch": 1.4777911164465787, + "grad_norm": 1.0882844485223084, + "learning_rate": 1.740711048660767e-06, + "loss": 0.6478, + "step": 616 + }, + { + "epoch": 1.4801920768307322, + "grad_norm": 8.920120491127792, + "learning_rate": 1.7256963302735752e-06, + "loss": 0.6864, + "step": 617 + }, + { + "epoch": 1.482593037214886, + "grad_norm": 0.9549261538889966, + "learning_rate": 1.7107331309270686e-06, + "loss": 0.651, + "step": 618 + }, + { + "epoch": 1.4849939975990396, + "grad_norm": 2.261202226088013, + "learning_rate": 1.6958216860574661e-06, + "loss": 0.6273, + "step": 619 + }, + { + "epoch": 1.4873949579831933, + "grad_norm": 0.930837304019007, + "learning_rate": 1.6809622302866623e-06, + "loss": 0.6645, + "step": 620 + }, + { + "epoch": 1.489795918367347, + "grad_norm": 1.216556634979396, + "learning_rate": 1.6661549974185426e-06, + "loss": 0.6767, + "step": 621 + }, + { + "epoch": 1.4921968787515005, + "grad_norm": 0.8558885688663144, + "learning_rate": 1.6514002204352919e-06, + "loss": 0.5963, + "step": 622 + }, + { + "epoch": 1.4945978391356542, + "grad_norm": 0.8947167354414174, + "learning_rate": 1.6366981314937374e-06, + "loss": 0.6748, + "step": 623 + }, + { + "epoch": 1.496998799519808, + "grad_norm": 0.9245974493657283, + "learning_rate": 1.6220489619216988e-06, + "loss": 0.6465, + "step": 624 + }, + { + "epoch": 1.4993997599039615, + "grad_norm": 0.7421955754193266, + "learning_rate": 1.6074529422143398e-06, + "loss": 0.6198, + "step": 625 + }, + { + "epoch": 1.5018007202881152, + "grad_norm": 1.0226755510539118, + "learning_rate": 1.5929103020305441e-06, + "loss": 0.6737, + "step": 626 + }, + { + "epoch": 1.504201680672269, + "grad_norm": 0.9001021170854256, + "learning_rate": 1.5784212701893088e-06, + "loss": 0.6197, + "step": 627 + }, + { + "epoch": 1.5066026410564226, + "grad_norm": 1.0389230808512446, + "learning_rate": 1.5639860746661339e-06, + "loss": 0.6617, + "step": 628 + }, + { + "epoch": 1.5090036014405763, + "grad_norm": 0.9388777166315837, + "learning_rate": 1.549604942589441e-06, + "loss": 0.6912, + "step": 629 + }, + { + "epoch": 1.51140456182473, + "grad_norm": 0.9751733282640802, + "learning_rate": 1.5352781002369976e-06, + "loss": 0.6452, + "step": 630 + }, + { + "epoch": 1.5138055222088835, + "grad_norm": 0.7877825500209034, + "learning_rate": 1.521005773032362e-06, + "loss": 0.6328, + "step": 631 + }, + { + "epoch": 1.5162064825930373, + "grad_norm": 1.0048986074663675, + "learning_rate": 1.5067881855413275e-06, + "loss": 0.6717, + "step": 632 + }, + { + "epoch": 1.5186074429771907, + "grad_norm": 1.0356464031510044, + "learning_rate": 1.4926255614683931e-06, + "loss": 0.6301, + "step": 633 + }, + { + "epoch": 1.5210084033613445, + "grad_norm": 1.1045877070154857, + "learning_rate": 1.4785181236532514e-06, + "loss": 0.6495, + "step": 634 + }, + { + "epoch": 1.5234093637454982, + "grad_norm": 1.0326108777946468, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.6601, + "step": 635 + }, + { + "epoch": 1.525810324129652, + "grad_norm": 0.8225636887377143, + "learning_rate": 1.450469693809986e-06, + "loss": 0.6578, + "step": 636 + }, + { + "epoch": 1.5282112845138056, + "grad_norm": 1.0518805255531551, + "learning_rate": 1.4365291431056871e-06, + "loss": 0.6113, + "step": 637 + }, + { + "epoch": 1.5306122448979593, + "grad_norm": 2.5860144771962066, + "learning_rate": 1.4226446612998673e-06, + "loss": 0.6298, + "step": 638 + }, + { + "epoch": 1.5330132052821128, + "grad_norm": 1.111493995682213, + "learning_rate": 1.4088164668558302e-06, + "loss": 0.615, + "step": 639 + }, + { + "epoch": 1.5354141656662665, + "grad_norm": 0.9342207765376362, + "learning_rate": 1.39504477735123e-06, + "loss": 0.6636, + "step": 640 + }, + { + "epoch": 1.53781512605042, + "grad_norm": 0.9206445859863835, + "learning_rate": 1.3813298094746491e-06, + "loss": 0.6278, + "step": 641 + }, + { + "epoch": 1.5402160864345738, + "grad_norm": 1.0324765311389028, + "learning_rate": 1.3676717790222e-06, + "loss": 0.64, + "step": 642 + }, + { + "epoch": 1.5426170468187275, + "grad_norm": 1.225786679799285, + "learning_rate": 1.354070900894115e-06, + "loss": 0.6522, + "step": 643 + }, + { + "epoch": 1.5450180072028812, + "grad_norm": 0.9687208223950486, + "learning_rate": 1.340527389091374e-06, + "loss": 0.6247, + "step": 644 + }, + { + "epoch": 1.547418967587035, + "grad_norm": 0.8339357278892652, + "learning_rate": 1.3270414567123342e-06, + "loss": 0.6777, + "step": 645 + }, + { + "epoch": 1.5498199279711886, + "grad_norm": 0.9391801557957058, + "learning_rate": 1.3136133159493803e-06, + "loss": 0.7764, + "step": 646 + }, + { + "epoch": 1.552220888355342, + "grad_norm": 1.1081348736358358, + "learning_rate": 1.3002431780855817e-06, + "loss": 0.6648, + "step": 647 + }, + { + "epoch": 1.5546218487394958, + "grad_norm": 1.2909105123595979, + "learning_rate": 1.2869312534913686e-06, + "loss": 0.6731, + "step": 648 + }, + { + "epoch": 1.5570228091236493, + "grad_norm": 2.0962675285135908, + "learning_rate": 1.2736777516212267e-06, + "loss": 0.713, + "step": 649 + }, + { + "epoch": 1.559423769507803, + "grad_norm": 0.9514467073135442, + "learning_rate": 1.260482881010396e-06, + "loss": 0.5859, + "step": 650 + }, + { + "epoch": 1.5618247298919568, + "grad_norm": 0.8913685423120479, + "learning_rate": 1.2473468492715896e-06, + "loss": 0.6406, + "step": 651 + }, + { + "epoch": 1.5642256902761105, + "grad_norm": 0.9198640943911467, + "learning_rate": 1.2342698630917337e-06, + "loss": 0.6289, + "step": 652 + }, + { + "epoch": 1.5666266506602642, + "grad_norm": 0.9220782597157949, + "learning_rate": 1.2212521282287093e-06, + "loss": 0.6582, + "step": 653 + }, + { + "epoch": 1.569027611044418, + "grad_norm": 1.0495774693223259, + "learning_rate": 1.2082938495081143e-06, + "loss": 0.6248, + "step": 654 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 1.7192639331315858, + "learning_rate": 1.1953952308200428e-06, + "loss": 0.6172, + "step": 655 + }, + { + "epoch": 1.5738295318127251, + "grad_norm": 0.9156640691230766, + "learning_rate": 1.1825564751158825e-06, + "loss": 0.6684, + "step": 656 + }, + { + "epoch": 1.5762304921968786, + "grad_norm": 0.8912057501633746, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.6742, + "step": 657 + }, + { + "epoch": 1.5786314525810323, + "grad_norm": 0.9486921526679506, + "learning_rate": 1.1570593597521201e-06, + "loss": 0.6114, + "step": 658 + }, + { + "epoch": 1.581032412965186, + "grad_norm": 0.9544253927956883, + "learning_rate": 1.144401401273062e-06, + "loss": 0.6097, + "step": 659 + }, + { + "epoch": 1.5834333733493398, + "grad_norm": 1.3504169424667136, + "learning_rate": 1.1318041081326868e-06, + "loss": 0.6328, + "step": 660 + }, + { + "epoch": 1.5858343337334935, + "grad_norm": 0.8965813935254673, + "learning_rate": 1.1192676785412154e-06, + "loss": 0.6642, + "step": 661 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 1.0133574586256462, + "learning_rate": 1.1067923097512256e-06, + "loss": 0.6539, + "step": 662 + }, + { + "epoch": 1.5906362545018007, + "grad_norm": 0.9508952698284318, + "learning_rate": 1.0943781980545332e-06, + "loss": 0.6681, + "step": 663 + }, + { + "epoch": 1.5930372148859544, + "grad_norm": 1.472278929509015, + "learning_rate": 1.082025538779124e-06, + "loss": 0.6957, + "step": 664 + }, + { + "epoch": 1.595438175270108, + "grad_norm": 1.030931088954498, + "learning_rate": 1.0697345262860638e-06, + "loss": 0.6383, + "step": 665 + }, + { + "epoch": 1.5978391356542616, + "grad_norm": 1.1342385358407325, + "learning_rate": 1.0575053539664465e-06, + "loss": 0.616, + "step": 666 + }, + { + "epoch": 1.6002400960384153, + "grad_norm": 0.9671979722590744, + "learning_rate": 1.0453382142383545e-06, + "loss": 0.675, + "step": 667 + }, + { + "epoch": 1.602641056422569, + "grad_norm": 0.9287771382738991, + "learning_rate": 1.0332332985438248e-06, + "loss": 0.6366, + "step": 668 + }, + { + "epoch": 1.6050420168067228, + "grad_norm": 0.9472315620503476, + "learning_rate": 1.0211907973458391e-06, + "loss": 0.6264, + "step": 669 + }, + { + "epoch": 1.6074429771908765, + "grad_norm": 1.087340362754727, + "learning_rate": 1.0092109001253314e-06, + "loss": 0.6398, + "step": 670 + }, + { + "epoch": 1.60984393757503, + "grad_norm": 0.8115527735674969, + "learning_rate": 9.972937953781985e-07, + "loss": 0.6273, + "step": 671 + }, + { + "epoch": 1.6122448979591837, + "grad_norm": 0.9337618001262841, + "learning_rate": 9.85439670612341e-07, + "loss": 0.6363, + "step": 672 + }, + { + "epoch": 1.6146458583433372, + "grad_norm": 1.5207042640095867, + "learning_rate": 9.73648712344707e-07, + "loss": 0.6502, + "step": 673 + }, + { + "epoch": 1.617046818727491, + "grad_norm": 1.085665654159603, + "learning_rate": 9.619211060983674e-07, + "loss": 0.6889, + "step": 674 + }, + { + "epoch": 1.6194477791116446, + "grad_norm": 0.9586829488001999, + "learning_rate": 9.502570363995839e-07, + "loss": 0.7141, + "step": 675 + }, + { + "epoch": 1.6218487394957983, + "grad_norm": 1.3208377530379118, + "learning_rate": 9.386566867749131e-07, + "loss": 0.6788, + "step": 676 + }, + { + "epoch": 1.624249699879952, + "grad_norm": 0.9717132751178416, + "learning_rate": 9.271202397483214e-07, + "loss": 0.6222, + "step": 677 + }, + { + "epoch": 1.6266506602641058, + "grad_norm": 0.8691308355211956, + "learning_rate": 9.15647876838306e-07, + "loss": 0.643, + "step": 678 + }, + { + "epoch": 1.6290516206482593, + "grad_norm": 0.9820836736882421, + "learning_rate": 9.042397785550405e-07, + "loss": 0.681, + "step": 679 + }, + { + "epoch": 1.631452581032413, + "grad_norm": 1.0360078246750115, + "learning_rate": 8.928961243975437e-07, + "loss": 0.628, + "step": 680 + }, + { + "epoch": 1.6338535414165665, + "grad_norm": 1.0282198489236734, + "learning_rate": 8.816170928508367e-07, + "loss": 0.6961, + "step": 681 + }, + { + "epoch": 1.6362545018007202, + "grad_norm": 1.4419881007160427, + "learning_rate": 8.704028613831561e-07, + "loss": 0.657, + "step": 682 + }, + { + "epoch": 1.638655462184874, + "grad_norm": 0.9684178540852127, + "learning_rate": 8.592536064431467e-07, + "loss": 0.6684, + "step": 683 + }, + { + "epoch": 1.6410564225690276, + "grad_norm": 0.8737370302144639, + "learning_rate": 8.481695034570892e-07, + "loss": 0.6826, + "step": 684 + }, + { + "epoch": 1.6434573829531813, + "grad_norm": 1.9051074779808914, + "learning_rate": 8.371507268261436e-07, + "loss": 0.6422, + "step": 685 + }, + { + "epoch": 1.645858343337335, + "grad_norm": 5.528134895709504, + "learning_rate": 8.261974499235987e-07, + "loss": 0.6956, + "step": 686 + }, + { + "epoch": 1.6482593037214885, + "grad_norm": 0.9288503984193149, + "learning_rate": 8.153098450921514e-07, + "loss": 0.6465, + "step": 687 + }, + { + "epoch": 1.6506602641056423, + "grad_norm": 0.8890540558215465, + "learning_rate": 8.044880836411889e-07, + "loss": 0.6786, + "step": 688 + }, + { + "epoch": 1.6530612244897958, + "grad_norm": 1.0942079760094963, + "learning_rate": 7.937323358440935e-07, + "loss": 0.6615, + "step": 689 + }, + { + "epoch": 1.6554621848739495, + "grad_norm": 1.3854108815443495, + "learning_rate": 7.830427709355726e-07, + "loss": 0.6713, + "step": 690 + }, + { + "epoch": 1.6578631452581032, + "grad_norm": 1.8458381248660314, + "learning_rate": 7.724195571089787e-07, + "loss": 0.666, + "step": 691 + }, + { + "epoch": 1.660264105642257, + "grad_norm": 0.9665077364422517, + "learning_rate": 7.618628615136825e-07, + "loss": 0.6585, + "step": 692 + }, + { + "epoch": 1.6626650660264106, + "grad_norm": 1.2255421786519085, + "learning_rate": 7.513728502524286e-07, + "loss": 0.6579, + "step": 693 + }, + { + "epoch": 1.6650660264105643, + "grad_norm": 1.0829000788514598, + "learning_rate": 7.409496883787276e-07, + "loss": 0.6705, + "step": 694 + }, + { + "epoch": 1.667466986794718, + "grad_norm": 1.224215492415608, + "learning_rate": 7.305935398942598e-07, + "loss": 0.6362, + "step": 695 + }, + { + "epoch": 1.6698679471788715, + "grad_norm": 1.00281395524098, + "learning_rate": 7.203045677462922e-07, + "loss": 0.6416, + "step": 696 + }, + { + "epoch": 1.6722689075630253, + "grad_norm": 0.8844080649716427, + "learning_rate": 7.100829338251147e-07, + "loss": 0.6706, + "step": 697 + }, + { + "epoch": 1.6746698679471788, + "grad_norm": 1.3784393546774785, + "learning_rate": 6.999287989614972e-07, + "loss": 0.6008, + "step": 698 + }, + { + "epoch": 1.6770708283313325, + "grad_norm": 1.2142211468312185, + "learning_rate": 6.898423229241535e-07, + "loss": 0.617, + "step": 699 + }, + { + "epoch": 1.6794717887154862, + "grad_norm": 0.8926473520136611, + "learning_rate": 6.798236644172301e-07, + "loss": 0.6651, + "step": 700 + }, + { + "epoch": 1.68187274909964, + "grad_norm": 0.7761396023419351, + "learning_rate": 6.698729810778065e-07, + "loss": 0.6587, + "step": 701 + }, + { + "epoch": 1.6842737094837936, + "grad_norm": 0.976405931900423, + "learning_rate": 6.599904294734227e-07, + "loss": 0.6085, + "step": 702 + }, + { + "epoch": 1.6866746698679473, + "grad_norm": 0.9389803284730526, + "learning_rate": 6.501761650996052e-07, + "loss": 0.6295, + "step": 703 + }, + { + "epoch": 1.6890756302521008, + "grad_norm": 1.210248872780046, + "learning_rate": 6.404303423774261e-07, + "loss": 0.665, + "step": 704 + }, + { + "epoch": 1.6914765906362546, + "grad_norm": 0.953540162188805, + "learning_rate": 6.307531146510754e-07, + "loss": 0.6996, + "step": 705 + }, + { + "epoch": 1.693877551020408, + "grad_norm": 0.9906405098922356, + "learning_rate": 6.211446341854415e-07, + "loss": 0.6568, + "step": 706 + }, + { + "epoch": 1.6962785114045618, + "grad_norm": 1.0790486562130543, + "learning_rate": 6.116050521637218e-07, + "loss": 0.6314, + "step": 707 + }, + { + "epoch": 1.6986794717887155, + "grad_norm": 0.8749360312510759, + "learning_rate": 6.021345186850419e-07, + "loss": 0.6679, + "step": 708 + }, + { + "epoch": 1.7010804321728692, + "grad_norm": 0.932641296809523, + "learning_rate": 5.927331827620902e-07, + "loss": 0.6049, + "step": 709 + }, + { + "epoch": 1.703481392557023, + "grad_norm": 1.501148013771617, + "learning_rate": 5.834011923187804e-07, + "loss": 0.7124, + "step": 710 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 1.0232382065365628, + "learning_rate": 5.741386941879179e-07, + "loss": 0.6182, + "step": 711 + }, + { + "epoch": 1.7082833133253301, + "grad_norm": 0.8493888976027031, + "learning_rate": 5.649458341088915e-07, + "loss": 0.6098, + "step": 712 + }, + { + "epoch": 1.7106842737094838, + "grad_norm": 1.0287417261052827, + "learning_rate": 5.558227567253832e-07, + "loss": 0.6485, + "step": 713 + }, + { + "epoch": 1.7130852340936373, + "grad_norm": 0.9824165450245743, + "learning_rate": 5.467696055830862e-07, + "loss": 0.6442, + "step": 714 + }, + { + "epoch": 1.715486194477791, + "grad_norm": 0.9840054188214199, + "learning_rate": 5.377865231274504e-07, + "loss": 0.6555, + "step": 715 + }, + { + "epoch": 1.7178871548619448, + "grad_norm": 0.9220257833526645, + "learning_rate": 5.288736507014436e-07, + "loss": 0.6615, + "step": 716 + }, + { + "epoch": 1.7202881152460985, + "grad_norm": 1.1631333915971627, + "learning_rate": 5.200311285433213e-07, + "loss": 0.6667, + "step": 717 + }, + { + "epoch": 1.7226890756302522, + "grad_norm": 0.8429719875570699, + "learning_rate": 5.112590957844233e-07, + "loss": 0.6566, + "step": 718 + }, + { + "epoch": 1.725090036014406, + "grad_norm": 1.0799558152581747, + "learning_rate": 5.025576904469842e-07, + "loss": 0.6693, + "step": 719 + }, + { + "epoch": 1.7274909963985594, + "grad_norm": 1.2210942148405586, + "learning_rate": 4.939270494419657e-07, + "loss": 0.6541, + "step": 720 + }, + { + "epoch": 1.7298919567827131, + "grad_norm": 0.9590257367356989, + "learning_rate": 4.853673085668947e-07, + "loss": 0.6431, + "step": 721 + }, + { + "epoch": 1.7322929171668666, + "grad_norm": 1.7775283513181313, + "learning_rate": 4.768786025037308e-07, + "loss": 0.6626, + "step": 722 + }, + { + "epoch": 1.7346938775510203, + "grad_norm": 1.1344818287215155, + "learning_rate": 4.6846106481675035e-07, + "loss": 0.7145, + "step": 723 + }, + { + "epoch": 1.737094837935174, + "grad_norm": 0.8004575138198893, + "learning_rate": 4.601148279504386e-07, + "loss": 0.6703, + "step": 724 + }, + { + "epoch": 1.7394957983193278, + "grad_norm": 1.0048669346768608, + "learning_rate": 4.5184002322740784e-07, + "loss": 0.6605, + "step": 725 + }, + { + "epoch": 1.7418967587034815, + "grad_norm": 1.7963702152633332, + "learning_rate": 4.43636780846336e-07, + "loss": 0.6642, + "step": 726 + }, + { + "epoch": 1.7442977190876352, + "grad_norm": 0.8675050774193321, + "learning_rate": 4.355052298799112e-07, + "loss": 0.6212, + "step": 727 + }, + { + "epoch": 1.7466986794717887, + "grad_norm": 1.1240940409157105, + "learning_rate": 4.274454982728032e-07, + "loss": 0.6624, + "step": 728 + }, + { + "epoch": 1.7490996398559424, + "grad_norm": 0.8849118229552638, + "learning_rate": 4.194577128396521e-07, + "loss": 0.6643, + "step": 729 + }, + { + "epoch": 1.751500600240096, + "grad_norm": 0.9481416953837549, + "learning_rate": 4.11541999263072e-07, + "loss": 0.6642, + "step": 730 + }, + { + "epoch": 1.7539015606242496, + "grad_norm": 0.8446166604341134, + "learning_rate": 4.036984820916723e-07, + "loss": 0.6813, + "step": 731 + }, + { + "epoch": 1.7563025210084033, + "grad_norm": 0.9415777016276731, + "learning_rate": 3.959272847380985e-07, + "loss": 0.6586, + "step": 732 + }, + { + "epoch": 1.758703481392557, + "grad_norm": 3.16162065014156, + "learning_rate": 3.882285294770938e-07, + "loss": 0.671, + "step": 733 + }, + { + "epoch": 1.7611044417767108, + "grad_norm": 0.9046402526532606, + "learning_rate": 3.8060233744356634e-07, + "loss": 0.6472, + "step": 734 + }, + { + "epoch": 1.7635054021608645, + "grad_norm": 1.274276291255798, + "learning_rate": 3.730488286306944e-07, + "loss": 0.6879, + "step": 735 + }, + { + "epoch": 1.765906362545018, + "grad_norm": 0.9266738982390647, + "learning_rate": 3.655681218880325e-07, + "loss": 0.7006, + "step": 736 + }, + { + "epoch": 1.7683073229291717, + "grad_norm": 1.1298270256741265, + "learning_rate": 3.581603349196372e-07, + "loss": 0.6302, + "step": 737 + }, + { + "epoch": 1.7707082833133252, + "grad_norm": 1.4768285811131943, + "learning_rate": 3.5082558428222555e-07, + "loss": 0.6344, + "step": 738 + }, + { + "epoch": 1.773109243697479, + "grad_norm": 1.03574555828318, + "learning_rate": 3.435639853833317e-07, + "loss": 0.6694, + "step": 739 + }, + { + "epoch": 1.7755102040816326, + "grad_norm": 0.9168661600940188, + "learning_rate": 3.363756524794959e-07, + "loss": 0.6907, + "step": 740 + }, + { + "epoch": 1.7779111644657863, + "grad_norm": 1.212035933698864, + "learning_rate": 3.2926069867446673e-07, + "loss": 0.6557, + "step": 741 + }, + { + "epoch": 1.78031212484994, + "grad_norm": 1.0068074581038595, + "learning_rate": 3.222192359174181e-07, + "loss": 0.6407, + "step": 742 + }, + { + "epoch": 1.7827130852340938, + "grad_norm": 1.048679687746833, + "learning_rate": 3.1525137500119207e-07, + "loss": 0.6423, + "step": 743 + }, + { + "epoch": 1.7851140456182473, + "grad_norm": 0.9656449483604257, + "learning_rate": 3.0835722556055103e-07, + "loss": 0.6629, + "step": 744 + }, + { + "epoch": 1.787515006002401, + "grad_norm": 1.428088163085122, + "learning_rate": 3.015368960704584e-07, + "loss": 0.6633, + "step": 745 + }, + { + "epoch": 1.7899159663865545, + "grad_norm": 0.9064014873180054, + "learning_rate": 2.947904938443663e-07, + "loss": 0.7047, + "step": 746 + }, + { + "epoch": 1.7923169267707082, + "grad_norm": 0.9167347266961742, + "learning_rate": 2.881181250325299e-07, + "loss": 0.6324, + "step": 747 + }, + { + "epoch": 1.794717887154862, + "grad_norm": 0.9791642332462065, + "learning_rate": 2.815198946203379e-07, + "loss": 0.6573, + "step": 748 + }, + { + "epoch": 1.7971188475390156, + "grad_norm": 1.7966695794490903, + "learning_rate": 2.7499590642665773e-07, + "loss": 0.6361, + "step": 749 + }, + { + "epoch": 1.7995198079231693, + "grad_norm": 1.0061714167779683, + "learning_rate": 2.68546263102204e-07, + "loss": 0.6569, + "step": 750 + }, + { + "epoch": 1.801920768307323, + "grad_norm": 0.929889297638995, + "learning_rate": 2.621710661279253e-07, + "loss": 0.6482, + "step": 751 + }, + { + "epoch": 1.8043217286914766, + "grad_norm": 1.447901958483443, + "learning_rate": 2.5587041581340235e-07, + "loss": 0.6727, + "step": 752 + }, + { + "epoch": 1.8067226890756303, + "grad_norm": 1.281764423132765, + "learning_rate": 2.4964441129527337e-07, + "loss": 0.637, + "step": 753 + }, + { + "epoch": 1.8091236494597838, + "grad_norm": 1.19823353128569, + "learning_rate": 2.4349315053567466e-07, + "loss": 0.6666, + "step": 754 + }, + { + "epoch": 1.8115246098439375, + "grad_norm": 0.8837247280505467, + "learning_rate": 2.3741673032069757e-07, + "loss": 0.6884, + "step": 755 + }, + { + "epoch": 1.8139255702280912, + "grad_norm": 3.3673813583616514, + "learning_rate": 2.314152462588659e-07, + "loss": 0.6197, + "step": 756 + }, + { + "epoch": 1.816326530612245, + "grad_norm": 0.8669864527827419, + "learning_rate": 2.2548879277963065e-07, + "loss": 0.6754, + "step": 757 + }, + { + "epoch": 1.8187274909963986, + "grad_norm": 0.8557759357976468, + "learning_rate": 2.1963746313188762e-07, + "loss": 0.6565, + "step": 758 + }, + { + "epoch": 1.8211284513805523, + "grad_norm": 0.9835493095940567, + "learning_rate": 2.1386134938250645e-07, + "loss": 0.7246, + "step": 759 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 0.9486375192086429, + "learning_rate": 2.0816054241488358e-07, + "loss": 0.6464, + "step": 760 + }, + { + "epoch": 1.8259303721488596, + "grad_norm": 0.9939355968419499, + "learning_rate": 2.0253513192751374e-07, + "loss": 0.6784, + "step": 761 + }, + { + "epoch": 1.828331332533013, + "grad_norm": 2.5411064037190214, + "learning_rate": 1.9698520643257334e-07, + "loss": 0.6171, + "step": 762 + }, + { + "epoch": 1.8307322929171668, + "grad_norm": 0.8611234302478261, + "learning_rate": 1.9151085325453512e-07, + "loss": 0.6915, + "step": 763 + }, + { + "epoch": 1.8331332533013205, + "grad_norm": 0.8053015612745882, + "learning_rate": 1.8611215852879005e-07, + "loss": 0.6687, + "step": 764 + }, + { + "epoch": 1.8355342136854742, + "grad_norm": 0.9521324815935206, + "learning_rate": 1.807892072002898e-07, + "loss": 0.6657, + "step": 765 + }, + { + "epoch": 1.837935174069628, + "grad_norm": 1.094466055359716, + "learning_rate": 1.7554208302221654e-07, + "loss": 0.7215, + "step": 766 + }, + { + "epoch": 1.8403361344537816, + "grad_norm": 0.9098193968920033, + "learning_rate": 1.7037086855465902e-07, + "loss": 0.6497, + "step": 767 + }, + { + "epoch": 1.8427370948379351, + "grad_norm": 0.855181562781992, + "learning_rate": 1.652756451633164e-07, + "loss": 0.7093, + "step": 768 + }, + { + "epoch": 1.8451380552220888, + "grad_norm": 0.8565693947802443, + "learning_rate": 1.6025649301821877e-07, + "loss": 0.6503, + "step": 769 + }, + { + "epoch": 1.8475390156062423, + "grad_norm": 0.9679413436421161, + "learning_rate": 1.5531349109246364e-07, + "loss": 0.7123, + "step": 770 + }, + { + "epoch": 1.849939975990396, + "grad_norm": 1.136073654492355, + "learning_rate": 1.5044671716097414e-07, + "loss": 0.6521, + "step": 771 + }, + { + "epoch": 1.8523409363745498, + "grad_norm": 0.7992798585720682, + "learning_rate": 1.4565624779927568e-07, + "loss": 0.6681, + "step": 772 + }, + { + "epoch": 1.8547418967587035, + "grad_norm": 1.677902937262838, + "learning_rate": 1.4094215838229176e-07, + "loss": 0.6913, + "step": 773 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 1.5737946583536535, + "learning_rate": 1.3630452308315557e-07, + "loss": 0.6543, + "step": 774 + }, + { + "epoch": 1.859543817527011, + "grad_norm": 1.1481025211558338, + "learning_rate": 1.3174341487204478e-07, + "loss": 0.6267, + "step": 775 + }, + { + "epoch": 1.8619447779111644, + "grad_norm": 0.8965017224592727, + "learning_rate": 1.2725890551503472e-07, + "loss": 0.6994, + "step": 776 + }, + { + "epoch": 1.8643457382953181, + "grad_norm": 0.867766283089291, + "learning_rate": 1.2285106557296479e-07, + "loss": 0.6322, + "step": 777 + }, + { + "epoch": 1.8667466986794716, + "grad_norm": 0.9421735003405503, + "learning_rate": 1.185199644003332e-07, + "loss": 0.6507, + "step": 778 + }, + { + "epoch": 1.8691476590636253, + "grad_norm": 1.0906617292381764, + "learning_rate": 1.1426567014420297e-07, + "loss": 0.672, + "step": 779 + }, + { + "epoch": 1.871548619447779, + "grad_norm": 1.247937633242823, + "learning_rate": 1.1008824974312827e-07, + "loss": 0.697, + "step": 780 + }, + { + "epoch": 1.8739495798319328, + "grad_norm": 1.1670307684913883, + "learning_rate": 1.0598776892610685e-07, + "loss": 0.6856, + "step": 781 + }, + { + "epoch": 1.8763505402160865, + "grad_norm": 1.3271471671208666, + "learning_rate": 1.0196429221153825e-07, + "loss": 0.6622, + "step": 782 + }, + { + "epoch": 1.8787515006002402, + "grad_norm": 0.886398633773095, + "learning_rate": 9.801788290621505e-08, + "loss": 0.6898, + "step": 783 + }, + { + "epoch": 1.8811524609843937, + "grad_norm": 0.877392618889072, + "learning_rate": 9.414860310432317e-08, + "loss": 0.6508, + "step": 784 + }, + { + "epoch": 1.8835534213685474, + "grad_norm": 1.1906859503017138, + "learning_rate": 9.035651368646647e-08, + "loss": 0.6489, + "step": 785 + }, + { + "epoch": 1.885954381752701, + "grad_norm": 1.9798044799116112, + "learning_rate": 8.664167431870762e-08, + "loss": 0.6719, + "step": 786 + }, + { + "epoch": 1.8883553421368546, + "grad_norm": 1.1006110013842414, + "learning_rate": 8.300414345163043e-08, + "loss": 0.642, + "step": 787 + }, + { + "epoch": 1.8907563025210083, + "grad_norm": 0.7974940942674409, + "learning_rate": 7.944397831941952e-08, + "loss": 0.7165, + "step": 788 + }, + { + "epoch": 1.893157262905162, + "grad_norm": 1.1280785375939588, + "learning_rate": 7.59612349389599e-08, + "loss": 0.6431, + "step": 789 + }, + { + "epoch": 1.8955582232893158, + "grad_norm": 1.1144184023534176, + "learning_rate": 7.255596810895548e-08, + "loss": 0.7046, + "step": 790 + }, + { + "epoch": 1.8979591836734695, + "grad_norm": 1.138147657830953, + "learning_rate": 6.922823140906754e-08, + "loss": 0.6698, + "step": 791 + }, + { + "epoch": 1.9003601440576232, + "grad_norm": 0.9217680993318809, + "learning_rate": 6.597807719907034e-08, + "loss": 0.6496, + "step": 792 + }, + { + "epoch": 1.9027611044417767, + "grad_norm": 1.186007338823389, + "learning_rate": 6.280555661802857e-08, + "loss": 0.6649, + "step": 793 + }, + { + "epoch": 1.9051620648259304, + "grad_norm": 1.0174742125343583, + "learning_rate": 5.971071958349229e-08, + "loss": 0.6568, + "step": 794 + }, + { + "epoch": 1.907563025210084, + "grad_norm": 0.8438893439006977, + "learning_rate": 5.669361479071156e-08, + "loss": 0.6654, + "step": 795 + }, + { + "epoch": 1.9099639855942376, + "grad_norm": 0.7499962846971833, + "learning_rate": 5.375428971186925e-08, + "loss": 0.6276, + "step": 796 + }, + { + "epoch": 1.9123649459783914, + "grad_norm": 0.8717461451446198, + "learning_rate": 5.089279059533658e-08, + "loss": 0.7107, + "step": 797 + }, + { + "epoch": 1.914765906362545, + "grad_norm": 0.9116003950294865, + "learning_rate": 4.810916246494157e-08, + "loss": 0.6244, + "step": 798 + }, + { + "epoch": 1.9171668667466988, + "grad_norm": 0.8645440990234077, + "learning_rate": 4.5403449119265085e-08, + "loss": 0.683, + "step": 799 + }, + { + "epoch": 1.9195678271308525, + "grad_norm": 0.9080233023711938, + "learning_rate": 4.2775693130948094e-08, + "loss": 0.6298, + "step": 800 + }, + { + "epoch": 1.921968787515006, + "grad_norm": 0.8210413108402735, + "learning_rate": 4.02259358460233e-08, + "loss": 0.6709, + "step": 801 + }, + { + "epoch": 1.9243697478991597, + "grad_norm": 0.8285484827144454, + "learning_rate": 3.775421738326568e-08, + "loss": 0.6655, + "step": 802 + }, + { + "epoch": 1.9267707082833132, + "grad_norm": 0.9586228743335077, + "learning_rate": 3.536057663355852e-08, + "loss": 0.6507, + "step": 803 + }, + { + "epoch": 1.929171668667467, + "grad_norm": 1.04816127232883, + "learning_rate": 3.304505125928392e-08, + "loss": 0.6765, + "step": 804 + }, + { + "epoch": 1.9315726290516206, + "grad_norm": 0.8425194752265788, + "learning_rate": 3.080767769372939e-08, + "loss": 0.6534, + "step": 805 + }, + { + "epoch": 1.9339735894357744, + "grad_norm": 1.896719832014499, + "learning_rate": 2.8648491140513267e-08, + "loss": 0.6904, + "step": 806 + }, + { + "epoch": 1.936374549819928, + "grad_norm": 1.0062088432432506, + "learning_rate": 2.6567525573034102e-08, + "loss": 0.6954, + "step": 807 + }, + { + "epoch": 1.9387755102040818, + "grad_norm": 0.8975562417272989, + "learning_rate": 2.4564813733932157e-08, + "loss": 0.6849, + "step": 808 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.975794376499767, + "learning_rate": 2.264038713457706e-08, + "loss": 0.7237, + "step": 809 + }, + { + "epoch": 1.943577430972389, + "grad_norm": 1.0524644516102524, + "learning_rate": 2.079427605457096e-08, + "loss": 0.6941, + "step": 810 + }, + { + "epoch": 1.9459783913565425, + "grad_norm": 0.9611491587591614, + "learning_rate": 1.9026509541272276e-08, + "loss": 0.6858, + "step": 811 + }, + { + "epoch": 1.9483793517406962, + "grad_norm": 1.0000196792745963, + "learning_rate": 1.7337115409338246e-08, + "loss": 0.6309, + "step": 812 + }, + { + "epoch": 1.95078031212485, + "grad_norm": 1.0920655370481793, + "learning_rate": 1.5726120240288632e-08, + "loss": 0.694, + "step": 813 + }, + { + "epoch": 1.9531812725090036, + "grad_norm": 0.9748896970825864, + "learning_rate": 1.4193549382083837e-08, + "loss": 0.6893, + "step": 814 + }, + { + "epoch": 1.9555822328931574, + "grad_norm": 1.0771011273317628, + "learning_rate": 1.2739426948732426e-08, + "loss": 0.6638, + "step": 815 + }, + { + "epoch": 1.957983193277311, + "grad_norm": 0.8882200571103428, + "learning_rate": 1.1363775819904776e-08, + "loss": 0.6955, + "step": 816 + }, + { + "epoch": 1.9603841536614646, + "grad_norm": 1.090762803675368, + "learning_rate": 1.006661764057837e-08, + "loss": 0.6966, + "step": 817 + }, + { + "epoch": 1.9627851140456183, + "grad_norm": 1.0696035526291576, + "learning_rate": 8.847972820693052e-09, + "loss": 0.703, + "step": 818 + }, + { + "epoch": 1.9651860744297718, + "grad_norm": 1.1819203209196956, + "learning_rate": 7.707860534834632e-09, + "loss": 0.6827, + "step": 819 + }, + { + "epoch": 1.9675870348139255, + "grad_norm": 1.3716127907201827, + "learning_rate": 6.646298721928457e-09, + "loss": 0.6391, + "step": 820 + }, + { + "epoch": 1.9699879951980792, + "grad_norm": 0.8540501179543823, + "learning_rate": 5.6633040849601865e-09, + "loss": 0.6845, + "step": 821 + }, + { + "epoch": 1.972388955582233, + "grad_norm": 0.9224613846938693, + "learning_rate": 4.758892090711009e-09, + "loss": 0.6393, + "step": 822 + }, + { + "epoch": 1.9747899159663866, + "grad_norm": 0.9721797003258046, + "learning_rate": 3.9330769695167245e-09, + "loss": 0.6922, + "step": 823 + }, + { + "epoch": 1.9771908763505404, + "grad_norm": 0.9272860464428038, + "learning_rate": 3.1858717150412554e-09, + "loss": 0.68, + "step": 824 + }, + { + "epoch": 1.9795918367346939, + "grad_norm": 0.9109986729531275, + "learning_rate": 2.5172880840745873e-09, + "loss": 0.6996, + "step": 825 + }, + { + "epoch": 1.9819927971188476, + "grad_norm": 0.9142879218161498, + "learning_rate": 1.9273365963440315e-09, + "loss": 0.6439, + "step": 826 + }, + { + "epoch": 1.984393757503001, + "grad_norm": 0.9184253994662968, + "learning_rate": 1.4160265343549084e-09, + "loss": 0.6819, + "step": 827 + }, + { + "epoch": 1.9867947178871548, + "grad_norm": 0.929916491589975, + "learning_rate": 9.833659432367803e-10, + "loss": 0.6725, + "step": 828 + }, + { + "epoch": 1.9891956782713085, + "grad_norm": 1.4953561604837144, + "learning_rate": 6.293616306246586e-10, + "loss": 0.6642, + "step": 829 + }, + { + "epoch": 1.9915966386554622, + "grad_norm": 0.9372999860644707, + "learning_rate": 3.540191665457604e-10, + "loss": 0.7112, + "step": 830 + }, + { + "epoch": 1.993997599039616, + "grad_norm": 0.9716377415350187, + "learning_rate": 1.5734288333457692e-10, + "loss": 0.6803, + "step": 831 + }, + { + "epoch": 1.9963985594237696, + "grad_norm": 0.8051855934219595, + "learning_rate": 3.9335875564594505e-11, + "loss": 0.6748, + "step": 832 + } + ], + "logging_steps": 1, + "max_steps": 832, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 208, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.326067610943488e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}