diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10486 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998743455497382, + "eval_steps": 500, + "global_step": 1492, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006701570680628272, + "grad_norm": 4.200109958648682, + "learning_rate": 3.3333333333333335e-07, + "loss": 8.7721, + "step": 1 + }, + { + "epoch": 0.0013403141361256544, + "grad_norm": 4.861227035522461, + "learning_rate": 6.666666666666667e-07, + "loss": 9.0224, + "step": 2 + }, + { + "epoch": 0.002010471204188482, + "grad_norm": 6.782981872558594, + "learning_rate": 1.0000000000000002e-06, + "loss": 9.2282, + "step": 3 + }, + { + "epoch": 0.0026806282722513087, + "grad_norm": 7.665585994720459, + "learning_rate": 1.3333333333333334e-06, + "loss": 8.9244, + "step": 4 + }, + { + "epoch": 0.003350785340314136, + "grad_norm": 5.701632499694824, + "learning_rate": 1.6666666666666667e-06, + "loss": 8.7144, + "step": 5 + }, + { + "epoch": 0.004020942408376964, + "grad_norm": 4.979744911193848, + "learning_rate": 2.0000000000000003e-06, + "loss": 8.8509, + "step": 6 + }, + { + "epoch": 0.00469109947643979, + "grad_norm": 5.203133583068848, + "learning_rate": 2.3333333333333336e-06, + "loss": 8.0535, + "step": 7 + }, + { + "epoch": 0.0053612565445026175, + "grad_norm": 4.624353408813477, + "learning_rate": 2.666666666666667e-06, + "loss": 8.5312, + "step": 8 + }, + { + "epoch": 0.006031413612565445, + "grad_norm": 3.5958287715911865, + "learning_rate": 3e-06, + "loss": 8.3258, + "step": 9 + }, + { + "epoch": 0.006701570680628272, + "grad_norm": 3.4215784072875977, + "learning_rate": 3.3333333333333333e-06, + "loss": 8.416, + "step": 10 + }, + { + "epoch": 0.007371727748691099, + "grad_norm": 3.484177589416504, + "learning_rate": 3.6666666666666666e-06, + "loss": 8.2023, + "step": 11 + }, + { + "epoch": 0.008041884816753928, + "grad_norm": 3.484971046447754, + "learning_rate": 4.000000000000001e-06, + "loss": 7.813, + "step": 12 + }, + { + "epoch": 0.008712041884816753, + "grad_norm": 2.726947069168091, + "learning_rate": 4.333333333333334e-06, + "loss": 8.134, + "step": 13 + }, + { + "epoch": 0.00938219895287958, + "grad_norm": 3.3426618576049805, + "learning_rate": 4.666666666666667e-06, + "loss": 7.7547, + "step": 14 + }, + { + "epoch": 0.010052356020942408, + "grad_norm": 3.043964385986328, + "learning_rate": 5e-06, + "loss": 7.7399, + "step": 15 + }, + { + "epoch": 0.010722513089005235, + "grad_norm": 3.101304531097412, + "learning_rate": 4.999994344791726e-06, + "loss": 7.1639, + "step": 16 + }, + { + "epoch": 0.011392670157068062, + "grad_norm": 3.683811902999878, + "learning_rate": 4.999977379192491e-06, + "loss": 7.595, + "step": 17 + }, + { + "epoch": 0.01206282722513089, + "grad_norm": 3.79826021194458, + "learning_rate": 4.9999491032790475e-06, + "loss": 7.5361, + "step": 18 + }, + { + "epoch": 0.012732984293193717, + "grad_norm": 3.463408946990967, + "learning_rate": 4.999909517179322e-06, + "loss": 7.1776, + "step": 19 + }, + { + "epoch": 0.013403141361256544, + "grad_norm": 3.486097574234009, + "learning_rate": 4.9998586210724085e-06, + "loss": 7.0846, + "step": 20 + }, + { + "epoch": 0.014073298429319371, + "grad_norm": 3.8364169597625732, + "learning_rate": 4.99979641518857e-06, + "loss": 7.3558, + "step": 21 + }, + { + "epoch": 0.014743455497382199, + "grad_norm": 2.9487595558166504, + "learning_rate": 4.999722899809235e-06, + "loss": 6.9944, + "step": 22 + }, + { + "epoch": 0.015413612565445026, + "grad_norm": 2.877228260040283, + "learning_rate": 4.9996380752670006e-06, + "loss": 6.8514, + "step": 23 + }, + { + "epoch": 0.016083769633507855, + "grad_norm": 2.7856905460357666, + "learning_rate": 4.999541941945626e-06, + "loss": 6.9527, + "step": 24 + }, + { + "epoch": 0.016753926701570682, + "grad_norm": 2.6983463764190674, + "learning_rate": 4.999434500280035e-06, + "loss": 6.8831, + "step": 25 + }, + { + "epoch": 0.017424083769633506, + "grad_norm": 3.0367815494537354, + "learning_rate": 4.999315750756312e-06, + "loss": 6.6457, + "step": 26 + }, + { + "epoch": 0.018094240837696333, + "grad_norm": 2.566683292388916, + "learning_rate": 4.999185693911699e-06, + "loss": 6.8638, + "step": 27 + }, + { + "epoch": 0.01876439790575916, + "grad_norm": 3.147542953491211, + "learning_rate": 4.9990443303345945e-06, + "loss": 6.5231, + "step": 28 + }, + { + "epoch": 0.019434554973821988, + "grad_norm": 2.622145175933838, + "learning_rate": 4.998891660664552e-06, + "loss": 6.8036, + "step": 29 + }, + { + "epoch": 0.020104712041884815, + "grad_norm": 2.8038251399993896, + "learning_rate": 4.998727685592273e-06, + "loss": 6.6147, + "step": 30 + }, + { + "epoch": 0.020774869109947643, + "grad_norm": 3.022918939590454, + "learning_rate": 4.9985524058596105e-06, + "loss": 6.6855, + "step": 31 + }, + { + "epoch": 0.02144502617801047, + "grad_norm": 2.2576181888580322, + "learning_rate": 4.998365822259556e-06, + "loss": 6.5477, + "step": 32 + }, + { + "epoch": 0.022115183246073297, + "grad_norm": 2.9812848567962646, + "learning_rate": 4.998167935636247e-06, + "loss": 6.4677, + "step": 33 + }, + { + "epoch": 0.022785340314136125, + "grad_norm": 2.937847375869751, + "learning_rate": 4.997958746884956e-06, + "loss": 6.4969, + "step": 34 + }, + { + "epoch": 0.023455497382198952, + "grad_norm": 2.3638932704925537, + "learning_rate": 4.997738256952086e-06, + "loss": 6.6306, + "step": 35 + }, + { + "epoch": 0.02412565445026178, + "grad_norm": 2.4940426349639893, + "learning_rate": 4.997506466835171e-06, + "loss": 6.6289, + "step": 36 + }, + { + "epoch": 0.024795811518324606, + "grad_norm": 2.751082420349121, + "learning_rate": 4.997263377582868e-06, + "loss": 6.4163, + "step": 37 + }, + { + "epoch": 0.025465968586387434, + "grad_norm": 2.156564474105835, + "learning_rate": 4.997008990294953e-06, + "loss": 6.5205, + "step": 38 + }, + { + "epoch": 0.02613612565445026, + "grad_norm": 2.9955501556396484, + "learning_rate": 4.996743306122317e-06, + "loss": 6.4886, + "step": 39 + }, + { + "epoch": 0.02680628272251309, + "grad_norm": 2.5269675254821777, + "learning_rate": 4.99646632626696e-06, + "loss": 6.5692, + "step": 40 + }, + { + "epoch": 0.027476439790575916, + "grad_norm": 2.1620020866394043, + "learning_rate": 4.996178051981984e-06, + "loss": 6.5636, + "step": 41 + }, + { + "epoch": 0.028146596858638743, + "grad_norm": 2.7254207134246826, + "learning_rate": 4.99587848457159e-06, + "loss": 6.1583, + "step": 42 + }, + { + "epoch": 0.02881675392670157, + "grad_norm": 2.702894926071167, + "learning_rate": 4.9955676253910716e-06, + "loss": 6.4469, + "step": 43 + }, + { + "epoch": 0.029486910994764397, + "grad_norm": 2.119009494781494, + "learning_rate": 4.995245475846806e-06, + "loss": 6.3458, + "step": 44 + }, + { + "epoch": 0.030157068062827225, + "grad_norm": 2.6456499099731445, + "learning_rate": 4.994912037396254e-06, + "loss": 6.8394, + "step": 45 + }, + { + "epoch": 0.030827225130890052, + "grad_norm": 2.4168996810913086, + "learning_rate": 4.994567311547944e-06, + "loss": 6.5947, + "step": 46 + }, + { + "epoch": 0.03149738219895288, + "grad_norm": 2.0806760787963867, + "learning_rate": 4.994211299861475e-06, + "loss": 6.1653, + "step": 47 + }, + { + "epoch": 0.03216753926701571, + "grad_norm": 2.130005121231079, + "learning_rate": 4.993844003947501e-06, + "loss": 6.4416, + "step": 48 + }, + { + "epoch": 0.032837696335078534, + "grad_norm": 2.2185635566711426, + "learning_rate": 4.993465425467732e-06, + "loss": 6.3064, + "step": 49 + }, + { + "epoch": 0.033507853403141365, + "grad_norm": 2.5894346237182617, + "learning_rate": 4.9930755661349215e-06, + "loss": 6.5322, + "step": 50 + }, + { + "epoch": 0.03417801047120419, + "grad_norm": 1.9817583560943604, + "learning_rate": 4.992674427712855e-06, + "loss": 6.0912, + "step": 51 + }, + { + "epoch": 0.03484816753926701, + "grad_norm": 2.4749717712402344, + "learning_rate": 4.99226201201635e-06, + "loss": 6.2575, + "step": 52 + }, + { + "epoch": 0.03551832460732984, + "grad_norm": 3.163346767425537, + "learning_rate": 4.991838320911245e-06, + "loss": 6.3432, + "step": 53 + }, + { + "epoch": 0.03618848167539267, + "grad_norm": 2.872176170349121, + "learning_rate": 4.991403356314388e-06, + "loss": 6.181, + "step": 54 + }, + { + "epoch": 0.0368586387434555, + "grad_norm": 2.5608084201812744, + "learning_rate": 4.990957120193632e-06, + "loss": 6.2428, + "step": 55 + }, + { + "epoch": 0.03752879581151832, + "grad_norm": 2.9240713119506836, + "learning_rate": 4.990499614567825e-06, + "loss": 6.1696, + "step": 56 + }, + { + "epoch": 0.03819895287958115, + "grad_norm": 2.4163713455200195, + "learning_rate": 4.990030841506797e-06, + "loss": 6.1887, + "step": 57 + }, + { + "epoch": 0.038869109947643976, + "grad_norm": 2.1572868824005127, + "learning_rate": 4.9895508031313545e-06, + "loss": 6.4599, + "step": 58 + }, + { + "epoch": 0.03953926701570681, + "grad_norm": 2.173875570297241, + "learning_rate": 4.989059501613273e-06, + "loss": 5.9834, + "step": 59 + }, + { + "epoch": 0.04020942408376963, + "grad_norm": 2.419245719909668, + "learning_rate": 4.9885569391752805e-06, + "loss": 6.1352, + "step": 60 + }, + { + "epoch": 0.04087958115183246, + "grad_norm": 2.2928881645202637, + "learning_rate": 4.988043118091056e-06, + "loss": 5.7896, + "step": 61 + }, + { + "epoch": 0.041549738219895285, + "grad_norm": 1.8945914506912231, + "learning_rate": 4.9875180406852085e-06, + "loss": 6.0007, + "step": 62 + }, + { + "epoch": 0.042219895287958116, + "grad_norm": 2.0440707206726074, + "learning_rate": 4.986981709333278e-06, + "loss": 5.9541, + "step": 63 + }, + { + "epoch": 0.04289005235602094, + "grad_norm": 2.1350975036621094, + "learning_rate": 4.9864341264617165e-06, + "loss": 6.2672, + "step": 64 + }, + { + "epoch": 0.04356020942408377, + "grad_norm": 2.444735527038574, + "learning_rate": 4.98587529454788e-06, + "loss": 6.1391, + "step": 65 + }, + { + "epoch": 0.044230366492146594, + "grad_norm": 2.0678515434265137, + "learning_rate": 4.985305216120017e-06, + "loss": 6.136, + "step": 66 + }, + { + "epoch": 0.044900523560209425, + "grad_norm": 2.892256021499634, + "learning_rate": 4.984723893757256e-06, + "loss": 6.0538, + "step": 67 + }, + { + "epoch": 0.04557068062827225, + "grad_norm": 2.0744709968566895, + "learning_rate": 4.9841313300895985e-06, + "loss": 6.425, + "step": 68 + }, + { + "epoch": 0.04624083769633508, + "grad_norm": 2.185494899749756, + "learning_rate": 4.9835275277979e-06, + "loss": 6.1453, + "step": 69 + }, + { + "epoch": 0.046910994764397904, + "grad_norm": 2.4761366844177246, + "learning_rate": 4.9829124896138635e-06, + "loss": 6.1714, + "step": 70 + }, + { + "epoch": 0.047581151832460734, + "grad_norm": 2.2010130882263184, + "learning_rate": 4.982286218320023e-06, + "loss": 6.3217, + "step": 71 + }, + { + "epoch": 0.04825130890052356, + "grad_norm": 2.60612154006958, + "learning_rate": 4.981648716749735e-06, + "loss": 6.2632, + "step": 72 + }, + { + "epoch": 0.04892146596858639, + "grad_norm": 2.6755013465881348, + "learning_rate": 4.980999987787163e-06, + "loss": 5.8599, + "step": 73 + }, + { + "epoch": 0.04959162303664921, + "grad_norm": 2.565596103668213, + "learning_rate": 4.980340034367264e-06, + "loss": 5.9165, + "step": 74 + }, + { + "epoch": 0.050261780104712044, + "grad_norm": 1.9897733926773071, + "learning_rate": 4.979668859475779e-06, + "loss": 6.2114, + "step": 75 + }, + { + "epoch": 0.05093193717277487, + "grad_norm": 2.3420677185058594, + "learning_rate": 4.978986466149212e-06, + "loss": 6.0591, + "step": 76 + }, + { + "epoch": 0.0516020942408377, + "grad_norm": 2.258755683898926, + "learning_rate": 4.978292857474828e-06, + "loss": 6.1753, + "step": 77 + }, + { + "epoch": 0.05227225130890052, + "grad_norm": 2.6298441886901855, + "learning_rate": 4.977588036590625e-06, + "loss": 6.3313, + "step": 78 + }, + { + "epoch": 0.05294240837696335, + "grad_norm": 2.3326869010925293, + "learning_rate": 4.976872006685332e-06, + "loss": 5.9557, + "step": 79 + }, + { + "epoch": 0.05361256544502618, + "grad_norm": 2.4547061920166016, + "learning_rate": 4.976144770998386e-06, + "loss": 5.9302, + "step": 80 + }, + { + "epoch": 0.05428272251308901, + "grad_norm": 2.0991933345794678, + "learning_rate": 4.975406332819923e-06, + "loss": 5.817, + "step": 81 + }, + { + "epoch": 0.05495287958115183, + "grad_norm": 3.2543742656707764, + "learning_rate": 4.974656695490762e-06, + "loss": 6.128, + "step": 82 + }, + { + "epoch": 0.05562303664921466, + "grad_norm": 2.2632484436035156, + "learning_rate": 4.973895862402385e-06, + "loss": 6.0342, + "step": 83 + }, + { + "epoch": 0.056293193717277486, + "grad_norm": 2.243244171142578, + "learning_rate": 4.973123836996929e-06, + "loss": 6.0889, + "step": 84 + }, + { + "epoch": 0.056963350785340316, + "grad_norm": 2.0775105953216553, + "learning_rate": 4.9723406227671645e-06, + "loss": 5.9834, + "step": 85 + }, + { + "epoch": 0.05763350785340314, + "grad_norm": 2.1865806579589844, + "learning_rate": 4.971546223256484e-06, + "loss": 5.8297, + "step": 86 + }, + { + "epoch": 0.05830366492146597, + "grad_norm": 2.533968925476074, + "learning_rate": 4.9707406420588836e-06, + "loss": 6.1779, + "step": 87 + }, + { + "epoch": 0.058973821989528795, + "grad_norm": 2.2566957473754883, + "learning_rate": 4.969923882818946e-06, + "loss": 5.8309, + "step": 88 + }, + { + "epoch": 0.059643979057591626, + "grad_norm": 3.06984806060791, + "learning_rate": 4.969095949231826e-06, + "loss": 5.7423, + "step": 89 + }, + { + "epoch": 0.06031413612565445, + "grad_norm": 2.543743848800659, + "learning_rate": 4.968256845043233e-06, + "loss": 6.4709, + "step": 90 + }, + { + "epoch": 0.06098429319371728, + "grad_norm": 2.6973841190338135, + "learning_rate": 4.967406574049416e-06, + "loss": 6.2436, + "step": 91 + }, + { + "epoch": 0.061654450261780104, + "grad_norm": 2.71950101852417, + "learning_rate": 4.96654514009714e-06, + "loss": 5.8146, + "step": 92 + }, + { + "epoch": 0.062324607329842935, + "grad_norm": 2.4400877952575684, + "learning_rate": 4.965672547083678e-06, + "loss": 5.9425, + "step": 93 + }, + { + "epoch": 0.06299476439790576, + "grad_norm": 2.360313892364502, + "learning_rate": 4.964788798956784e-06, + "loss": 5.9114, + "step": 94 + }, + { + "epoch": 0.06366492146596858, + "grad_norm": 2.3321006298065186, + "learning_rate": 4.963893899714683e-06, + "loss": 5.9243, + "step": 95 + }, + { + "epoch": 0.06433507853403142, + "grad_norm": 2.638052463531494, + "learning_rate": 4.9629878534060494e-06, + "loss": 6.1771, + "step": 96 + }, + { + "epoch": 0.06500523560209424, + "grad_norm": 2.1472249031066895, + "learning_rate": 4.962070664129986e-06, + "loss": 5.955, + "step": 97 + }, + { + "epoch": 0.06567539267015707, + "grad_norm": 2.5043721199035645, + "learning_rate": 4.961142336036011e-06, + "loss": 6.346, + "step": 98 + }, + { + "epoch": 0.06634554973821989, + "grad_norm": 2.469748020172119, + "learning_rate": 4.960202873324033e-06, + "loss": 5.9722, + "step": 99 + }, + { + "epoch": 0.06701570680628273, + "grad_norm": 2.429661512374878, + "learning_rate": 4.959252280244342e-06, + "loss": 5.89, + "step": 100 + }, + { + "epoch": 0.06768586387434555, + "grad_norm": 2.4955296516418457, + "learning_rate": 4.958290561097575e-06, + "loss": 5.8828, + "step": 101 + }, + { + "epoch": 0.06835602094240838, + "grad_norm": 1.904392123222351, + "learning_rate": 4.9573177202347135e-06, + "loss": 6.0817, + "step": 102 + }, + { + "epoch": 0.0690261780104712, + "grad_norm": 2.5589585304260254, + "learning_rate": 4.956333762057048e-06, + "loss": 5.8343, + "step": 103 + }, + { + "epoch": 0.06969633507853402, + "grad_norm": 2.1008498668670654, + "learning_rate": 4.955338691016173e-06, + "loss": 5.7119, + "step": 104 + }, + { + "epoch": 0.07036649214659686, + "grad_norm": 2.3903372287750244, + "learning_rate": 4.954332511613953e-06, + "loss": 6.1662, + "step": 105 + }, + { + "epoch": 0.07103664921465969, + "grad_norm": 2.2429683208465576, + "learning_rate": 4.953315228402512e-06, + "loss": 5.9493, + "step": 106 + }, + { + "epoch": 0.07170680628272251, + "grad_norm": 2.764125347137451, + "learning_rate": 4.952286845984209e-06, + "loss": 5.8358, + "step": 107 + }, + { + "epoch": 0.07237696335078533, + "grad_norm": 2.5214192867279053, + "learning_rate": 4.9512473690116165e-06, + "loss": 5.7791, + "step": 108 + }, + { + "epoch": 0.07304712041884817, + "grad_norm": 2.370891571044922, + "learning_rate": 4.950196802187503e-06, + "loss": 5.7436, + "step": 109 + }, + { + "epoch": 0.073717277486911, + "grad_norm": 2.2167561054229736, + "learning_rate": 4.9491351502648075e-06, + "loss": 5.9407, + "step": 110 + }, + { + "epoch": 0.07438743455497382, + "grad_norm": 2.805875301361084, + "learning_rate": 4.94806241804662e-06, + "loss": 6.0697, + "step": 111 + }, + { + "epoch": 0.07505759162303664, + "grad_norm": 1.881156325340271, + "learning_rate": 4.946978610386159e-06, + "loss": 5.7838, + "step": 112 + }, + { + "epoch": 0.07572774869109948, + "grad_norm": 2.642446517944336, + "learning_rate": 4.9458837321867515e-06, + "loss": 6.1066, + "step": 113 + }, + { + "epoch": 0.0763979057591623, + "grad_norm": 2.5777878761291504, + "learning_rate": 4.944777788401809e-06, + "loss": 5.6759, + "step": 114 + }, + { + "epoch": 0.07706806282722513, + "grad_norm": 2.3588509559631348, + "learning_rate": 4.943660784034806e-06, + "loss": 6.0307, + "step": 115 + }, + { + "epoch": 0.07773821989528795, + "grad_norm": 2.7860770225524902, + "learning_rate": 4.942532724139255e-06, + "loss": 5.6816, + "step": 116 + }, + { + "epoch": 0.07840837696335079, + "grad_norm": 2.4214789867401123, + "learning_rate": 4.941393613818688e-06, + "loss": 5.9614, + "step": 117 + }, + { + "epoch": 0.07907853403141361, + "grad_norm": 1.9265861511230469, + "learning_rate": 4.94024345822663e-06, + "loss": 5.4188, + "step": 118 + }, + { + "epoch": 0.07974869109947644, + "grad_norm": 2.178675413131714, + "learning_rate": 4.939082262566575e-06, + "loss": 5.6996, + "step": 119 + }, + { + "epoch": 0.08041884816753926, + "grad_norm": 2.9240055084228516, + "learning_rate": 4.937910032091968e-06, + "loss": 6.0002, + "step": 120 + }, + { + "epoch": 0.0810890052356021, + "grad_norm": 3.041872262954712, + "learning_rate": 4.936726772106174e-06, + "loss": 6.1273, + "step": 121 + }, + { + "epoch": 0.08175916230366492, + "grad_norm": 2.833592414855957, + "learning_rate": 4.935532487962457e-06, + "loss": 5.9378, + "step": 122 + }, + { + "epoch": 0.08242931937172775, + "grad_norm": 2.215791702270508, + "learning_rate": 4.934327185063959e-06, + "loss": 5.8925, + "step": 123 + }, + { + "epoch": 0.08309947643979057, + "grad_norm": 2.210946559906006, + "learning_rate": 4.933110868863671e-06, + "loss": 6.0775, + "step": 124 + }, + { + "epoch": 0.08376963350785341, + "grad_norm": 1.9565507173538208, + "learning_rate": 4.93188354486441e-06, + "loss": 6.0097, + "step": 125 + }, + { + "epoch": 0.08443979057591623, + "grad_norm": 2.499535083770752, + "learning_rate": 4.9306452186187925e-06, + "loss": 5.7272, + "step": 126 + }, + { + "epoch": 0.08510994764397906, + "grad_norm": 2.260690927505493, + "learning_rate": 4.9293958957292155e-06, + "loss": 5.8022, + "step": 127 + }, + { + "epoch": 0.08578010471204188, + "grad_norm": 2.7281482219696045, + "learning_rate": 4.928135581847823e-06, + "loss": 5.6282, + "step": 128 + }, + { + "epoch": 0.08645026178010472, + "grad_norm": 2.7198517322540283, + "learning_rate": 4.926864282676485e-06, + "loss": 5.8988, + "step": 129 + }, + { + "epoch": 0.08712041884816754, + "grad_norm": 2.2805449962615967, + "learning_rate": 4.9255820039667695e-06, + "loss": 5.8828, + "step": 130 + }, + { + "epoch": 0.08779057591623037, + "grad_norm": 2.526623010635376, + "learning_rate": 4.9242887515199215e-06, + "loss": 6.1022, + "step": 131 + }, + { + "epoch": 0.08846073298429319, + "grad_norm": 2.3147377967834473, + "learning_rate": 4.9229845311868275e-06, + "loss": 5.8275, + "step": 132 + }, + { + "epoch": 0.08913089005235603, + "grad_norm": 2.0316696166992188, + "learning_rate": 4.921669348868e-06, + "loss": 5.7102, + "step": 133 + }, + { + "epoch": 0.08980104712041885, + "grad_norm": 2.1787824630737305, + "learning_rate": 4.9203432105135425e-06, + "loss": 5.8846, + "step": 134 + }, + { + "epoch": 0.09047120418848167, + "grad_norm": 2.2715516090393066, + "learning_rate": 4.919006122123125e-06, + "loss": 6.0401, + "step": 135 + }, + { + "epoch": 0.0911413612565445, + "grad_norm": 2.4892923831939697, + "learning_rate": 4.917658089745959e-06, + "loss": 6.322, + "step": 136 + }, + { + "epoch": 0.09181151832460734, + "grad_norm": 2.3479976654052734, + "learning_rate": 4.916299119480767e-06, + "loss": 5.578, + "step": 137 + }, + { + "epoch": 0.09248167539267016, + "grad_norm": 1.8516892194747925, + "learning_rate": 4.914929217475758e-06, + "loss": 5.6311, + "step": 138 + }, + { + "epoch": 0.09315183246073298, + "grad_norm": 2.260326623916626, + "learning_rate": 4.913548389928595e-06, + "loss": 5.7489, + "step": 139 + }, + { + "epoch": 0.09382198952879581, + "grad_norm": 2.2286529541015625, + "learning_rate": 4.912156643086374e-06, + "loss": 5.7533, + "step": 140 + }, + { + "epoch": 0.09449214659685864, + "grad_norm": 2.654252767562866, + "learning_rate": 4.910753983245589e-06, + "loss": 5.8891, + "step": 141 + }, + { + "epoch": 0.09516230366492147, + "grad_norm": 2.1620259284973145, + "learning_rate": 4.909340416752105e-06, + "loss": 5.9395, + "step": 142 + }, + { + "epoch": 0.09583246073298429, + "grad_norm": 2.2099599838256836, + "learning_rate": 4.907915950001135e-06, + "loss": 5.8191, + "step": 143 + }, + { + "epoch": 0.09650261780104712, + "grad_norm": 2.1290504932403564, + "learning_rate": 4.906480589437202e-06, + "loss": 5.6433, + "step": 144 + }, + { + "epoch": 0.09717277486910995, + "grad_norm": 2.3433656692504883, + "learning_rate": 4.905034341554117e-06, + "loss": 5.6745, + "step": 145 + }, + { + "epoch": 0.09784293193717278, + "grad_norm": 2.5840182304382324, + "learning_rate": 4.903577212894947e-06, + "loss": 5.9487, + "step": 146 + }, + { + "epoch": 0.0985130890052356, + "grad_norm": 2.1299641132354736, + "learning_rate": 4.902109210051984e-06, + "loss": 5.7433, + "step": 147 + }, + { + "epoch": 0.09918324607329843, + "grad_norm": 2.7531559467315674, + "learning_rate": 4.900630339666718e-06, + "loss": 5.9031, + "step": 148 + }, + { + "epoch": 0.09985340314136126, + "grad_norm": 2.4779627323150635, + "learning_rate": 4.899140608429804e-06, + "loss": 5.7958, + "step": 149 + }, + { + "epoch": 0.10052356020942409, + "grad_norm": 2.1861352920532227, + "learning_rate": 4.897640023081036e-06, + "loss": 5.7467, + "step": 150 + }, + { + "epoch": 0.10119371727748691, + "grad_norm": 2.305194854736328, + "learning_rate": 4.896128590409311e-06, + "loss": 5.8453, + "step": 151 + }, + { + "epoch": 0.10186387434554973, + "grad_norm": 2.6004295349121094, + "learning_rate": 4.894606317252603e-06, + "loss": 5.5061, + "step": 152 + }, + { + "epoch": 0.10253403141361256, + "grad_norm": 2.268364191055298, + "learning_rate": 4.893073210497928e-06, + "loss": 5.8876, + "step": 153 + }, + { + "epoch": 0.1032041884816754, + "grad_norm": 2.2736222743988037, + "learning_rate": 4.8915292770813184e-06, + "loss": 5.9355, + "step": 154 + }, + { + "epoch": 0.10387434554973822, + "grad_norm": 2.465298652648926, + "learning_rate": 4.889974523987784e-06, + "loss": 6.0257, + "step": 155 + }, + { + "epoch": 0.10454450261780104, + "grad_norm": 2.676025629043579, + "learning_rate": 4.888408958251289e-06, + "loss": 5.5879, + "step": 156 + }, + { + "epoch": 0.10521465968586387, + "grad_norm": 2.2422494888305664, + "learning_rate": 4.886832586954712e-06, + "loss": 5.8153, + "step": 157 + }, + { + "epoch": 0.1058848167539267, + "grad_norm": 2.0813255310058594, + "learning_rate": 4.8852454172298205e-06, + "loss": 5.4443, + "step": 158 + }, + { + "epoch": 0.10655497382198953, + "grad_norm": 2.5441648960113525, + "learning_rate": 4.883647456257234e-06, + "loss": 5.6157, + "step": 159 + }, + { + "epoch": 0.10722513089005235, + "grad_norm": 2.377225637435913, + "learning_rate": 4.882038711266395e-06, + "loss": 5.6878, + "step": 160 + }, + { + "epoch": 0.10789528795811518, + "grad_norm": 2.6996095180511475, + "learning_rate": 4.880419189535532e-06, + "loss": 5.6048, + "step": 161 + }, + { + "epoch": 0.10856544502617801, + "grad_norm": 2.653292655944824, + "learning_rate": 4.878788898391634e-06, + "loss": 5.6416, + "step": 162 + }, + { + "epoch": 0.10923560209424084, + "grad_norm": 2.5808253288269043, + "learning_rate": 4.877147845210407e-06, + "loss": 5.6885, + "step": 163 + }, + { + "epoch": 0.10990575916230366, + "grad_norm": 2.5603091716766357, + "learning_rate": 4.8754960374162516e-06, + "loss": 5.6075, + "step": 164 + }, + { + "epoch": 0.11057591623036649, + "grad_norm": 2.2412588596343994, + "learning_rate": 4.87383348248222e-06, + "loss": 5.1883, + "step": 165 + }, + { + "epoch": 0.11124607329842932, + "grad_norm": 2.5953209400177, + "learning_rate": 4.872160187929987e-06, + "loss": 5.4755, + "step": 166 + }, + { + "epoch": 0.11191623036649215, + "grad_norm": 2.453779697418213, + "learning_rate": 4.870476161329818e-06, + "loss": 5.9441, + "step": 167 + }, + { + "epoch": 0.11258638743455497, + "grad_norm": 2.0972816944122314, + "learning_rate": 4.868781410300529e-06, + "loss": 5.7357, + "step": 168 + }, + { + "epoch": 0.1132565445026178, + "grad_norm": 2.21097469329834, + "learning_rate": 4.867075942509455e-06, + "loss": 5.4357, + "step": 169 + }, + { + "epoch": 0.11392670157068063, + "grad_norm": 2.468629837036133, + "learning_rate": 4.8653597656724185e-06, + "loss": 5.8272, + "step": 170 + }, + { + "epoch": 0.11459685863874346, + "grad_norm": 2.063910961151123, + "learning_rate": 4.8636328875536875e-06, + "loss": 5.4154, + "step": 171 + }, + { + "epoch": 0.11526701570680628, + "grad_norm": 2.602414846420288, + "learning_rate": 4.861895315965946e-06, + "loss": 5.9394, + "step": 172 + }, + { + "epoch": 0.1159371727748691, + "grad_norm": 5.8961005210876465, + "learning_rate": 4.86014705877026e-06, + "loss": 5.4304, + "step": 173 + }, + { + "epoch": 0.11660732984293194, + "grad_norm": 2.347381353378296, + "learning_rate": 4.858388123876035e-06, + "loss": 5.4542, + "step": 174 + }, + { + "epoch": 0.11727748691099477, + "grad_norm": 2.1685433387756348, + "learning_rate": 4.856618519240985e-06, + "loss": 5.7699, + "step": 175 + }, + { + "epoch": 0.11794764397905759, + "grad_norm": 2.9305124282836914, + "learning_rate": 4.854838252871097e-06, + "loss": 5.4849, + "step": 176 + }, + { + "epoch": 0.11861780104712041, + "grad_norm": 2.922797441482544, + "learning_rate": 4.853047332820592e-06, + "loss": 5.521, + "step": 177 + }, + { + "epoch": 0.11928795811518325, + "grad_norm": 2.1050190925598145, + "learning_rate": 4.85124576719189e-06, + "loss": 5.6155, + "step": 178 + }, + { + "epoch": 0.11995811518324608, + "grad_norm": 2.704467535018921, + "learning_rate": 4.849433564135576e-06, + "loss": 5.5141, + "step": 179 + }, + { + "epoch": 0.1206282722513089, + "grad_norm": 2.1763556003570557, + "learning_rate": 4.847610731850358e-06, + "loss": 5.7, + "step": 180 + }, + { + "epoch": 0.12129842931937172, + "grad_norm": 2.1507253646850586, + "learning_rate": 4.845777278583032e-06, + "loss": 5.9349, + "step": 181 + }, + { + "epoch": 0.12196858638743456, + "grad_norm": 2.007770299911499, + "learning_rate": 4.843933212628447e-06, + "loss": 5.8838, + "step": 182 + }, + { + "epoch": 0.12263874345549738, + "grad_norm": 2.1414952278137207, + "learning_rate": 4.8420785423294645e-06, + "loss": 5.7521, + "step": 183 + }, + { + "epoch": 0.12330890052356021, + "grad_norm": 1.8680200576782227, + "learning_rate": 4.840213276076922e-06, + "loss": 5.7706, + "step": 184 + }, + { + "epoch": 0.12397905759162303, + "grad_norm": 2.4503700733184814, + "learning_rate": 4.8383374223095934e-06, + "loss": 5.8948, + "step": 185 + }, + { + "epoch": 0.12464921465968587, + "grad_norm": 1.9897749423980713, + "learning_rate": 4.836450989514155e-06, + "loss": 5.7293, + "step": 186 + }, + { + "epoch": 0.12531937172774868, + "grad_norm": 2.661517381668091, + "learning_rate": 4.834553986225145e-06, + "loss": 5.9449, + "step": 187 + }, + { + "epoch": 0.12598952879581152, + "grad_norm": 2.1688408851623535, + "learning_rate": 4.832646421024918e-06, + "loss": 5.9073, + "step": 188 + }, + { + "epoch": 0.12665968586387436, + "grad_norm": 3.2971296310424805, + "learning_rate": 4.83072830254362e-06, + "loss": 5.6963, + "step": 189 + }, + { + "epoch": 0.12732984293193716, + "grad_norm": 2.2415287494659424, + "learning_rate": 4.828799639459139e-06, + "loss": 5.5034, + "step": 190 + }, + { + "epoch": 0.128, + "grad_norm": 2.502039909362793, + "learning_rate": 4.826860440497065e-06, + "loss": 5.5988, + "step": 191 + }, + { + "epoch": 0.12867015706806284, + "grad_norm": 2.392843723297119, + "learning_rate": 4.824910714430659e-06, + "loss": 5.6986, + "step": 192 + }, + { + "epoch": 0.12934031413612565, + "grad_norm": 2.7530651092529297, + "learning_rate": 4.822950470080808e-06, + "loss": 5.8121, + "step": 193 + }, + { + "epoch": 0.1300104712041885, + "grad_norm": 2.3004884719848633, + "learning_rate": 4.820979716315983e-06, + "loss": 5.6649, + "step": 194 + }, + { + "epoch": 0.1306806282722513, + "grad_norm": 2.560392379760742, + "learning_rate": 4.8189984620522014e-06, + "loss": 5.7792, + "step": 195 + }, + { + "epoch": 0.13135078534031414, + "grad_norm": 2.2837352752685547, + "learning_rate": 4.8170067162529884e-06, + "loss": 5.7563, + "step": 196 + }, + { + "epoch": 0.13202094240837697, + "grad_norm": 2.555257558822632, + "learning_rate": 4.8150044879293336e-06, + "loss": 5.6809, + "step": 197 + }, + { + "epoch": 0.13269109947643978, + "grad_norm": 2.5194332599639893, + "learning_rate": 4.812991786139652e-06, + "loss": 5.8421, + "step": 198 + }, + { + "epoch": 0.13336125654450262, + "grad_norm": 2.4896628856658936, + "learning_rate": 4.810968619989742e-06, + "loss": 5.7051, + "step": 199 + }, + { + "epoch": 0.13403141361256546, + "grad_norm": 2.1717307567596436, + "learning_rate": 4.808934998632743e-06, + "loss": 5.5694, + "step": 200 + }, + { + "epoch": 0.13470157068062827, + "grad_norm": 1.9135693311691284, + "learning_rate": 4.806890931269098e-06, + "loss": 5.5108, + "step": 201 + }, + { + "epoch": 0.1353717277486911, + "grad_norm": 2.1709461212158203, + "learning_rate": 4.804836427146509e-06, + "loss": 5.7672, + "step": 202 + }, + { + "epoch": 0.13604188481675392, + "grad_norm": 3.3644444942474365, + "learning_rate": 4.8027714955598935e-06, + "loss": 5.6492, + "step": 203 + }, + { + "epoch": 0.13671204188481675, + "grad_norm": 2.060128927230835, + "learning_rate": 4.800696145851346e-06, + "loss": 5.8783, + "step": 204 + }, + { + "epoch": 0.1373821989528796, + "grad_norm": 2.0333595275878906, + "learning_rate": 4.798610387410097e-06, + "loss": 5.5442, + "step": 205 + }, + { + "epoch": 0.1380523560209424, + "grad_norm": 2.221376419067383, + "learning_rate": 4.7965142296724616e-06, + "loss": 5.582, + "step": 206 + }, + { + "epoch": 0.13872251308900524, + "grad_norm": 2.498304605484009, + "learning_rate": 4.794407682121809e-06, + "loss": 5.8169, + "step": 207 + }, + { + "epoch": 0.13939267015706805, + "grad_norm": 2.1738154888153076, + "learning_rate": 4.79229075428851e-06, + "loss": 5.65, + "step": 208 + }, + { + "epoch": 0.1400628272251309, + "grad_norm": 2.296131134033203, + "learning_rate": 4.7901634557499e-06, + "loss": 5.4729, + "step": 209 + }, + { + "epoch": 0.14073298429319372, + "grad_norm": 2.077704906463623, + "learning_rate": 4.788025796130232e-06, + "loss": 5.7508, + "step": 210 + }, + { + "epoch": 0.14140314136125653, + "grad_norm": 2.2950336933135986, + "learning_rate": 4.785877785100633e-06, + "loss": 5.6617, + "step": 211 + }, + { + "epoch": 0.14207329842931937, + "grad_norm": 2.310399293899536, + "learning_rate": 4.783719432379064e-06, + "loss": 5.4773, + "step": 212 + }, + { + "epoch": 0.1427434554973822, + "grad_norm": 2.098602771759033, + "learning_rate": 4.781550747730271e-06, + "loss": 5.8573, + "step": 213 + }, + { + "epoch": 0.14341361256544502, + "grad_norm": 2.5833826065063477, + "learning_rate": 4.779371740965747e-06, + "loss": 5.5664, + "step": 214 + }, + { + "epoch": 0.14408376963350786, + "grad_norm": 2.150275707244873, + "learning_rate": 4.7771824219436805e-06, + "loss": 5.4671, + "step": 215 + }, + { + "epoch": 0.14475392670157067, + "grad_norm": 2.187807559967041, + "learning_rate": 4.774982800568913e-06, + "loss": 5.5364, + "step": 216 + }, + { + "epoch": 0.1454240837696335, + "grad_norm": 2.631636619567871, + "learning_rate": 4.772772886792903e-06, + "loss": 5.5136, + "step": 217 + }, + { + "epoch": 0.14609424083769634, + "grad_norm": 2.5570361614227295, + "learning_rate": 4.770552690613665e-06, + "loss": 5.6872, + "step": 218 + }, + { + "epoch": 0.14676439790575915, + "grad_norm": 2.5417332649230957, + "learning_rate": 4.7683222220757375e-06, + "loss": 5.7036, + "step": 219 + }, + { + "epoch": 0.147434554973822, + "grad_norm": 2.375767230987549, + "learning_rate": 4.766081491270132e-06, + "loss": 5.5643, + "step": 220 + }, + { + "epoch": 0.14810471204188483, + "grad_norm": 1.9284825325012207, + "learning_rate": 4.7638305083342885e-06, + "loss": 5.5584, + "step": 221 + }, + { + "epoch": 0.14877486910994764, + "grad_norm": 2.257730484008789, + "learning_rate": 4.761569283452027e-06, + "loss": 5.8337, + "step": 222 + }, + { + "epoch": 0.14944502617801048, + "grad_norm": 2.2399933338165283, + "learning_rate": 4.7592978268535075e-06, + "loss": 5.6251, + "step": 223 + }, + { + "epoch": 0.15011518324607329, + "grad_norm": 2.2639455795288086, + "learning_rate": 4.757016148815177e-06, + "loss": 5.5898, + "step": 224 + }, + { + "epoch": 0.15078534031413612, + "grad_norm": 2.5909502506256104, + "learning_rate": 4.754724259659727e-06, + "loss": 5.4121, + "step": 225 + }, + { + "epoch": 0.15145549738219896, + "grad_norm": 1.9159077405929565, + "learning_rate": 4.752422169756048e-06, + "loss": 5.462, + "step": 226 + }, + { + "epoch": 0.15212565445026177, + "grad_norm": 1.896048665046692, + "learning_rate": 4.750109889519176e-06, + "loss": 5.7881, + "step": 227 + }, + { + "epoch": 0.1527958115183246, + "grad_norm": 2.2421371936798096, + "learning_rate": 4.747787429410253e-06, + "loss": 5.4056, + "step": 228 + }, + { + "epoch": 0.15346596858638745, + "grad_norm": 2.3694958686828613, + "learning_rate": 4.745454799936475e-06, + "loss": 5.4631, + "step": 229 + }, + { + "epoch": 0.15413612565445026, + "grad_norm": 2.3106985092163086, + "learning_rate": 4.743112011651047e-06, + "loss": 5.7385, + "step": 230 + }, + { + "epoch": 0.1548062827225131, + "grad_norm": 2.0970394611358643, + "learning_rate": 4.740759075153134e-06, + "loss": 5.5615, + "step": 231 + }, + { + "epoch": 0.1554764397905759, + "grad_norm": 2.0424554347991943, + "learning_rate": 4.738396001087811e-06, + "loss": 5.7584, + "step": 232 + }, + { + "epoch": 0.15614659685863874, + "grad_norm": 2.465092420578003, + "learning_rate": 4.73602280014602e-06, + "loss": 5.5121, + "step": 233 + }, + { + "epoch": 0.15681675392670158, + "grad_norm": 2.286259412765503, + "learning_rate": 4.733639483064517e-06, + "loss": 5.7908, + "step": 234 + }, + { + "epoch": 0.1574869109947644, + "grad_norm": 2.41261887550354, + "learning_rate": 4.731246060625826e-06, + "loss": 5.8801, + "step": 235 + }, + { + "epoch": 0.15815706806282723, + "grad_norm": 2.4263551235198975, + "learning_rate": 4.72884254365819e-06, + "loss": 5.4863, + "step": 236 + }, + { + "epoch": 0.15882722513089007, + "grad_norm": 2.1641643047332764, + "learning_rate": 4.7264289430355185e-06, + "loss": 5.7159, + "step": 237 + }, + { + "epoch": 0.15949738219895288, + "grad_norm": 2.152360200881958, + "learning_rate": 4.724005269677342e-06, + "loss": 5.3524, + "step": 238 + }, + { + "epoch": 0.1601675392670157, + "grad_norm": 2.6608517169952393, + "learning_rate": 4.721571534548766e-06, + "loss": 5.6378, + "step": 239 + }, + { + "epoch": 0.16083769633507852, + "grad_norm": 2.169257640838623, + "learning_rate": 4.71912774866041e-06, + "loss": 5.8246, + "step": 240 + }, + { + "epoch": 0.16150785340314136, + "grad_norm": 2.7111823558807373, + "learning_rate": 4.716673923068371e-06, + "loss": 5.6595, + "step": 241 + }, + { + "epoch": 0.1621780104712042, + "grad_norm": 2.112722635269165, + "learning_rate": 4.7142100688741645e-06, + "loss": 5.5694, + "step": 242 + }, + { + "epoch": 0.162848167539267, + "grad_norm": 2.59005069732666, + "learning_rate": 4.711736197224677e-06, + "loss": 5.6073, + "step": 243 + }, + { + "epoch": 0.16351832460732985, + "grad_norm": 2.3710269927978516, + "learning_rate": 4.709252319312117e-06, + "loss": 5.4579, + "step": 244 + }, + { + "epoch": 0.16418848167539268, + "grad_norm": 2.2614245414733887, + "learning_rate": 4.706758446373959e-06, + "loss": 5.5483, + "step": 245 + }, + { + "epoch": 0.1648586387434555, + "grad_norm": 2.088327407836914, + "learning_rate": 4.704254589692903e-06, + "loss": 5.4686, + "step": 246 + }, + { + "epoch": 0.16552879581151833, + "grad_norm": 2.09621524810791, + "learning_rate": 4.7017407605968125e-06, + "loss": 5.7135, + "step": 247 + }, + { + "epoch": 0.16619895287958114, + "grad_norm": 1.8950719833374023, + "learning_rate": 4.6992169704586685e-06, + "loss": 5.4741, + "step": 248 + }, + { + "epoch": 0.16686910994764398, + "grad_norm": 2.1435563564300537, + "learning_rate": 4.69668323069652e-06, + "loss": 5.4941, + "step": 249 + }, + { + "epoch": 0.16753926701570682, + "grad_norm": 2.2771897315979004, + "learning_rate": 4.694139552773426e-06, + "loss": 5.526, + "step": 250 + }, + { + "epoch": 0.16820942408376963, + "grad_norm": 2.7859890460968018, + "learning_rate": 4.69158594819741e-06, + "loss": 5.5536, + "step": 251 + }, + { + "epoch": 0.16887958115183246, + "grad_norm": 2.765713691711426, + "learning_rate": 4.689022428521405e-06, + "loss": 5.6265, + "step": 252 + }, + { + "epoch": 0.1695497382198953, + "grad_norm": 2.27315616607666, + "learning_rate": 4.6864490053432e-06, + "loss": 5.6416, + "step": 253 + }, + { + "epoch": 0.1702198952879581, + "grad_norm": 2.1748857498168945, + "learning_rate": 4.683865690305392e-06, + "loss": 5.688, + "step": 254 + }, + { + "epoch": 0.17089005235602095, + "grad_norm": 2.1356191635131836, + "learning_rate": 4.681272495095327e-06, + "loss": 5.5414, + "step": 255 + }, + { + "epoch": 0.17156020942408376, + "grad_norm": 2.1574511528015137, + "learning_rate": 4.6786694314450535e-06, + "loss": 5.6417, + "step": 256 + }, + { + "epoch": 0.1722303664921466, + "grad_norm": 2.3209385871887207, + "learning_rate": 4.6760565111312625e-06, + "loss": 5.5285, + "step": 257 + }, + { + "epoch": 0.17290052356020943, + "grad_norm": 2.666755437850952, + "learning_rate": 4.673433745975245e-06, + "loss": 5.6753, + "step": 258 + }, + { + "epoch": 0.17357068062827224, + "grad_norm": 2.35461163520813, + "learning_rate": 4.670801147842824e-06, + "loss": 5.4898, + "step": 259 + }, + { + "epoch": 0.17424083769633508, + "grad_norm": 2.502377986907959, + "learning_rate": 4.668158728644315e-06, + "loss": 5.1188, + "step": 260 + }, + { + "epoch": 0.1749109947643979, + "grad_norm": 2.217586040496826, + "learning_rate": 4.66550650033446e-06, + "loss": 5.5627, + "step": 261 + }, + { + "epoch": 0.17558115183246073, + "grad_norm": 2.145159959793091, + "learning_rate": 4.662844474912384e-06, + "loss": 5.2494, + "step": 262 + }, + { + "epoch": 0.17625130890052357, + "grad_norm": 2.508312702178955, + "learning_rate": 4.660172664421532e-06, + "loss": 5.6636, + "step": 263 + }, + { + "epoch": 0.17692146596858638, + "grad_norm": 2.927377939224243, + "learning_rate": 4.65749108094962e-06, + "loss": 5.6348, + "step": 264 + }, + { + "epoch": 0.17759162303664922, + "grad_norm": 2.3884453773498535, + "learning_rate": 4.654799736628579e-06, + "loss": 5.7108, + "step": 265 + }, + { + "epoch": 0.17826178010471205, + "grad_norm": 2.3559861183166504, + "learning_rate": 4.6520986436345e-06, + "loss": 5.5178, + "step": 266 + }, + { + "epoch": 0.17893193717277486, + "grad_norm": 1.8426997661590576, + "learning_rate": 4.649387814187575e-06, + "loss": 5.4479, + "step": 267 + }, + { + "epoch": 0.1796020942408377, + "grad_norm": 2.4238853454589844, + "learning_rate": 4.646667260552051e-06, + "loss": 5.5709, + "step": 268 + }, + { + "epoch": 0.1802722513089005, + "grad_norm": 6.363220691680908, + "learning_rate": 4.643936995036164e-06, + "loss": 5.6736, + "step": 269 + }, + { + "epoch": 0.18094240837696335, + "grad_norm": 2.6399664878845215, + "learning_rate": 4.641197029992093e-06, + "loss": 5.4715, + "step": 270 + }, + { + "epoch": 0.1816125654450262, + "grad_norm": 1.954572081565857, + "learning_rate": 4.638447377815892e-06, + "loss": 5.3817, + "step": 271 + }, + { + "epoch": 0.182282722513089, + "grad_norm": 2.6681385040283203, + "learning_rate": 4.635688050947449e-06, + "loss": 5.6239, + "step": 272 + }, + { + "epoch": 0.18295287958115183, + "grad_norm": 2.382075786590576, + "learning_rate": 4.632919061870417e-06, + "loss": 5.6783, + "step": 273 + }, + { + "epoch": 0.18362303664921467, + "grad_norm": 2.142279624938965, + "learning_rate": 4.630140423112164e-06, + "loss": 5.3735, + "step": 274 + }, + { + "epoch": 0.18429319371727748, + "grad_norm": 2.151935338973999, + "learning_rate": 4.627352147243715e-06, + "loss": 5.5223, + "step": 275 + }, + { + "epoch": 0.18496335078534032, + "grad_norm": 1.9508206844329834, + "learning_rate": 4.624554246879695e-06, + "loss": 5.2748, + "step": 276 + }, + { + "epoch": 0.18563350785340313, + "grad_norm": 2.103137969970703, + "learning_rate": 4.6217467346782714e-06, + "loss": 5.6259, + "step": 277 + }, + { + "epoch": 0.18630366492146597, + "grad_norm": 2.361621618270874, + "learning_rate": 4.618929623341097e-06, + "loss": 5.6894, + "step": 278 + }, + { + "epoch": 0.1869738219895288, + "grad_norm": 2.6873679161071777, + "learning_rate": 4.616102925613252e-06, + "loss": 5.4963, + "step": 279 + }, + { + "epoch": 0.18764397905759161, + "grad_norm": 2.3075380325317383, + "learning_rate": 4.613266654283189e-06, + "loss": 5.6025, + "step": 280 + }, + { + "epoch": 0.18831413612565445, + "grad_norm": 2.0175352096557617, + "learning_rate": 4.610420822182671e-06, + "loss": 5.5559, + "step": 281 + }, + { + "epoch": 0.1889842931937173, + "grad_norm": 2.1352314949035645, + "learning_rate": 4.607565442186719e-06, + "loss": 5.4375, + "step": 282 + }, + { + "epoch": 0.1896544502617801, + "grad_norm": 2.4613332748413086, + "learning_rate": 4.604700527213545e-06, + "loss": 5.6111, + "step": 283 + }, + { + "epoch": 0.19032460732984294, + "grad_norm": 2.250486135482788, + "learning_rate": 4.601826090224503e-06, + "loss": 5.8779, + "step": 284 + }, + { + "epoch": 0.19099476439790575, + "grad_norm": 2.21348237991333, + "learning_rate": 4.598942144224025e-06, + "loss": 5.3281, + "step": 285 + }, + { + "epoch": 0.19166492146596859, + "grad_norm": 2.185913562774658, + "learning_rate": 4.596048702259563e-06, + "loss": 5.6081, + "step": 286 + }, + { + "epoch": 0.19233507853403142, + "grad_norm": 2.3666489124298096, + "learning_rate": 4.593145777421529e-06, + "loss": 5.4173, + "step": 287 + }, + { + "epoch": 0.19300523560209423, + "grad_norm": 1.9570116996765137, + "learning_rate": 4.590233382843242e-06, + "loss": 5.6447, + "step": 288 + }, + { + "epoch": 0.19367539267015707, + "grad_norm": 3.0195152759552, + "learning_rate": 4.587311531700858e-06, + "loss": 5.495, + "step": 289 + }, + { + "epoch": 0.1943455497382199, + "grad_norm": 2.4136688709259033, + "learning_rate": 4.584380237213318e-06, + "loss": 5.2192, + "step": 290 + }, + { + "epoch": 0.19501570680628272, + "grad_norm": 2.426723003387451, + "learning_rate": 4.581439512642288e-06, + "loss": 5.522, + "step": 291 + }, + { + "epoch": 0.19568586387434556, + "grad_norm": 2.700432062149048, + "learning_rate": 4.578489371292097e-06, + "loss": 5.5662, + "step": 292 + }, + { + "epoch": 0.19635602094240837, + "grad_norm": 2.3877530097961426, + "learning_rate": 4.5755298265096725e-06, + "loss": 5.5128, + "step": 293 + }, + { + "epoch": 0.1970261780104712, + "grad_norm": 2.2122280597686768, + "learning_rate": 4.572560891684491e-06, + "loss": 5.7546, + "step": 294 + }, + { + "epoch": 0.19769633507853404, + "grad_norm": 2.132734775543213, + "learning_rate": 4.569582580248509e-06, + "loss": 5.5579, + "step": 295 + }, + { + "epoch": 0.19836649214659685, + "grad_norm": 2.368992328643799, + "learning_rate": 4.566594905676099e-06, + "loss": 5.6109, + "step": 296 + }, + { + "epoch": 0.1990366492146597, + "grad_norm": 2.3953449726104736, + "learning_rate": 4.563597881484004e-06, + "loss": 5.5182, + "step": 297 + }, + { + "epoch": 0.19970680628272253, + "grad_norm": 2.268723487854004, + "learning_rate": 4.560591521231259e-06, + "loss": 5.5564, + "step": 298 + }, + { + "epoch": 0.20037696335078534, + "grad_norm": 2.2275869846343994, + "learning_rate": 4.557575838519137e-06, + "loss": 5.5888, + "step": 299 + }, + { + "epoch": 0.20104712041884817, + "grad_norm": 1.8049523830413818, + "learning_rate": 4.554550846991091e-06, + "loss": 5.4499, + "step": 300 + }, + { + "epoch": 0.20171727748691098, + "grad_norm": 1.9729605913162231, + "learning_rate": 4.551516560332686e-06, + "loss": 5.5766, + "step": 301 + }, + { + "epoch": 0.20238743455497382, + "grad_norm": 1.977363109588623, + "learning_rate": 4.548472992271541e-06, + "loss": 5.6398, + "step": 302 + }, + { + "epoch": 0.20305759162303666, + "grad_norm": 2.736560821533203, + "learning_rate": 4.5454201565772635e-06, + "loss": 5.4898, + "step": 303 + }, + { + "epoch": 0.20372774869109947, + "grad_norm": 2.1042449474334717, + "learning_rate": 4.5423580670613925e-06, + "loss": 5.4519, + "step": 304 + }, + { + "epoch": 0.2043979057591623, + "grad_norm": 2.3877830505371094, + "learning_rate": 4.539286737577331e-06, + "loss": 5.402, + "step": 305 + }, + { + "epoch": 0.20506806282722512, + "grad_norm": 2.189049243927002, + "learning_rate": 4.536206182020284e-06, + "loss": 5.1588, + "step": 306 + }, + { + "epoch": 0.20573821989528795, + "grad_norm": 2.372809648513794, + "learning_rate": 4.533116414327199e-06, + "loss": 5.4298, + "step": 307 + }, + { + "epoch": 0.2064083769633508, + "grad_norm": 2.6864686012268066, + "learning_rate": 4.5300174484767e-06, + "loss": 5.056, + "step": 308 + }, + { + "epoch": 0.2070785340314136, + "grad_norm": 2.790855646133423, + "learning_rate": 4.5269092984890254e-06, + "loss": 5.8052, + "step": 309 + }, + { + "epoch": 0.20774869109947644, + "grad_norm": 2.2243454456329346, + "learning_rate": 4.523791978425962e-06, + "loss": 5.7378, + "step": 310 + }, + { + "epoch": 0.20841884816753928, + "grad_norm": 2.1781084537506104, + "learning_rate": 4.520665502390786e-06, + "loss": 5.4125, + "step": 311 + }, + { + "epoch": 0.2090890052356021, + "grad_norm": 2.8137004375457764, + "learning_rate": 4.517529884528196e-06, + "loss": 5.8347, + "step": 312 + }, + { + "epoch": 0.20975916230366493, + "grad_norm": 2.380362033843994, + "learning_rate": 4.51438513902425e-06, + "loss": 5.7121, + "step": 313 + }, + { + "epoch": 0.21042931937172774, + "grad_norm": 2.651210069656372, + "learning_rate": 4.5112312801063e-06, + "loss": 5.2444, + "step": 314 + }, + { + "epoch": 0.21109947643979057, + "grad_norm": 2.1358680725097656, + "learning_rate": 4.50806832204293e-06, + "loss": 5.4798, + "step": 315 + }, + { + "epoch": 0.2117696335078534, + "grad_norm": 1.9323924779891968, + "learning_rate": 4.5048962791438885e-06, + "loss": 5.2646, + "step": 316 + }, + { + "epoch": 0.21243979057591622, + "grad_norm": 2.2739667892456055, + "learning_rate": 4.501715165760027e-06, + "loss": 5.6293, + "step": 317 + }, + { + "epoch": 0.21310994764397906, + "grad_norm": 2.425351142883301, + "learning_rate": 4.4985249962832316e-06, + "loss": 5.1849, + "step": 318 + }, + { + "epoch": 0.2137801047120419, + "grad_norm": 1.904436469078064, + "learning_rate": 4.4953257851463595e-06, + "loss": 5.2887, + "step": 319 + }, + { + "epoch": 0.2144502617801047, + "grad_norm": 1.8905036449432373, + "learning_rate": 4.492117546823178e-06, + "loss": 5.2969, + "step": 320 + }, + { + "epoch": 0.21512041884816754, + "grad_norm": 2.0948808193206787, + "learning_rate": 4.488900295828289e-06, + "loss": 5.5535, + "step": 321 + }, + { + "epoch": 0.21579057591623035, + "grad_norm": 2.1106090545654297, + "learning_rate": 4.485674046717074e-06, + "loss": 5.5733, + "step": 322 + }, + { + "epoch": 0.2164607329842932, + "grad_norm": 2.204378128051758, + "learning_rate": 4.48243881408562e-06, + "loss": 5.5237, + "step": 323 + }, + { + "epoch": 0.21713089005235603, + "grad_norm": 2.3489532470703125, + "learning_rate": 4.479194612570659e-06, + "loss": 5.624, + "step": 324 + }, + { + "epoch": 0.21780104712041884, + "grad_norm": 2.2582662105560303, + "learning_rate": 4.475941456849501e-06, + "loss": 5.2971, + "step": 325 + }, + { + "epoch": 0.21847120418848168, + "grad_norm": 2.7756502628326416, + "learning_rate": 4.472679361639961e-06, + "loss": 5.1575, + "step": 326 + }, + { + "epoch": 0.21914136125654451, + "grad_norm": 2.4752912521362305, + "learning_rate": 4.469408341700304e-06, + "loss": 5.4496, + "step": 327 + }, + { + "epoch": 0.21981151832460732, + "grad_norm": 2.538774013519287, + "learning_rate": 4.466128411829168e-06, + "loss": 5.2787, + "step": 328 + }, + { + "epoch": 0.22048167539267016, + "grad_norm": 2.3332388401031494, + "learning_rate": 4.462839586865503e-06, + "loss": 5.6372, + "step": 329 + }, + { + "epoch": 0.22115183246073297, + "grad_norm": 2.3022541999816895, + "learning_rate": 4.459541881688501e-06, + "loss": 5.5492, + "step": 330 + }, + { + "epoch": 0.2218219895287958, + "grad_norm": 2.0971004962921143, + "learning_rate": 4.456235311217528e-06, + "loss": 5.4539, + "step": 331 + }, + { + "epoch": 0.22249214659685865, + "grad_norm": 2.689119815826416, + "learning_rate": 4.452919890412062e-06, + "loss": 5.43, + "step": 332 + }, + { + "epoch": 0.22316230366492146, + "grad_norm": 2.7383434772491455, + "learning_rate": 4.449595634271619e-06, + "loss": 5.4929, + "step": 333 + }, + { + "epoch": 0.2238324607329843, + "grad_norm": 2.2287087440490723, + "learning_rate": 4.446262557835686e-06, + "loss": 5.6617, + "step": 334 + }, + { + "epoch": 0.22450261780104713, + "grad_norm": 2.4107394218444824, + "learning_rate": 4.442920676183657e-06, + "loss": 5.4094, + "step": 335 + }, + { + "epoch": 0.22517277486910994, + "grad_norm": 2.970778465270996, + "learning_rate": 4.439570004434762e-06, + "loss": 5.59, + "step": 336 + }, + { + "epoch": 0.22584293193717278, + "grad_norm": 2.0888166427612305, + "learning_rate": 4.436210557747998e-06, + "loss": 5.2617, + "step": 337 + }, + { + "epoch": 0.2265130890052356, + "grad_norm": 1.821423053741455, + "learning_rate": 4.432842351322061e-06, + "loss": 5.1656, + "step": 338 + }, + { + "epoch": 0.22718324607329843, + "grad_norm": 2.560389757156372, + "learning_rate": 4.4294654003952785e-06, + "loss": 5.6349, + "step": 339 + }, + { + "epoch": 0.22785340314136127, + "grad_norm": 1.9959956407546997, + "learning_rate": 4.426079720245539e-06, + "loss": 5.5315, + "step": 340 + }, + { + "epoch": 0.22852356020942408, + "grad_norm": 2.0525870323181152, + "learning_rate": 4.422685326190224e-06, + "loss": 5.5211, + "step": 341 + }, + { + "epoch": 0.2291937172774869, + "grad_norm": 2.3681249618530273, + "learning_rate": 4.419282233586137e-06, + "loss": 5.5563, + "step": 342 + }, + { + "epoch": 0.22986387434554975, + "grad_norm": 2.2519259452819824, + "learning_rate": 4.415870457829436e-06, + "loss": 5.5249, + "step": 343 + }, + { + "epoch": 0.23053403141361256, + "grad_norm": 2.687836170196533, + "learning_rate": 4.412450014355564e-06, + "loss": 5.4244, + "step": 344 + }, + { + "epoch": 0.2312041884816754, + "grad_norm": 2.29353666305542, + "learning_rate": 4.409020918639177e-06, + "loss": 5.4925, + "step": 345 + }, + { + "epoch": 0.2318743455497382, + "grad_norm": 2.130478858947754, + "learning_rate": 4.405583186194074e-06, + "loss": 5.3689, + "step": 346 + }, + { + "epoch": 0.23254450261780105, + "grad_norm": 1.848215103149414, + "learning_rate": 4.402136832573132e-06, + "loss": 5.3763, + "step": 347 + }, + { + "epoch": 0.23321465968586388, + "grad_norm": 1.9483565092086792, + "learning_rate": 4.398681873368226e-06, + "loss": 5.8511, + "step": 348 + }, + { + "epoch": 0.2338848167539267, + "grad_norm": 1.8991060256958008, + "learning_rate": 4.395218324210169e-06, + "loss": 5.4366, + "step": 349 + }, + { + "epoch": 0.23455497382198953, + "grad_norm": 2.2358591556549072, + "learning_rate": 4.391746200768634e-06, + "loss": 5.3692, + "step": 350 + }, + { + "epoch": 0.23522513089005237, + "grad_norm": 2.185328960418701, + "learning_rate": 4.388265518752085e-06, + "loss": 5.1356, + "step": 351 + }, + { + "epoch": 0.23589528795811518, + "grad_norm": 2.137399911880493, + "learning_rate": 4.384776293907709e-06, + "loss": 5.2214, + "step": 352 + }, + { + "epoch": 0.23656544502617802, + "grad_norm": 1.9800633192062378, + "learning_rate": 4.38127854202134e-06, + "loss": 5.4082, + "step": 353 + }, + { + "epoch": 0.23723560209424083, + "grad_norm": 2.312138795852661, + "learning_rate": 4.3777722789173895e-06, + "loss": 5.6454, + "step": 354 + }, + { + "epoch": 0.23790575916230366, + "grad_norm": 2.105069160461426, + "learning_rate": 4.3742575204587774e-06, + "loss": 5.4411, + "step": 355 + }, + { + "epoch": 0.2385759162303665, + "grad_norm": 2.20829701423645, + "learning_rate": 4.370734282546856e-06, + "loss": 5.4063, + "step": 356 + }, + { + "epoch": 0.2392460732984293, + "grad_norm": 2.180429220199585, + "learning_rate": 4.36720258112134e-06, + "loss": 5.542, + "step": 357 + }, + { + "epoch": 0.23991623036649215, + "grad_norm": 2.4989399909973145, + "learning_rate": 4.363662432160236e-06, + "loss": 5.2325, + "step": 358 + }, + { + "epoch": 0.24058638743455496, + "grad_norm": 2.1118667125701904, + "learning_rate": 4.360113851679768e-06, + "loss": 5.5798, + "step": 359 + }, + { + "epoch": 0.2412565445026178, + "grad_norm": 2.0734126567840576, + "learning_rate": 4.356556855734304e-06, + "loss": 5.4093, + "step": 360 + }, + { + "epoch": 0.24192670157068064, + "grad_norm": 2.4159913063049316, + "learning_rate": 4.352991460416287e-06, + "loss": 5.3141, + "step": 361 + }, + { + "epoch": 0.24259685863874345, + "grad_norm": 2.5480008125305176, + "learning_rate": 4.349417681856159e-06, + "loss": 5.4403, + "step": 362 + }, + { + "epoch": 0.24326701570680628, + "grad_norm": 2.2297019958496094, + "learning_rate": 4.345835536222291e-06, + "loss": 5.4523, + "step": 363 + }, + { + "epoch": 0.24393717277486912, + "grad_norm": 2.1222262382507324, + "learning_rate": 4.3422450397209065e-06, + "loss": 5.3598, + "step": 364 + }, + { + "epoch": 0.24460732984293193, + "grad_norm": 2.1692678928375244, + "learning_rate": 4.338646208596009e-06, + "loss": 5.5585, + "step": 365 + }, + { + "epoch": 0.24527748691099477, + "grad_norm": 2.4140853881835938, + "learning_rate": 4.33503905912931e-06, + "loss": 5.0457, + "step": 366 + }, + { + "epoch": 0.24594764397905758, + "grad_norm": 2.552990436553955, + "learning_rate": 4.331423607640155e-06, + "loss": 5.3984, + "step": 367 + }, + { + "epoch": 0.24661780104712042, + "grad_norm": 2.685238838195801, + "learning_rate": 4.32779987048545e-06, + "loss": 5.3119, + "step": 368 + }, + { + "epoch": 0.24728795811518325, + "grad_norm": 2.309687376022339, + "learning_rate": 4.324167864059584e-06, + "loss": 5.4823, + "step": 369 + }, + { + "epoch": 0.24795811518324606, + "grad_norm": 2.3168299198150635, + "learning_rate": 4.32052760479436e-06, + "loss": 5.4587, + "step": 370 + }, + { + "epoch": 0.2486282722513089, + "grad_norm": 2.093782424926758, + "learning_rate": 4.316879109158918e-06, + "loss": 5.4821, + "step": 371 + }, + { + "epoch": 0.24929842931937174, + "grad_norm": 2.271437406539917, + "learning_rate": 4.31322239365966e-06, + "loss": 5.4722, + "step": 372 + }, + { + "epoch": 0.24996858638743455, + "grad_norm": 2.1802608966827393, + "learning_rate": 4.309557474840174e-06, + "loss": 5.5796, + "step": 373 + }, + { + "epoch": 0.25063874345549736, + "grad_norm": 2.491943597793579, + "learning_rate": 4.305884369281167e-06, + "loss": 5.5483, + "step": 374 + }, + { + "epoch": 0.2513089005235602, + "grad_norm": 2.3541171550750732, + "learning_rate": 4.302203093600379e-06, + "loss": 5.4499, + "step": 375 + }, + { + "epoch": 0.25197905759162303, + "grad_norm": 1.8869248628616333, + "learning_rate": 4.298513664452513e-06, + "loss": 5.2755, + "step": 376 + }, + { + "epoch": 0.2526492146596859, + "grad_norm": 2.284665822982788, + "learning_rate": 4.294816098529163e-06, + "loss": 5.3206, + "step": 377 + }, + { + "epoch": 0.2533193717277487, + "grad_norm": 2.177415609359741, + "learning_rate": 4.291110412558733e-06, + "loss": 5.5821, + "step": 378 + }, + { + "epoch": 0.2539895287958115, + "grad_norm": 2.1283631324768066, + "learning_rate": 4.287396623306363e-06, + "loss": 5.226, + "step": 379 + }, + { + "epoch": 0.25465968586387433, + "grad_norm": 2.4899802207946777, + "learning_rate": 4.283674747573856e-06, + "loss": 5.343, + "step": 380 + }, + { + "epoch": 0.25532984293193717, + "grad_norm": 2.8830642700195312, + "learning_rate": 4.279944802199597e-06, + "loss": 5.2881, + "step": 381 + }, + { + "epoch": 0.256, + "grad_norm": 2.234779119491577, + "learning_rate": 4.2762068040584795e-06, + "loss": 5.6279, + "step": 382 + }, + { + "epoch": 0.25667015706806284, + "grad_norm": 2.346729278564453, + "learning_rate": 4.272460770061832e-06, + "loss": 5.3688, + "step": 383 + }, + { + "epoch": 0.2573403141361257, + "grad_norm": 2.206702470779419, + "learning_rate": 4.268706717157334e-06, + "loss": 5.6164, + "step": 384 + }, + { + "epoch": 0.25801047120418846, + "grad_norm": 2.44124698638916, + "learning_rate": 4.264944662328947e-06, + "loss": 5.4314, + "step": 385 + }, + { + "epoch": 0.2586806282722513, + "grad_norm": 1.9755254983901978, + "learning_rate": 4.261174622596835e-06, + "loss": 5.2068, + "step": 386 + }, + { + "epoch": 0.25935078534031414, + "grad_norm": 1.844270944595337, + "learning_rate": 4.257396615017285e-06, + "loss": 5.3666, + "step": 387 + }, + { + "epoch": 0.260020942408377, + "grad_norm": 2.3097944259643555, + "learning_rate": 4.253610656682633e-06, + "loss": 5.3687, + "step": 388 + }, + { + "epoch": 0.2606910994764398, + "grad_norm": 2.1606905460357666, + "learning_rate": 4.2498167647211865e-06, + "loss": 5.3283, + "step": 389 + }, + { + "epoch": 0.2613612565445026, + "grad_norm": 2.251303195953369, + "learning_rate": 4.246014956297142e-06, + "loss": 5.4695, + "step": 390 + }, + { + "epoch": 0.26203141361256543, + "grad_norm": 2.2528789043426514, + "learning_rate": 4.242205248610518e-06, + "loss": 5.4339, + "step": 391 + }, + { + "epoch": 0.26270157068062827, + "grad_norm": 1.8444974422454834, + "learning_rate": 4.238387658897063e-06, + "loss": 5.5597, + "step": 392 + }, + { + "epoch": 0.2633717277486911, + "grad_norm": 2.023601531982422, + "learning_rate": 4.234562204428192e-06, + "loss": 5.4275, + "step": 393 + }, + { + "epoch": 0.26404188481675395, + "grad_norm": 2.0824105739593506, + "learning_rate": 4.230728902510896e-06, + "loss": 5.3526, + "step": 394 + }, + { + "epoch": 0.26471204188481673, + "grad_norm": 2.2396888732910156, + "learning_rate": 4.226887770487675e-06, + "loss": 5.5526, + "step": 395 + }, + { + "epoch": 0.26538219895287957, + "grad_norm": 2.153627395629883, + "learning_rate": 4.223038825736446e-06, + "loss": 5.3724, + "step": 396 + }, + { + "epoch": 0.2660523560209424, + "grad_norm": 2.5046567916870117, + "learning_rate": 4.2191820856704786e-06, + "loss": 5.426, + "step": 397 + }, + { + "epoch": 0.26672251308900524, + "grad_norm": 2.2566487789154053, + "learning_rate": 4.215317567738308e-06, + "loss": 5.2797, + "step": 398 + }, + { + "epoch": 0.2673926701570681, + "grad_norm": 2.192222833633423, + "learning_rate": 4.2114452894236565e-06, + "loss": 5.2759, + "step": 399 + }, + { + "epoch": 0.2680628272251309, + "grad_norm": 2.016427993774414, + "learning_rate": 4.207565268245356e-06, + "loss": 5.5729, + "step": 400 + }, + { + "epoch": 0.2687329842931937, + "grad_norm": 1.8648808002471924, + "learning_rate": 4.203677521757269e-06, + "loss": 5.6048, + "step": 401 + }, + { + "epoch": 0.26940314136125654, + "grad_norm": 2.325082778930664, + "learning_rate": 4.1997820675482085e-06, + "loss": 5.3213, + "step": 402 + }, + { + "epoch": 0.2700732984293194, + "grad_norm": 2.05472469329834, + "learning_rate": 4.19587892324186e-06, + "loss": 5.2503, + "step": 403 + }, + { + "epoch": 0.2707434554973822, + "grad_norm": 2.525400400161743, + "learning_rate": 4.191968106496696e-06, + "loss": 5.3963, + "step": 404 + }, + { + "epoch": 0.27141361256544505, + "grad_norm": 2.3485240936279297, + "learning_rate": 4.188049635005904e-06, + "loss": 5.4384, + "step": 405 + }, + { + "epoch": 0.27208376963350783, + "grad_norm": 2.287038803100586, + "learning_rate": 4.184123526497303e-06, + "loss": 5.2257, + "step": 406 + }, + { + "epoch": 0.27275392670157067, + "grad_norm": 2.2160794734954834, + "learning_rate": 4.180189798733259e-06, + "loss": 5.2822, + "step": 407 + }, + { + "epoch": 0.2734240837696335, + "grad_norm": 2.236074209213257, + "learning_rate": 4.176248469510616e-06, + "loss": 5.1092, + "step": 408 + }, + { + "epoch": 0.27409424083769635, + "grad_norm": 2.395686149597168, + "learning_rate": 4.172299556660601e-06, + "loss": 5.2411, + "step": 409 + }, + { + "epoch": 0.2747643979057592, + "grad_norm": 2.5743703842163086, + "learning_rate": 4.168343078048756e-06, + "loss": 5.4214, + "step": 410 + }, + { + "epoch": 0.27543455497382197, + "grad_norm": 2.148124933242798, + "learning_rate": 4.164379051574848e-06, + "loss": 5.258, + "step": 411 + }, + { + "epoch": 0.2761047120418848, + "grad_norm": 1.816332459449768, + "learning_rate": 4.160407495172794e-06, + "loss": 5.2502, + "step": 412 + }, + { + "epoch": 0.27677486910994764, + "grad_norm": 2.5482685565948486, + "learning_rate": 4.156428426810576e-06, + "loss": 5.2039, + "step": 413 + }, + { + "epoch": 0.2774450261780105, + "grad_norm": 2.4875996112823486, + "learning_rate": 4.152441864490162e-06, + "loss": 5.2924, + "step": 414 + }, + { + "epoch": 0.2781151832460733, + "grad_norm": 2.5088934898376465, + "learning_rate": 4.1484478262474255e-06, + "loss": 5.3185, + "step": 415 + }, + { + "epoch": 0.2787853403141361, + "grad_norm": 2.1406543254852295, + "learning_rate": 4.144446330152061e-06, + "loss": 5.3695, + "step": 416 + }, + { + "epoch": 0.27945549738219894, + "grad_norm": 2.8924715518951416, + "learning_rate": 4.140437394307502e-06, + "loss": 5.4743, + "step": 417 + }, + { + "epoch": 0.2801256544502618, + "grad_norm": 2.35742449760437, + "learning_rate": 4.136421036850844e-06, + "loss": 5.259, + "step": 418 + }, + { + "epoch": 0.2807958115183246, + "grad_norm": 2.7321271896362305, + "learning_rate": 4.132397275952756e-06, + "loss": 5.4331, + "step": 419 + }, + { + "epoch": 0.28146596858638745, + "grad_norm": 2.255457878112793, + "learning_rate": 4.128366129817403e-06, + "loss": 5.6379, + "step": 420 + }, + { + "epoch": 0.2821361256544503, + "grad_norm": 2.1841540336608887, + "learning_rate": 4.124327616682362e-06, + "loss": 5.4079, + "step": 421 + }, + { + "epoch": 0.28280628272251307, + "grad_norm": 1.8602941036224365, + "learning_rate": 4.12028175481854e-06, + "loss": 5.156, + "step": 422 + }, + { + "epoch": 0.2834764397905759, + "grad_norm": 4.119997024536133, + "learning_rate": 4.116228562530089e-06, + "loss": 5.3936, + "step": 423 + }, + { + "epoch": 0.28414659685863874, + "grad_norm": 2.1611454486846924, + "learning_rate": 4.112168058154327e-06, + "loss": 5.5529, + "step": 424 + }, + { + "epoch": 0.2848167539267016, + "grad_norm": 2.1483073234558105, + "learning_rate": 4.108100260061652e-06, + "loss": 5.1862, + "step": 425 + }, + { + "epoch": 0.2854869109947644, + "grad_norm": 1.9627047777175903, + "learning_rate": 4.10402518665546e-06, + "loss": 5.278, + "step": 426 + }, + { + "epoch": 0.2861570680628272, + "grad_norm": 2.049126386642456, + "learning_rate": 4.099942856372064e-06, + "loss": 5.1944, + "step": 427 + }, + { + "epoch": 0.28682722513089004, + "grad_norm": 2.7416749000549316, + "learning_rate": 4.095853287680604e-06, + "loss": 5.2854, + "step": 428 + }, + { + "epoch": 0.2874973821989529, + "grad_norm": 2.39632511138916, + "learning_rate": 4.0917564990829705e-06, + "loss": 5.5541, + "step": 429 + }, + { + "epoch": 0.2881675392670157, + "grad_norm": 2.582184076309204, + "learning_rate": 4.0876525091137195e-06, + "loss": 5.3696, + "step": 430 + }, + { + "epoch": 0.28883769633507855, + "grad_norm": 2.024606227874756, + "learning_rate": 4.083541336339984e-06, + "loss": 5.2294, + "step": 431 + }, + { + "epoch": 0.28950785340314134, + "grad_norm": 2.250382423400879, + "learning_rate": 4.079422999361395e-06, + "loss": 5.4614, + "step": 432 + }, + { + "epoch": 0.2901780104712042, + "grad_norm": 2.6855008602142334, + "learning_rate": 4.075297516809994e-06, + "loss": 5.3138, + "step": 433 + }, + { + "epoch": 0.290848167539267, + "grad_norm": 2.4782090187072754, + "learning_rate": 4.071164907350153e-06, + "loss": 5.3277, + "step": 434 + }, + { + "epoch": 0.29151832460732985, + "grad_norm": 2.666271448135376, + "learning_rate": 4.067025189678485e-06, + "loss": 5.3539, + "step": 435 + }, + { + "epoch": 0.2921884816753927, + "grad_norm": 2.2352638244628906, + "learning_rate": 4.062878382523764e-06, + "loss": 5.2922, + "step": 436 + }, + { + "epoch": 0.2928586387434555, + "grad_norm": 2.0030295848846436, + "learning_rate": 4.058724504646834e-06, + "loss": 5.2213, + "step": 437 + }, + { + "epoch": 0.2935287958115183, + "grad_norm": 2.5756025314331055, + "learning_rate": 4.054563574840532e-06, + "loss": 5.4836, + "step": 438 + }, + { + "epoch": 0.29419895287958114, + "grad_norm": 2.685851573944092, + "learning_rate": 4.050395611929599e-06, + "loss": 5.1647, + "step": 439 + }, + { + "epoch": 0.294869109947644, + "grad_norm": 2.5466887950897217, + "learning_rate": 4.046220634770591e-06, + "loss": 5.3462, + "step": 440 + }, + { + "epoch": 0.2955392670157068, + "grad_norm": 2.186382293701172, + "learning_rate": 4.042038662251801e-06, + "loss": 5.2909, + "step": 441 + }, + { + "epoch": 0.29620942408376966, + "grad_norm": 2.2794177532196045, + "learning_rate": 4.0378497132931724e-06, + "loss": 5.347, + "step": 442 + }, + { + "epoch": 0.29687958115183244, + "grad_norm": 2.45033860206604, + "learning_rate": 4.033653806846204e-06, + "loss": 5.5143, + "step": 443 + }, + { + "epoch": 0.2975497382198953, + "grad_norm": 1.970142126083374, + "learning_rate": 4.0294509618938785e-06, + "loss": 5.1962, + "step": 444 + }, + { + "epoch": 0.2982198952879581, + "grad_norm": 1.9088056087493896, + "learning_rate": 4.025241197450566e-06, + "loss": 5.2455, + "step": 445 + }, + { + "epoch": 0.29889005235602095, + "grad_norm": 1.9862786531448364, + "learning_rate": 4.021024532561941e-06, + "loss": 5.3017, + "step": 446 + }, + { + "epoch": 0.2995602094240838, + "grad_norm": 1.6905484199523926, + "learning_rate": 4.0168009863049e-06, + "loss": 5.4328, + "step": 447 + }, + { + "epoch": 0.30023036649214657, + "grad_norm": 2.0439505577087402, + "learning_rate": 4.012570577787469e-06, + "loss": 5.1759, + "step": 448 + }, + { + "epoch": 0.3009005235602094, + "grad_norm": 2.0029120445251465, + "learning_rate": 4.008333326148721e-06, + "loss": 5.1908, + "step": 449 + }, + { + "epoch": 0.30157068062827225, + "grad_norm": 2.239750385284424, + "learning_rate": 4.004089250558688e-06, + "loss": 5.2547, + "step": 450 + }, + { + "epoch": 0.3022408376963351, + "grad_norm": 2.2577266693115234, + "learning_rate": 3.9998383702182764e-06, + "loss": 5.3508, + "step": 451 + }, + { + "epoch": 0.3029109947643979, + "grad_norm": 2.021261692047119, + "learning_rate": 3.9955807043591765e-06, + "loss": 5.4991, + "step": 452 + }, + { + "epoch": 0.30358115183246076, + "grad_norm": 2.3646843433380127, + "learning_rate": 3.991316272243778e-06, + "loss": 5.5244, + "step": 453 + }, + { + "epoch": 0.30425130890052354, + "grad_norm": 2.3185791969299316, + "learning_rate": 3.987045093165083e-06, + "loss": 5.5034, + "step": 454 + }, + { + "epoch": 0.3049214659685864, + "grad_norm": 2.2033495903015137, + "learning_rate": 3.982767186446615e-06, + "loss": 5.4536, + "step": 455 + }, + { + "epoch": 0.3055916230366492, + "grad_norm": 2.128403663635254, + "learning_rate": 3.978482571442339e-06, + "loss": 5.3952, + "step": 456 + }, + { + "epoch": 0.30626178010471206, + "grad_norm": 2.455321788787842, + "learning_rate": 3.974191267536567e-06, + "loss": 5.3201, + "step": 457 + }, + { + "epoch": 0.3069319371727749, + "grad_norm": 2.422396659851074, + "learning_rate": 3.9698932941438714e-06, + "loss": 5.3798, + "step": 458 + }, + { + "epoch": 0.3076020942408377, + "grad_norm": 2.6799330711364746, + "learning_rate": 3.965588670709002e-06, + "loss": 5.2903, + "step": 459 + }, + { + "epoch": 0.3082722513089005, + "grad_norm": 2.433835506439209, + "learning_rate": 3.9612774167067905e-06, + "loss": 5.4659, + "step": 460 + }, + { + "epoch": 0.30894240837696335, + "grad_norm": 2.2600831985473633, + "learning_rate": 3.95695955164207e-06, + "loss": 5.4558, + "step": 461 + }, + { + "epoch": 0.3096125654450262, + "grad_norm": 2.3049960136413574, + "learning_rate": 3.95263509504958e-06, + "loss": 5.5923, + "step": 462 + }, + { + "epoch": 0.310282722513089, + "grad_norm": 2.1892147064208984, + "learning_rate": 3.948304066493885e-06, + "loss": 5.5232, + "step": 463 + }, + { + "epoch": 0.3109528795811518, + "grad_norm": 2.799586057662964, + "learning_rate": 3.9439664855692774e-06, + "loss": 5.2964, + "step": 464 + }, + { + "epoch": 0.31162303664921465, + "grad_norm": 2.493480682373047, + "learning_rate": 3.939622371899697e-06, + "loss": 5.489, + "step": 465 + }, + { + "epoch": 0.3122931937172775, + "grad_norm": 2.2202532291412354, + "learning_rate": 3.935271745138637e-06, + "loss": 5.3784, + "step": 466 + }, + { + "epoch": 0.3129633507853403, + "grad_norm": 1.9341351985931396, + "learning_rate": 3.9309146249690595e-06, + "loss": 5.3933, + "step": 467 + }, + { + "epoch": 0.31363350785340316, + "grad_norm": 1.8867202997207642, + "learning_rate": 3.9265510311033005e-06, + "loss": 5.3793, + "step": 468 + }, + { + "epoch": 0.31430366492146594, + "grad_norm": 2.1868042945861816, + "learning_rate": 3.9221809832829866e-06, + "loss": 5.2335, + "step": 469 + }, + { + "epoch": 0.3149738219895288, + "grad_norm": 2.428607940673828, + "learning_rate": 3.917804501278942e-06, + "loss": 5.6313, + "step": 470 + }, + { + "epoch": 0.3156439790575916, + "grad_norm": 2.402775287628174, + "learning_rate": 3.9134216048911e-06, + "loss": 5.4608, + "step": 471 + }, + { + "epoch": 0.31631413612565445, + "grad_norm": 1.9883973598480225, + "learning_rate": 3.909032313948415e-06, + "loss": 5.3552, + "step": 472 + }, + { + "epoch": 0.3169842931937173, + "grad_norm": 2.1537022590637207, + "learning_rate": 3.90463664830877e-06, + "loss": 5.3092, + "step": 473 + }, + { + "epoch": 0.31765445026178013, + "grad_norm": 2.195376396179199, + "learning_rate": 3.90023462785889e-06, + "loss": 5.1523, + "step": 474 + }, + { + "epoch": 0.3183246073298429, + "grad_norm": 1.9080901145935059, + "learning_rate": 3.895826272514247e-06, + "loss": 5.1277, + "step": 475 + }, + { + "epoch": 0.31899476439790575, + "grad_norm": 2.5250730514526367, + "learning_rate": 3.891411602218976e-06, + "loss": 5.3874, + "step": 476 + }, + { + "epoch": 0.3196649214659686, + "grad_norm": 2.1713311672210693, + "learning_rate": 3.8869906369457815e-06, + "loss": 5.1199, + "step": 477 + }, + { + "epoch": 0.3203350785340314, + "grad_norm": 2.3029656410217285, + "learning_rate": 3.882563396695846e-06, + "loss": 5.2659, + "step": 478 + }, + { + "epoch": 0.32100523560209426, + "grad_norm": 2.2043564319610596, + "learning_rate": 3.878129901498742e-06, + "loss": 5.2733, + "step": 479 + }, + { + "epoch": 0.32167539267015705, + "grad_norm": 2.431480884552002, + "learning_rate": 3.8736901714123415e-06, + "loss": 5.2519, + "step": 480 + }, + { + "epoch": 0.3223455497382199, + "grad_norm": 2.155893325805664, + "learning_rate": 3.869244226522723e-06, + "loss": 5.4178, + "step": 481 + }, + { + "epoch": 0.3230157068062827, + "grad_norm": 2.1351659297943115, + "learning_rate": 3.864792086944081e-06, + "loss": 5.5612, + "step": 482 + }, + { + "epoch": 0.32368586387434556, + "grad_norm": 2.427992105484009, + "learning_rate": 3.860333772818639e-06, + "loss": 5.2741, + "step": 483 + }, + { + "epoch": 0.3243560209424084, + "grad_norm": 2.6776323318481445, + "learning_rate": 3.85586930431655e-06, + "loss": 5.4813, + "step": 484 + }, + { + "epoch": 0.3250261780104712, + "grad_norm": 2.2945733070373535, + "learning_rate": 3.851398701635815e-06, + "loss": 5.5534, + "step": 485 + }, + { + "epoch": 0.325696335078534, + "grad_norm": 2.452486515045166, + "learning_rate": 3.8469219850021845e-06, + "loss": 5.3351, + "step": 486 + }, + { + "epoch": 0.32636649214659685, + "grad_norm": 2.1009156703948975, + "learning_rate": 3.842439174669072e-06, + "loss": 4.9801, + "step": 487 + }, + { + "epoch": 0.3270366492146597, + "grad_norm": 2.837606906890869, + "learning_rate": 3.837950290917457e-06, + "loss": 5.4142, + "step": 488 + }, + { + "epoch": 0.32770680628272253, + "grad_norm": 1.9545905590057373, + "learning_rate": 3.8334553540557986e-06, + "loss": 4.9649, + "step": 489 + }, + { + "epoch": 0.32837696335078537, + "grad_norm": 1.956103801727295, + "learning_rate": 3.828954384419939e-06, + "loss": 5.4059, + "step": 490 + }, + { + "epoch": 0.32904712041884815, + "grad_norm": 2.232672691345215, + "learning_rate": 3.8244474023730155e-06, + "loss": 5.074, + "step": 491 + }, + { + "epoch": 0.329717277486911, + "grad_norm": 2.234177827835083, + "learning_rate": 3.819934428305365e-06, + "loss": 5.3179, + "step": 492 + }, + { + "epoch": 0.3303874345549738, + "grad_norm": 1.9164519309997559, + "learning_rate": 3.815415482634436e-06, + "loss": 5.2802, + "step": 493 + }, + { + "epoch": 0.33105759162303666, + "grad_norm": 1.753132700920105, + "learning_rate": 3.8108905858046896e-06, + "loss": 5.1147, + "step": 494 + }, + { + "epoch": 0.3317277486910995, + "grad_norm": 2.30802845954895, + "learning_rate": 3.8063597582875135e-06, + "loss": 5.4144, + "step": 495 + }, + { + "epoch": 0.3323979057591623, + "grad_norm": 1.794412612915039, + "learning_rate": 3.8018230205811267e-06, + "loss": 5.255, + "step": 496 + }, + { + "epoch": 0.3330680628272251, + "grad_norm": 2.070859670639038, + "learning_rate": 3.797280393210487e-06, + "loss": 5.4724, + "step": 497 + }, + { + "epoch": 0.33373821989528796, + "grad_norm": 2.355644464492798, + "learning_rate": 3.7927318967271966e-06, + "loss": 5.443, + "step": 498 + }, + { + "epoch": 0.3344083769633508, + "grad_norm": 2.239596128463745, + "learning_rate": 3.788177551709411e-06, + "loss": 5.1885, + "step": 499 + }, + { + "epoch": 0.33507853403141363, + "grad_norm": 2.1752090454101562, + "learning_rate": 3.7836173787617475e-06, + "loss": 5.3933, + "step": 500 + }, + { + "epoch": 0.3357486910994764, + "grad_norm": 2.6792967319488525, + "learning_rate": 3.779051398515188e-06, + "loss": 5.2386, + "step": 501 + }, + { + "epoch": 0.33641884816753925, + "grad_norm": 2.3228695392608643, + "learning_rate": 3.774479631626986e-06, + "loss": 5.3472, + "step": 502 + }, + { + "epoch": 0.3370890052356021, + "grad_norm": 2.5239968299865723, + "learning_rate": 3.7699020987805784e-06, + "loss": 5.2552, + "step": 503 + }, + { + "epoch": 0.33775916230366493, + "grad_norm": 2.4592580795288086, + "learning_rate": 3.7653188206854867e-06, + "loss": 5.211, + "step": 504 + }, + { + "epoch": 0.33842931937172777, + "grad_norm": 2.3289928436279297, + "learning_rate": 3.760729818077224e-06, + "loss": 5.3682, + "step": 505 + }, + { + "epoch": 0.3390994764397906, + "grad_norm": 2.0905239582061768, + "learning_rate": 3.756135111717202e-06, + "loss": 5.3095, + "step": 506 + }, + { + "epoch": 0.3397696335078534, + "grad_norm": 2.084735870361328, + "learning_rate": 3.75153472239264e-06, + "loss": 5.0262, + "step": 507 + }, + { + "epoch": 0.3404397905759162, + "grad_norm": 2.0062642097473145, + "learning_rate": 3.746928670916464e-06, + "loss": 5.3134, + "step": 508 + }, + { + "epoch": 0.34110994764397906, + "grad_norm": 2.108950138092041, + "learning_rate": 3.7423169781272177e-06, + "loss": 5.439, + "step": 509 + }, + { + "epoch": 0.3417801047120419, + "grad_norm": 2.0042457580566406, + "learning_rate": 3.7376996648889703e-06, + "loss": 5.5338, + "step": 510 + }, + { + "epoch": 0.34245026178010474, + "grad_norm": 2.4976985454559326, + "learning_rate": 3.733076752091214e-06, + "loss": 5.3151, + "step": 511 + }, + { + "epoch": 0.3431204188481675, + "grad_norm": 2.1041195392608643, + "learning_rate": 3.728448260648777e-06, + "loss": 5.2065, + "step": 512 + }, + { + "epoch": 0.34379057591623036, + "grad_norm": 2.285289764404297, + "learning_rate": 3.7238142115017262e-06, + "loss": 5.2395, + "step": 513 + }, + { + "epoch": 0.3444607329842932, + "grad_norm": 2.4077186584472656, + "learning_rate": 3.719174625615273e-06, + "loss": 5.1368, + "step": 514 + }, + { + "epoch": 0.34513089005235603, + "grad_norm": 2.0603976249694824, + "learning_rate": 3.7145295239796747e-06, + "loss": 5.273, + "step": 515 + }, + { + "epoch": 0.34580104712041887, + "grad_norm": 1.9497524499893188, + "learning_rate": 3.709878927610146e-06, + "loss": 5.2622, + "step": 516 + }, + { + "epoch": 0.34647120418848165, + "grad_norm": 2.6009366512298584, + "learning_rate": 3.705222857546761e-06, + "loss": 5.48, + "step": 517 + }, + { + "epoch": 0.3471413612565445, + "grad_norm": 2.330073833465576, + "learning_rate": 3.700561334854355e-06, + "loss": 5.2746, + "step": 518 + }, + { + "epoch": 0.3478115183246073, + "grad_norm": 2.2093427181243896, + "learning_rate": 3.6958943806224333e-06, + "loss": 5.4129, + "step": 519 + }, + { + "epoch": 0.34848167539267016, + "grad_norm": 2.3132288455963135, + "learning_rate": 3.6912220159650757e-06, + "loss": 5.3603, + "step": 520 + }, + { + "epoch": 0.349151832460733, + "grad_norm": 2.2647898197174072, + "learning_rate": 3.686544262020838e-06, + "loss": 5.294, + "step": 521 + }, + { + "epoch": 0.3498219895287958, + "grad_norm": 2.105992317199707, + "learning_rate": 3.6818611399526576e-06, + "loss": 5.2418, + "step": 522 + }, + { + "epoch": 0.3504921465968586, + "grad_norm": 2.493412494659424, + "learning_rate": 3.6771726709477594e-06, + "loss": 5.1907, + "step": 523 + }, + { + "epoch": 0.35116230366492146, + "grad_norm": 2.1870672702789307, + "learning_rate": 3.6724788762175587e-06, + "loss": 5.2098, + "step": 524 + }, + { + "epoch": 0.3518324607329843, + "grad_norm": 2.307648181915283, + "learning_rate": 3.6677797769975653e-06, + "loss": 5.3077, + "step": 525 + }, + { + "epoch": 0.35250261780104714, + "grad_norm": 2.2626419067382812, + "learning_rate": 3.6630753945472854e-06, + "loss": 5.1255, + "step": 526 + }, + { + "epoch": 0.35317277486911, + "grad_norm": 2.2895169258117676, + "learning_rate": 3.658365750150131e-06, + "loss": 5.3941, + "step": 527 + }, + { + "epoch": 0.35384293193717276, + "grad_norm": 2.3610305786132812, + "learning_rate": 3.653650865113317e-06, + "loss": 5.4692, + "step": 528 + }, + { + "epoch": 0.3545130890052356, + "grad_norm": 2.173130750656128, + "learning_rate": 3.6489307607677694e-06, + "loss": 5.1767, + "step": 529 + }, + { + "epoch": 0.35518324607329843, + "grad_norm": 2.2770626544952393, + "learning_rate": 3.644205458468026e-06, + "loss": 5.5926, + "step": 530 + }, + { + "epoch": 0.35585340314136127, + "grad_norm": 2.6587777137756348, + "learning_rate": 3.6394749795921425e-06, + "loss": 5.3162, + "step": 531 + }, + { + "epoch": 0.3565235602094241, + "grad_norm": 2.2764198780059814, + "learning_rate": 3.6347393455415925e-06, + "loss": 5.4997, + "step": 532 + }, + { + "epoch": 0.3571937172774869, + "grad_norm": 2.466233491897583, + "learning_rate": 3.6299985777411744e-06, + "loss": 5.3123, + "step": 533 + }, + { + "epoch": 0.3578638743455497, + "grad_norm": 2.2345104217529297, + "learning_rate": 3.625252697638912e-06, + "loss": 5.2243, + "step": 534 + }, + { + "epoch": 0.35853403141361256, + "grad_norm": 1.9270764589309692, + "learning_rate": 3.6205017267059567e-06, + "loss": 5.11, + "step": 535 + }, + { + "epoch": 0.3592041884816754, + "grad_norm": 2.272279977798462, + "learning_rate": 3.6157456864364925e-06, + "loss": 5.2157, + "step": 536 + }, + { + "epoch": 0.35987434554973824, + "grad_norm": 2.258368968963623, + "learning_rate": 3.6109845983476393e-06, + "loss": 5.117, + "step": 537 + }, + { + "epoch": 0.360544502617801, + "grad_norm": 2.28820538520813, + "learning_rate": 3.606218483979352e-06, + "loss": 5.4147, + "step": 538 + }, + { + "epoch": 0.36121465968586386, + "grad_norm": 2.199136734008789, + "learning_rate": 3.601447364894326e-06, + "loss": 5.4522, + "step": 539 + }, + { + "epoch": 0.3618848167539267, + "grad_norm": 2.169766664505005, + "learning_rate": 3.596671262677898e-06, + "loss": 5.2305, + "step": 540 + }, + { + "epoch": 0.36255497382198953, + "grad_norm": 2.2157297134399414, + "learning_rate": 3.5918901989379527e-06, + "loss": 5.3279, + "step": 541 + }, + { + "epoch": 0.3632251308900524, + "grad_norm": 2.254014015197754, + "learning_rate": 3.587104195304818e-06, + "loss": 5.0062, + "step": 542 + }, + { + "epoch": 0.3638952879581152, + "grad_norm": 2.3738510608673096, + "learning_rate": 3.5823132734311704e-06, + "loss": 5.7765, + "step": 543 + }, + { + "epoch": 0.364565445026178, + "grad_norm": 2.2331602573394775, + "learning_rate": 3.5775174549919413e-06, + "loss": 5.5786, + "step": 544 + }, + { + "epoch": 0.36523560209424083, + "grad_norm": 2.5791873931884766, + "learning_rate": 3.57271676168421e-06, + "loss": 5.3063, + "step": 545 + }, + { + "epoch": 0.36590575916230367, + "grad_norm": 2.402435779571533, + "learning_rate": 3.5679112152271144e-06, + "loss": 5.2438, + "step": 546 + }, + { + "epoch": 0.3665759162303665, + "grad_norm": 2.1838717460632324, + "learning_rate": 3.5631008373617464e-06, + "loss": 5.3002, + "step": 547 + }, + { + "epoch": 0.36724607329842934, + "grad_norm": 2.4075636863708496, + "learning_rate": 3.5582856498510574e-06, + "loss": 5.3944, + "step": 548 + }, + { + "epoch": 0.3679162303664921, + "grad_norm": 2.2561862468719482, + "learning_rate": 3.5534656744797583e-06, + "loss": 5.3127, + "step": 549 + }, + { + "epoch": 0.36858638743455496, + "grad_norm": 2.3960776329040527, + "learning_rate": 3.54864093305422e-06, + "loss": 5.147, + "step": 550 + }, + { + "epoch": 0.3692565445026178, + "grad_norm": 2.1438934803009033, + "learning_rate": 3.543811447402378e-06, + "loss": 5.2816, + "step": 551 + }, + { + "epoch": 0.36992670157068064, + "grad_norm": 1.8878827095031738, + "learning_rate": 3.538977239373629e-06, + "loss": 5.3005, + "step": 552 + }, + { + "epoch": 0.3705968586387435, + "grad_norm": 2.35463809967041, + "learning_rate": 3.534138330838735e-06, + "loss": 5.2632, + "step": 553 + }, + { + "epoch": 0.37126701570680626, + "grad_norm": 2.1728053092956543, + "learning_rate": 3.5292947436897257e-06, + "loss": 5.1822, + "step": 554 + }, + { + "epoch": 0.3719371727748691, + "grad_norm": 2.3696134090423584, + "learning_rate": 3.5244464998397958e-06, + "loss": 5.3895, + "step": 555 + }, + { + "epoch": 0.37260732984293193, + "grad_norm": 2.2455461025238037, + "learning_rate": 3.5195936212232086e-06, + "loss": 5.4655, + "step": 556 + }, + { + "epoch": 0.37327748691099477, + "grad_norm": 2.0229246616363525, + "learning_rate": 3.5147361297951955e-06, + "loss": 5.0173, + "step": 557 + }, + { + "epoch": 0.3739476439790576, + "grad_norm": 2.8633840084075928, + "learning_rate": 3.509874047531856e-06, + "loss": 5.3669, + "step": 558 + }, + { + "epoch": 0.3746178010471204, + "grad_norm": 2.104095697402954, + "learning_rate": 3.5050073964300624e-06, + "loss": 5.1716, + "step": 559 + }, + { + "epoch": 0.37528795811518323, + "grad_norm": 2.1818394660949707, + "learning_rate": 3.5001361985073532e-06, + "loss": 5.327, + "step": 560 + }, + { + "epoch": 0.37595811518324607, + "grad_norm": 2.133225679397583, + "learning_rate": 3.495260475801841e-06, + "loss": 5.4634, + "step": 561 + }, + { + "epoch": 0.3766282722513089, + "grad_norm": 2.2934346199035645, + "learning_rate": 3.490380250372107e-06, + "loss": 5.3697, + "step": 562 + }, + { + "epoch": 0.37729842931937174, + "grad_norm": 2.2782511711120605, + "learning_rate": 3.485495544297104e-06, + "loss": 5.4309, + "step": 563 + }, + { + "epoch": 0.3779685863874346, + "grad_norm": 2.150676965713501, + "learning_rate": 3.4806063796760557e-06, + "loss": 5.2855, + "step": 564 + }, + { + "epoch": 0.37863874345549736, + "grad_norm": 2.218550682067871, + "learning_rate": 3.4757127786283583e-06, + "loss": 5.0748, + "step": 565 + }, + { + "epoch": 0.3793089005235602, + "grad_norm": 1.859222173690796, + "learning_rate": 3.4708147632934793e-06, + "loss": 5.3948, + "step": 566 + }, + { + "epoch": 0.37997905759162304, + "grad_norm": 2.387467384338379, + "learning_rate": 3.465912355830853e-06, + "loss": 5.0676, + "step": 567 + }, + { + "epoch": 0.3806492146596859, + "grad_norm": 2.3780243396759033, + "learning_rate": 3.4610055784197917e-06, + "loss": 5.4155, + "step": 568 + }, + { + "epoch": 0.3813193717277487, + "grad_norm": 2.393306255340576, + "learning_rate": 3.456094453259371e-06, + "loss": 5.2968, + "step": 569 + }, + { + "epoch": 0.3819895287958115, + "grad_norm": 2.1577823162078857, + "learning_rate": 3.4511790025683396e-06, + "loss": 5.257, + "step": 570 + }, + { + "epoch": 0.38265968586387433, + "grad_norm": 2.37388277053833, + "learning_rate": 3.4462592485850168e-06, + "loss": 5.3045, + "step": 571 + }, + { + "epoch": 0.38332984293193717, + "grad_norm": 2.1195552349090576, + "learning_rate": 3.4413352135671886e-06, + "loss": 5.2868, + "step": 572 + }, + { + "epoch": 0.384, + "grad_norm": 2.1157307624816895, + "learning_rate": 3.43640691979201e-06, + "loss": 5.377, + "step": 573 + }, + { + "epoch": 0.38467015706806285, + "grad_norm": 2.0737154483795166, + "learning_rate": 3.4314743895559033e-06, + "loss": 5.2294, + "step": 574 + }, + { + "epoch": 0.38534031413612563, + "grad_norm": 2.3180153369903564, + "learning_rate": 3.4265376451744564e-06, + "loss": 5.4814, + "step": 575 + }, + { + "epoch": 0.38601047120418847, + "grad_norm": 1.7790483236312866, + "learning_rate": 3.4215967089823243e-06, + "loss": 5.2332, + "step": 576 + }, + { + "epoch": 0.3866806282722513, + "grad_norm": 2.8925139904022217, + "learning_rate": 3.4166516033331255e-06, + "loss": 5.0873, + "step": 577 + }, + { + "epoch": 0.38735078534031414, + "grad_norm": 2.228334426879883, + "learning_rate": 3.411702350599341e-06, + "loss": 5.0322, + "step": 578 + }, + { + "epoch": 0.388020942408377, + "grad_norm": 1.9167518615722656, + "learning_rate": 3.406748973172216e-06, + "loss": 5.152, + "step": 579 + }, + { + "epoch": 0.3886910994764398, + "grad_norm": 2.6737799644470215, + "learning_rate": 3.4017914934616547e-06, + "loss": 5.2398, + "step": 580 + }, + { + "epoch": 0.3893612565445026, + "grad_norm": 1.8289151191711426, + "learning_rate": 3.3968299338961213e-06, + "loss": 5.2169, + "step": 581 + }, + { + "epoch": 0.39003141361256544, + "grad_norm": 2.4280478954315186, + "learning_rate": 3.391864316922538e-06, + "loss": 5.1239, + "step": 582 + }, + { + "epoch": 0.3907015706806283, + "grad_norm": 2.227719783782959, + "learning_rate": 3.386894665006184e-06, + "loss": 5.3494, + "step": 583 + }, + { + "epoch": 0.3913717277486911, + "grad_norm": 2.0614211559295654, + "learning_rate": 3.381921000630591e-06, + "loss": 4.9243, + "step": 584 + }, + { + "epoch": 0.39204188481675395, + "grad_norm": 2.4239277839660645, + "learning_rate": 3.3769433462974476e-06, + "loss": 5.4025, + "step": 585 + }, + { + "epoch": 0.39271204188481673, + "grad_norm": 2.205012798309326, + "learning_rate": 3.371961724526489e-06, + "loss": 5.1364, + "step": 586 + }, + { + "epoch": 0.39338219895287957, + "grad_norm": 2.2407400608062744, + "learning_rate": 3.3669761578554037e-06, + "loss": 5.303, + "step": 587 + }, + { + "epoch": 0.3940523560209424, + "grad_norm": 2.5046958923339844, + "learning_rate": 3.361986668839726e-06, + "loss": 5.3919, + "step": 588 + }, + { + "epoch": 0.39472251308900524, + "grad_norm": 2.4827637672424316, + "learning_rate": 3.356993280052735e-06, + "loss": 5.4525, + "step": 589 + }, + { + "epoch": 0.3953926701570681, + "grad_norm": 2.063934803009033, + "learning_rate": 3.3519960140853542e-06, + "loss": 5.326, + "step": 590 + }, + { + "epoch": 0.39606282722513086, + "grad_norm": 2.1795217990875244, + "learning_rate": 3.346994893546046e-06, + "loss": 5.1236, + "step": 591 + }, + { + "epoch": 0.3967329842931937, + "grad_norm": 1.890066385269165, + "learning_rate": 3.3419899410607142e-06, + "loss": 5.1188, + "step": 592 + }, + { + "epoch": 0.39740314136125654, + "grad_norm": 2.031525135040283, + "learning_rate": 3.3369811792725977e-06, + "loss": 5.0546, + "step": 593 + }, + { + "epoch": 0.3980732984293194, + "grad_norm": 2.3749258518218994, + "learning_rate": 3.3319686308421697e-06, + "loss": 5.2319, + "step": 594 + }, + { + "epoch": 0.3987434554973822, + "grad_norm": 2.0839717388153076, + "learning_rate": 3.326952318447033e-06, + "loss": 5.1436, + "step": 595 + }, + { + "epoch": 0.39941361256544505, + "grad_norm": 2.0904011726379395, + "learning_rate": 3.321932264781822e-06, + "loss": 5.2929, + "step": 596 + }, + { + "epoch": 0.40008376963350784, + "grad_norm": 2.1882741451263428, + "learning_rate": 3.3169084925580963e-06, + "loss": 5.0201, + "step": 597 + }, + { + "epoch": 0.4007539267015707, + "grad_norm": 1.9673819541931152, + "learning_rate": 3.3118810245042376e-06, + "loss": 4.9752, + "step": 598 + }, + { + "epoch": 0.4014240837696335, + "grad_norm": 1.6698298454284668, + "learning_rate": 3.3068498833653482e-06, + "loss": 5.1009, + "step": 599 + }, + { + "epoch": 0.40209424083769635, + "grad_norm": 2.1457369327545166, + "learning_rate": 3.301815091903151e-06, + "loss": 5.4315, + "step": 600 + }, + { + "epoch": 0.4027643979057592, + "grad_norm": 1.8999826908111572, + "learning_rate": 3.2967766728958805e-06, + "loss": 5.4815, + "step": 601 + }, + { + "epoch": 0.40343455497382197, + "grad_norm": 2.4097654819488525, + "learning_rate": 3.2917346491381837e-06, + "loss": 5.228, + "step": 602 + }, + { + "epoch": 0.4041047120418848, + "grad_norm": 2.1515157222747803, + "learning_rate": 3.2866890434410153e-06, + "loss": 4.81, + "step": 603 + }, + { + "epoch": 0.40477486910994764, + "grad_norm": 2.127140998840332, + "learning_rate": 3.2816398786315377e-06, + "loss": 5.3845, + "step": 604 + }, + { + "epoch": 0.4054450261780105, + "grad_norm": 2.041879653930664, + "learning_rate": 3.2765871775530135e-06, + "loss": 5.1179, + "step": 605 + }, + { + "epoch": 0.4061151832460733, + "grad_norm": 2.8136348724365234, + "learning_rate": 3.271530963064703e-06, + "loss": 5.3501, + "step": 606 + }, + { + "epoch": 0.4067853403141361, + "grad_norm": 2.181309938430786, + "learning_rate": 3.2664712580417648e-06, + "loss": 5.1919, + "step": 607 + }, + { + "epoch": 0.40745549738219894, + "grad_norm": 2.3441267013549805, + "learning_rate": 3.261408085375146e-06, + "loss": 5.3501, + "step": 608 + }, + { + "epoch": 0.4081256544502618, + "grad_norm": 2.380401849746704, + "learning_rate": 3.2563414679714844e-06, + "loss": 5.2335, + "step": 609 + }, + { + "epoch": 0.4087958115183246, + "grad_norm": 2.020721912384033, + "learning_rate": 3.2512714287530007e-06, + "loss": 5.279, + "step": 610 + }, + { + "epoch": 0.40946596858638745, + "grad_norm": 2.2213196754455566, + "learning_rate": 3.246197990657397e-06, + "loss": 5.4033, + "step": 611 + }, + { + "epoch": 0.41013612565445023, + "grad_norm": 2.499452829360962, + "learning_rate": 3.241121176637753e-06, + "loss": 5.4596, + "step": 612 + }, + { + "epoch": 0.41080628272251307, + "grad_norm": 2.035451650619507, + "learning_rate": 3.2360410096624204e-06, + "loss": 4.818, + "step": 613 + }, + { + "epoch": 0.4114764397905759, + "grad_norm": 2.1440494060516357, + "learning_rate": 3.230957512714923e-06, + "loss": 5.3732, + "step": 614 + }, + { + "epoch": 0.41214659685863875, + "grad_norm": 2.139416456222534, + "learning_rate": 3.2258707087938464e-06, + "loss": 5.1974, + "step": 615 + }, + { + "epoch": 0.4128167539267016, + "grad_norm": 2.4063401222229004, + "learning_rate": 3.2207806209127383e-06, + "loss": 5.228, + "step": 616 + }, + { + "epoch": 0.4134869109947644, + "grad_norm": 2.257823944091797, + "learning_rate": 3.2156872721000066e-06, + "loss": 5.185, + "step": 617 + }, + { + "epoch": 0.4141570680628272, + "grad_norm": 2.1968698501586914, + "learning_rate": 3.2105906853988083e-06, + "loss": 5.4655, + "step": 618 + }, + { + "epoch": 0.41482722513089004, + "grad_norm": 2.221487045288086, + "learning_rate": 3.205490883866952e-06, + "loss": 5.2044, + "step": 619 + }, + { + "epoch": 0.4154973821989529, + "grad_norm": 2.286722183227539, + "learning_rate": 3.2003878905767877e-06, + "loss": 5.3126, + "step": 620 + }, + { + "epoch": 0.4161675392670157, + "grad_norm": 2.044743299484253, + "learning_rate": 3.1952817286151093e-06, + "loss": 5.501, + "step": 621 + }, + { + "epoch": 0.41683769633507856, + "grad_norm": 2.2196061611175537, + "learning_rate": 3.1901724210830437e-06, + "loss": 5.372, + "step": 622 + }, + { + "epoch": 0.41750785340314134, + "grad_norm": 2.22896146774292, + "learning_rate": 3.185059991095949e-06, + "loss": 5.0381, + "step": 623 + }, + { + "epoch": 0.4181780104712042, + "grad_norm": 2.2274248600006104, + "learning_rate": 3.1799444617833103e-06, + "loss": 5.3017, + "step": 624 + }, + { + "epoch": 0.418848167539267, + "grad_norm": 2.2849974632263184, + "learning_rate": 3.1748258562886356e-06, + "loss": 5.3214, + "step": 625 + }, + { + "epoch": 0.41951832460732985, + "grad_norm": 2.5994486808776855, + "learning_rate": 3.1697041977693475e-06, + "loss": 5.1276, + "step": 626 + }, + { + "epoch": 0.4201884816753927, + "grad_norm": 2.21824049949646, + "learning_rate": 3.1645795093966836e-06, + "loss": 5.0308, + "step": 627 + }, + { + "epoch": 0.42085863874345547, + "grad_norm": 2.221174716949463, + "learning_rate": 3.159451814355588e-06, + "loss": 5.2276, + "step": 628 + }, + { + "epoch": 0.4215287958115183, + "grad_norm": 2.3974714279174805, + "learning_rate": 3.1543211358446068e-06, + "loss": 5.1327, + "step": 629 + }, + { + "epoch": 0.42219895287958115, + "grad_norm": 2.2819957733154297, + "learning_rate": 3.149187497075784e-06, + "loss": 5.3803, + "step": 630 + }, + { + "epoch": 0.422869109947644, + "grad_norm": 2.1973443031311035, + "learning_rate": 3.1440509212745584e-06, + "loss": 5.1437, + "step": 631 + }, + { + "epoch": 0.4235392670157068, + "grad_norm": 2.5798709392547607, + "learning_rate": 3.138911431679654e-06, + "loss": 5.0404, + "step": 632 + }, + { + "epoch": 0.42420942408376966, + "grad_norm": 2.094707727432251, + "learning_rate": 3.133769051542977e-06, + "loss": 5.3171, + "step": 633 + }, + { + "epoch": 0.42487958115183244, + "grad_norm": 1.9110760688781738, + "learning_rate": 3.1286238041295134e-06, + "loss": 5.0615, + "step": 634 + }, + { + "epoch": 0.4255497382198953, + "grad_norm": 2.4531033039093018, + "learning_rate": 3.1234757127172195e-06, + "loss": 5.1769, + "step": 635 + }, + { + "epoch": 0.4262198952879581, + "grad_norm": 2.4736881256103516, + "learning_rate": 3.1183248005969186e-06, + "loss": 5.267, + "step": 636 + }, + { + "epoch": 0.42689005235602095, + "grad_norm": 2.087996244430542, + "learning_rate": 3.1131710910721943e-06, + "loss": 4.9783, + "step": 637 + }, + { + "epoch": 0.4275602094240838, + "grad_norm": 1.8800019025802612, + "learning_rate": 3.1080146074592882e-06, + "loss": 4.9921, + "step": 638 + }, + { + "epoch": 0.4282303664921466, + "grad_norm": 2.587238311767578, + "learning_rate": 3.1028553730869913e-06, + "loss": 5.2164, + "step": 639 + }, + { + "epoch": 0.4289005235602094, + "grad_norm": 2.528468132019043, + "learning_rate": 3.0976934112965394e-06, + "loss": 5.2577, + "step": 640 + }, + { + "epoch": 0.42957068062827225, + "grad_norm": 1.9641128778457642, + "learning_rate": 3.0925287454415083e-06, + "loss": 5.2473, + "step": 641 + }, + { + "epoch": 0.4302408376963351, + "grad_norm": 1.896745204925537, + "learning_rate": 3.087361398887706e-06, + "loss": 5.0601, + "step": 642 + }, + { + "epoch": 0.4309109947643979, + "grad_norm": 2.3236024379730225, + "learning_rate": 3.0821913950130694e-06, + "loss": 5.2882, + "step": 643 + }, + { + "epoch": 0.4315811518324607, + "grad_norm": 1.563217282295227, + "learning_rate": 3.077018757207558e-06, + "loss": 5.1928, + "step": 644 + }, + { + "epoch": 0.43225130890052355, + "grad_norm": 2.0513296127319336, + "learning_rate": 3.071843508873046e-06, + "loss": 5.1316, + "step": 645 + }, + { + "epoch": 0.4329214659685864, + "grad_norm": 2.402965545654297, + "learning_rate": 3.06666567342322e-06, + "loss": 5.153, + "step": 646 + }, + { + "epoch": 0.4335916230366492, + "grad_norm": 2.0920908451080322, + "learning_rate": 3.0614852742834706e-06, + "loss": 5.1931, + "step": 647 + }, + { + "epoch": 0.43426178010471206, + "grad_norm": 2.192448616027832, + "learning_rate": 3.056302334890786e-06, + "loss": 4.8578, + "step": 648 + }, + { + "epoch": 0.4349319371727749, + "grad_norm": 1.793662667274475, + "learning_rate": 3.0511168786936483e-06, + "loss": 5.2735, + "step": 649 + }, + { + "epoch": 0.4356020942408377, + "grad_norm": 1.9571502208709717, + "learning_rate": 3.0459289291519244e-06, + "loss": 5.2437, + "step": 650 + }, + { + "epoch": 0.4362722513089005, + "grad_norm": 2.033674478530884, + "learning_rate": 3.040738509736763e-06, + "loss": 5.1984, + "step": 651 + }, + { + "epoch": 0.43694240837696335, + "grad_norm": 2.2626571655273438, + "learning_rate": 3.0355456439304865e-06, + "loss": 5.2065, + "step": 652 + }, + { + "epoch": 0.4376125654450262, + "grad_norm": 2.125727415084839, + "learning_rate": 3.0303503552264846e-06, + "loss": 5.3969, + "step": 653 + }, + { + "epoch": 0.43828272251308903, + "grad_norm": 2.778316020965576, + "learning_rate": 3.025152667129109e-06, + "loss": 5.6389, + "step": 654 + }, + { + "epoch": 0.4389528795811518, + "grad_norm": 2.4682669639587402, + "learning_rate": 3.0199526031535675e-06, + "loss": 5.2054, + "step": 655 + }, + { + "epoch": 0.43962303664921465, + "grad_norm": 1.9458839893341064, + "learning_rate": 3.014750186825815e-06, + "loss": 5.0855, + "step": 656 + }, + { + "epoch": 0.4402931937172775, + "grad_norm": 2.1652393341064453, + "learning_rate": 3.00954544168245e-06, + "loss": 5.1186, + "step": 657 + }, + { + "epoch": 0.4409633507853403, + "grad_norm": 2.2295773029327393, + "learning_rate": 3.004338391270607e-06, + "loss": 5.5311, + "step": 658 + }, + { + "epoch": 0.44163350785340316, + "grad_norm": 2.231815814971924, + "learning_rate": 2.9991290591478495e-06, + "loss": 5.2779, + "step": 659 + }, + { + "epoch": 0.44230366492146594, + "grad_norm": 2.0944957733154297, + "learning_rate": 2.993917468882064e-06, + "loss": 5.4016, + "step": 660 + }, + { + "epoch": 0.4429738219895288, + "grad_norm": 2.0941920280456543, + "learning_rate": 2.9887036440513534e-06, + "loss": 5.1275, + "step": 661 + }, + { + "epoch": 0.4436439790575916, + "grad_norm": 2.7081048488616943, + "learning_rate": 2.98348760824393e-06, + "loss": 5.0913, + "step": 662 + }, + { + "epoch": 0.44431413612565446, + "grad_norm": 3.268285036087036, + "learning_rate": 2.9782693850580084e-06, + "loss": 5.1262, + "step": 663 + }, + { + "epoch": 0.4449842931937173, + "grad_norm": 2.2694761753082275, + "learning_rate": 2.9730489981017e-06, + "loss": 5.0765, + "step": 664 + }, + { + "epoch": 0.4456544502617801, + "grad_norm": 2.0705652236938477, + "learning_rate": 2.9678264709929054e-06, + "loss": 5.2526, + "step": 665 + }, + { + "epoch": 0.4463246073298429, + "grad_norm": 2.3029444217681885, + "learning_rate": 2.962601827359208e-06, + "loss": 4.9703, + "step": 666 + }, + { + "epoch": 0.44699476439790575, + "grad_norm": 2.4257922172546387, + "learning_rate": 2.957375090837764e-06, + "loss": 5.1845, + "step": 667 + }, + { + "epoch": 0.4476649214659686, + "grad_norm": 2.0682663917541504, + "learning_rate": 2.9521462850752027e-06, + "loss": 5.1377, + "step": 668 + }, + { + "epoch": 0.44833507853403143, + "grad_norm": 2.2123470306396484, + "learning_rate": 2.946915433727511e-06, + "loss": 5.1044, + "step": 669 + }, + { + "epoch": 0.44900523560209427, + "grad_norm": 2.343519926071167, + "learning_rate": 2.941682560459933e-06, + "loss": 5.2421, + "step": 670 + }, + { + "epoch": 0.44967539267015705, + "grad_norm": 2.291065216064453, + "learning_rate": 2.9364476889468585e-06, + "loss": 5.5774, + "step": 671 + }, + { + "epoch": 0.4503455497382199, + "grad_norm": 2.0356202125549316, + "learning_rate": 2.9312108428717177e-06, + "loss": 5.174, + "step": 672 + }, + { + "epoch": 0.4510157068062827, + "grad_norm": 2.0141336917877197, + "learning_rate": 2.9259720459268782e-06, + "loss": 5.0388, + "step": 673 + }, + { + "epoch": 0.45168586387434556, + "grad_norm": 2.0286974906921387, + "learning_rate": 2.920731321813526e-06, + "loss": 5.3889, + "step": 674 + }, + { + "epoch": 0.4523560209424084, + "grad_norm": 1.9957947731018066, + "learning_rate": 2.9154886942415727e-06, + "loss": 5.1728, + "step": 675 + }, + { + "epoch": 0.4530261780104712, + "grad_norm": 2.285508632659912, + "learning_rate": 2.910244186929539e-06, + "loss": 5.2326, + "step": 676 + }, + { + "epoch": 0.453696335078534, + "grad_norm": 2.101499080657959, + "learning_rate": 2.9049978236044486e-06, + "loss": 5.1891, + "step": 677 + }, + { + "epoch": 0.45436649214659686, + "grad_norm": 2.504451274871826, + "learning_rate": 2.8997496280017233e-06, + "loss": 4.8588, + "step": 678 + }, + { + "epoch": 0.4550366492146597, + "grad_norm": 2.49049973487854, + "learning_rate": 2.894499623865075e-06, + "loss": 5.4943, + "step": 679 + }, + { + "epoch": 0.45570680628272253, + "grad_norm": 2.0159759521484375, + "learning_rate": 2.8892478349463987e-06, + "loss": 5.2162, + "step": 680 + }, + { + "epoch": 0.4563769633507853, + "grad_norm": 2.0550146102905273, + "learning_rate": 2.8839942850056596e-06, + "loss": 5.5218, + "step": 681 + }, + { + "epoch": 0.45704712041884815, + "grad_norm": 2.3331401348114014, + "learning_rate": 2.8787389978107947e-06, + "loss": 5.2914, + "step": 682 + }, + { + "epoch": 0.457717277486911, + "grad_norm": 1.9607702493667603, + "learning_rate": 2.873481997137599e-06, + "loss": 5.0108, + "step": 683 + }, + { + "epoch": 0.4583874345549738, + "grad_norm": 2.2367026805877686, + "learning_rate": 2.868223306769619e-06, + "loss": 5.0986, + "step": 684 + }, + { + "epoch": 0.45905759162303666, + "grad_norm": 2.385270357131958, + "learning_rate": 2.862962950498046e-06, + "loss": 5.2578, + "step": 685 + }, + { + "epoch": 0.4597277486910995, + "grad_norm": 2.34967303276062, + "learning_rate": 2.8577009521216083e-06, + "loss": 5.0958, + "step": 686 + }, + { + "epoch": 0.4603979057591623, + "grad_norm": 2.663029432296753, + "learning_rate": 2.8524373354464647e-06, + "loss": 5.1852, + "step": 687 + }, + { + "epoch": 0.4610680628272251, + "grad_norm": 2.3971621990203857, + "learning_rate": 2.847172124286091e-06, + "loss": 5.5013, + "step": 688 + }, + { + "epoch": 0.46173821989528796, + "grad_norm": 2.6697640419006348, + "learning_rate": 2.841905342461182e-06, + "loss": 5.3391, + "step": 689 + }, + { + "epoch": 0.4624083769633508, + "grad_norm": 2.132344961166382, + "learning_rate": 2.836637013799537e-06, + "loss": 5.0274, + "step": 690 + }, + { + "epoch": 0.46307853403141364, + "grad_norm": 2.4103102684020996, + "learning_rate": 2.8313671621359503e-06, + "loss": 5.1447, + "step": 691 + }, + { + "epoch": 0.4637486910994764, + "grad_norm": 2.3897173404693604, + "learning_rate": 2.82609581131211e-06, + "loss": 5.2839, + "step": 692 + }, + { + "epoch": 0.46441884816753926, + "grad_norm": 1.9705911874771118, + "learning_rate": 2.8208229851764857e-06, + "loss": 5.1606, + "step": 693 + }, + { + "epoch": 0.4650890052356021, + "grad_norm": 2.035224199295044, + "learning_rate": 2.815548707584222e-06, + "loss": 5.4598, + "step": 694 + }, + { + "epoch": 0.46575916230366493, + "grad_norm": 2.2948644161224365, + "learning_rate": 2.810273002397028e-06, + "loss": 5.0036, + "step": 695 + }, + { + "epoch": 0.46642931937172777, + "grad_norm": 2.438422441482544, + "learning_rate": 2.8049958934830735e-06, + "loss": 4.8402, + "step": 696 + }, + { + "epoch": 0.46709947643979055, + "grad_norm": 1.948908805847168, + "learning_rate": 2.79971740471688e-06, + "loss": 4.953, + "step": 697 + }, + { + "epoch": 0.4677696335078534, + "grad_norm": 1.7720941305160522, + "learning_rate": 2.794437559979208e-06, + "loss": 5.3874, + "step": 698 + }, + { + "epoch": 0.4684397905759162, + "grad_norm": 2.545875310897827, + "learning_rate": 2.7891563831569562e-06, + "loss": 5.4277, + "step": 699 + }, + { + "epoch": 0.46910994764397906, + "grad_norm": 2.0979082584381104, + "learning_rate": 2.7838738981430475e-06, + "loss": 5.128, + "step": 700 + }, + { + "epoch": 0.4697801047120419, + "grad_norm": 2.4145262241363525, + "learning_rate": 2.7785901288363253e-06, + "loss": 5.3202, + "step": 701 + }, + { + "epoch": 0.47045026178010474, + "grad_norm": 2.7727010250091553, + "learning_rate": 2.7733050991414424e-06, + "loss": 5.197, + "step": 702 + }, + { + "epoch": 0.4711204188481675, + "grad_norm": 2.412595748901367, + "learning_rate": 2.7680188329687517e-06, + "loss": 5.3592, + "step": 703 + }, + { + "epoch": 0.47179057591623036, + "grad_norm": 2.443044900894165, + "learning_rate": 2.7627313542342053e-06, + "loss": 5.2874, + "step": 704 + }, + { + "epoch": 0.4724607329842932, + "grad_norm": 2.2860937118530273, + "learning_rate": 2.7574426868592354e-06, + "loss": 5.24, + "step": 705 + }, + { + "epoch": 0.47313089005235603, + "grad_norm": 2.3260374069213867, + "learning_rate": 2.752152854770655e-06, + "loss": 5.0235, + "step": 706 + }, + { + "epoch": 0.4738010471204189, + "grad_norm": 2.500518560409546, + "learning_rate": 2.7468618819005464e-06, + "loss": 5.1536, + "step": 707 + }, + { + "epoch": 0.47447120418848165, + "grad_norm": 2.2053189277648926, + "learning_rate": 2.7415697921861527e-06, + "loss": 5.1939, + "step": 708 + }, + { + "epoch": 0.4751413612565445, + "grad_norm": 1.915648102760315, + "learning_rate": 2.736276609569769e-06, + "loss": 5.0484, + "step": 709 + }, + { + "epoch": 0.47581151832460733, + "grad_norm": 2.492061138153076, + "learning_rate": 2.7309823579986345e-06, + "loss": 5.2883, + "step": 710 + }, + { + "epoch": 0.47648167539267017, + "grad_norm": 2.494213819503784, + "learning_rate": 2.7256870614248277e-06, + "loss": 5.3296, + "step": 711 + }, + { + "epoch": 0.477151832460733, + "grad_norm": 2.026763916015625, + "learning_rate": 2.7203907438051504e-06, + "loss": 5.3168, + "step": 712 + }, + { + "epoch": 0.4778219895287958, + "grad_norm": 2.2470083236694336, + "learning_rate": 2.7150934291010267e-06, + "loss": 5.3063, + "step": 713 + }, + { + "epoch": 0.4784921465968586, + "grad_norm": 2.2697417736053467, + "learning_rate": 2.7097951412783913e-06, + "loss": 5.3035, + "step": 714 + }, + { + "epoch": 0.47916230366492146, + "grad_norm": 2.0392396450042725, + "learning_rate": 2.7044959043075815e-06, + "loss": 5.1482, + "step": 715 + }, + { + "epoch": 0.4798324607329843, + "grad_norm": 2.2121872901916504, + "learning_rate": 2.6991957421632274e-06, + "loss": 5.2075, + "step": 716 + }, + { + "epoch": 0.48050261780104714, + "grad_norm": 2.4546327590942383, + "learning_rate": 2.6938946788241454e-06, + "loss": 5.4877, + "step": 717 + }, + { + "epoch": 0.4811727748691099, + "grad_norm": 2.6870744228363037, + "learning_rate": 2.68859273827323e-06, + "loss": 5.4175, + "step": 718 + }, + { + "epoch": 0.48184293193717276, + "grad_norm": 2.482137441635132, + "learning_rate": 2.6832899444973444e-06, + "loss": 5.1865, + "step": 719 + }, + { + "epoch": 0.4825130890052356, + "grad_norm": 2.210994243621826, + "learning_rate": 2.6779863214872094e-06, + "loss": 5.1259, + "step": 720 + }, + { + "epoch": 0.48318324607329843, + "grad_norm": 2.396907329559326, + "learning_rate": 2.672681893237301e-06, + "loss": 5.4887, + "step": 721 + }, + { + "epoch": 0.48385340314136127, + "grad_norm": 2.131406545639038, + "learning_rate": 2.667376683745735e-06, + "loss": 5.1091, + "step": 722 + }, + { + "epoch": 0.4845235602094241, + "grad_norm": 2.202169895172119, + "learning_rate": 2.662070717014163e-06, + "loss": 4.9898, + "step": 723 + }, + { + "epoch": 0.4851937172774869, + "grad_norm": 2.0075578689575195, + "learning_rate": 2.6567640170476637e-06, + "loss": 5.0948, + "step": 724 + }, + { + "epoch": 0.48586387434554973, + "grad_norm": 2.250730514526367, + "learning_rate": 2.651456607854632e-06, + "loss": 5.1126, + "step": 725 + }, + { + "epoch": 0.48653403141361257, + "grad_norm": 2.187277317047119, + "learning_rate": 2.64614851344667e-06, + "loss": 5.2793, + "step": 726 + }, + { + "epoch": 0.4872041884816754, + "grad_norm": 2.349581480026245, + "learning_rate": 2.640839757838482e-06, + "loss": 5.1725, + "step": 727 + }, + { + "epoch": 0.48787434554973824, + "grad_norm": 2.1224112510681152, + "learning_rate": 2.635530365047764e-06, + "loss": 5.043, + "step": 728 + }, + { + "epoch": 0.488544502617801, + "grad_norm": 2.0967764854431152, + "learning_rate": 2.6302203590950914e-06, + "loss": 5.0547, + "step": 729 + }, + { + "epoch": 0.48921465968586386, + "grad_norm": 2.5871574878692627, + "learning_rate": 2.6249097640038175e-06, + "loss": 5.4152, + "step": 730 + }, + { + "epoch": 0.4898848167539267, + "grad_norm": 2.2016632556915283, + "learning_rate": 2.6195986037999587e-06, + "loss": 4.9081, + "step": 731 + }, + { + "epoch": 0.49055497382198954, + "grad_norm": 2.4175848960876465, + "learning_rate": 2.6142869025120893e-06, + "loss": 5.2261, + "step": 732 + }, + { + "epoch": 0.4912251308900524, + "grad_norm": 2.046898126602173, + "learning_rate": 2.608974684171231e-06, + "loss": 5.3583, + "step": 733 + }, + { + "epoch": 0.49189528795811516, + "grad_norm": 2.4127092361450195, + "learning_rate": 2.6036619728107434e-06, + "loss": 5.4236, + "step": 734 + }, + { + "epoch": 0.492565445026178, + "grad_norm": 2.0978167057037354, + "learning_rate": 2.598348792466219e-06, + "loss": 5.2757, + "step": 735 + }, + { + "epoch": 0.49323560209424083, + "grad_norm": 2.4408397674560547, + "learning_rate": 2.5930351671753707e-06, + "loss": 5.2243, + "step": 736 + }, + { + "epoch": 0.49390575916230367, + "grad_norm": 2.442858934402466, + "learning_rate": 2.5877211209779247e-06, + "loss": 5.2263, + "step": 737 + }, + { + "epoch": 0.4945759162303665, + "grad_norm": 2.2256414890289307, + "learning_rate": 2.5824066779155118e-06, + "loss": 5.0348, + "step": 738 + }, + { + "epoch": 0.49524607329842935, + "grad_norm": 2.098453998565674, + "learning_rate": 2.5770918620315573e-06, + "loss": 5.2263, + "step": 739 + }, + { + "epoch": 0.49591623036649213, + "grad_norm": 2.2181296348571777, + "learning_rate": 2.5717766973711734e-06, + "loss": 5.3056, + "step": 740 + }, + { + "epoch": 0.49658638743455497, + "grad_norm": 2.378608465194702, + "learning_rate": 2.5664612079810512e-06, + "loss": 5.1194, + "step": 741 + }, + { + "epoch": 0.4972565445026178, + "grad_norm": 2.3113150596618652, + "learning_rate": 2.5611454179093515e-06, + "loss": 5.1407, + "step": 742 + }, + { + "epoch": 0.49792670157068064, + "grad_norm": 2.7640299797058105, + "learning_rate": 2.5558293512055927e-06, + "loss": 5.2319, + "step": 743 + }, + { + "epoch": 0.4985968586387435, + "grad_norm": 1.936281442642212, + "learning_rate": 2.5505130319205464e-06, + "loss": 5.2259, + "step": 744 + }, + { + "epoch": 0.49926701570680626, + "grad_norm": 1.9551467895507812, + "learning_rate": 2.545196484106127e-06, + "loss": 5.0921, + "step": 745 + }, + { + "epoch": 0.4999371727748691, + "grad_norm": 3.0447819232940674, + "learning_rate": 2.539879731815284e-06, + "loss": 5.1542, + "step": 746 + }, + { + "epoch": 0.500607329842932, + "grad_norm": 2.248399496078491, + "learning_rate": 2.5345627991018884e-06, + "loss": 4.9197, + "step": 747 + }, + { + "epoch": 0.5012774869109947, + "grad_norm": 2.5293211936950684, + "learning_rate": 2.529245710020632e-06, + "loss": 5.1887, + "step": 748 + }, + { + "epoch": 0.5019476439790576, + "grad_norm": 1.9050483703613281, + "learning_rate": 2.52392848862691e-06, + "loss": 5.198, + "step": 749 + }, + { + "epoch": 0.5026178010471204, + "grad_norm": 1.7937599420547485, + "learning_rate": 2.5186111589767187e-06, + "loss": 5.1386, + "step": 750 + }, + { + "epoch": 0.5032879581151832, + "grad_norm": 2.233506679534912, + "learning_rate": 2.5132937451265427e-06, + "loss": 5.3234, + "step": 751 + }, + { + "epoch": 0.5039581151832461, + "grad_norm": 2.8709378242492676, + "learning_rate": 2.507976271133249e-06, + "loss": 4.9134, + "step": 752 + }, + { + "epoch": 0.5046282722513089, + "grad_norm": 2.2755050659179688, + "learning_rate": 2.502658761053976e-06, + "loss": 4.88, + "step": 753 + }, + { + "epoch": 0.5052984293193717, + "grad_norm": 2.3214619159698486, + "learning_rate": 2.4973412389460245e-06, + "loss": 4.9027, + "step": 754 + }, + { + "epoch": 0.5059685863874346, + "grad_norm": 2.645906925201416, + "learning_rate": 2.4920237288667516e-06, + "loss": 5.1749, + "step": 755 + }, + { + "epoch": 0.5066387434554974, + "grad_norm": 2.431973457336426, + "learning_rate": 2.4867062548734585e-06, + "loss": 5.3679, + "step": 756 + }, + { + "epoch": 0.5073089005235603, + "grad_norm": 2.163480758666992, + "learning_rate": 2.4813888410232825e-06, + "loss": 5.0453, + "step": 757 + }, + { + "epoch": 0.507979057591623, + "grad_norm": 2.323777437210083, + "learning_rate": 2.476071511373091e-06, + "loss": 4.8367, + "step": 758 + }, + { + "epoch": 0.5086492146596858, + "grad_norm": 2.0933356285095215, + "learning_rate": 2.470754289979369e-06, + "loss": 5.1988, + "step": 759 + }, + { + "epoch": 0.5093193717277487, + "grad_norm": 2.36794376373291, + "learning_rate": 2.4654372008981116e-06, + "loss": 5.2736, + "step": 760 + }, + { + "epoch": 0.5099895287958115, + "grad_norm": 2.062737464904785, + "learning_rate": 2.4601202681847173e-06, + "loss": 5.1134, + "step": 761 + }, + { + "epoch": 0.5106596858638743, + "grad_norm": 2.358090400695801, + "learning_rate": 2.4548035158938734e-06, + "loss": 5.2011, + "step": 762 + }, + { + "epoch": 0.5113298429319372, + "grad_norm": 2.187567949295044, + "learning_rate": 2.4494869680794544e-06, + "loss": 5.0013, + "step": 763 + }, + { + "epoch": 0.512, + "grad_norm": 2.2072947025299072, + "learning_rate": 2.444170648794408e-06, + "loss": 5.4118, + "step": 764 + }, + { + "epoch": 0.5126701570680628, + "grad_norm": 2.464012622833252, + "learning_rate": 2.438854582090649e-06, + "loss": 5.101, + "step": 765 + }, + { + "epoch": 0.5133403141361257, + "grad_norm": 1.9377411603927612, + "learning_rate": 2.433538792018949e-06, + "loss": 5.2372, + "step": 766 + }, + { + "epoch": 0.5140104712041885, + "grad_norm": 2.2407162189483643, + "learning_rate": 2.428223302628828e-06, + "loss": 5.1168, + "step": 767 + }, + { + "epoch": 0.5146806282722514, + "grad_norm": 2.3990302085876465, + "learning_rate": 2.422908137968444e-06, + "loss": 5.0664, + "step": 768 + }, + { + "epoch": 0.5153507853403141, + "grad_norm": 2.019167900085449, + "learning_rate": 2.4175933220844895e-06, + "loss": 4.9689, + "step": 769 + }, + { + "epoch": 0.5160209424083769, + "grad_norm": 2.4319581985473633, + "learning_rate": 2.4122788790220757e-06, + "loss": 5.0069, + "step": 770 + }, + { + "epoch": 0.5166910994764398, + "grad_norm": 2.629899024963379, + "learning_rate": 2.4069648328246305e-06, + "loss": 5.1161, + "step": 771 + }, + { + "epoch": 0.5173612565445026, + "grad_norm": 2.352264404296875, + "learning_rate": 2.4016512075337815e-06, + "loss": 5.1608, + "step": 772 + }, + { + "epoch": 0.5180314136125654, + "grad_norm": 2.2626523971557617, + "learning_rate": 2.3963380271892575e-06, + "loss": 5.1764, + "step": 773 + }, + { + "epoch": 0.5187015706806283, + "grad_norm": 2.327106475830078, + "learning_rate": 2.39102531582877e-06, + "loss": 4.9147, + "step": 774 + }, + { + "epoch": 0.5193717277486911, + "grad_norm": 2.278066635131836, + "learning_rate": 2.3857130974879107e-06, + "loss": 5.1306, + "step": 775 + }, + { + "epoch": 0.520041884816754, + "grad_norm": 2.14404296875, + "learning_rate": 2.3804013962000417e-06, + "loss": 5.1931, + "step": 776 + }, + { + "epoch": 0.5207120418848168, + "grad_norm": 2.2046420574188232, + "learning_rate": 2.3750902359961834e-06, + "loss": 5.2374, + "step": 777 + }, + { + "epoch": 0.5213821989528796, + "grad_norm": 2.2356772422790527, + "learning_rate": 2.3697796409049094e-06, + "loss": 5.1609, + "step": 778 + }, + { + "epoch": 0.5220523560209424, + "grad_norm": 2.101590633392334, + "learning_rate": 2.3644696349522367e-06, + "loss": 5.1114, + "step": 779 + }, + { + "epoch": 0.5227225130890052, + "grad_norm": 2.1280152797698975, + "learning_rate": 2.359160242161518e-06, + "loss": 5.1482, + "step": 780 + }, + { + "epoch": 0.523392670157068, + "grad_norm": 2.1777684688568115, + "learning_rate": 2.3538514865533315e-06, + "loss": 5.2686, + "step": 781 + }, + { + "epoch": 0.5240628272251309, + "grad_norm": 2.2483649253845215, + "learning_rate": 2.348543392145369e-06, + "loss": 5.3181, + "step": 782 + }, + { + "epoch": 0.5247329842931937, + "grad_norm": 2.2942185401916504, + "learning_rate": 2.343235982952337e-06, + "loss": 5.0594, + "step": 783 + }, + { + "epoch": 0.5254031413612565, + "grad_norm": 1.8971693515777588, + "learning_rate": 2.3379292829858373e-06, + "loss": 4.9603, + "step": 784 + }, + { + "epoch": 0.5260732984293194, + "grad_norm": 2.3738677501678467, + "learning_rate": 2.3326233162542655e-06, + "loss": 5.2229, + "step": 785 + }, + { + "epoch": 0.5267434554973822, + "grad_norm": 2.0925230979919434, + "learning_rate": 2.3273181067627e-06, + "loss": 5.1843, + "step": 786 + }, + { + "epoch": 0.5274136125654451, + "grad_norm": 2.359753131866455, + "learning_rate": 2.3220136785127914e-06, + "loss": 5.294, + "step": 787 + }, + { + "epoch": 0.5280837696335079, + "grad_norm": 3.4682345390319824, + "learning_rate": 2.316710055502656e-06, + "loss": 5.2098, + "step": 788 + }, + { + "epoch": 0.5287539267015707, + "grad_norm": 2.415839433670044, + "learning_rate": 2.3114072617267695e-06, + "loss": 5.2111, + "step": 789 + }, + { + "epoch": 0.5294240837696335, + "grad_norm": 2.775794267654419, + "learning_rate": 2.3061053211758554e-06, + "loss": 5.2005, + "step": 790 + }, + { + "epoch": 0.5300942408376963, + "grad_norm": 1.9597004652023315, + "learning_rate": 2.300804257836774e-06, + "loss": 4.9533, + "step": 791 + }, + { + "epoch": 0.5307643979057591, + "grad_norm": 2.0995726585388184, + "learning_rate": 2.2955040956924194e-06, + "loss": 5.0624, + "step": 792 + }, + { + "epoch": 0.531434554973822, + "grad_norm": 2.133856773376465, + "learning_rate": 2.290204858721609e-06, + "loss": 5.1067, + "step": 793 + }, + { + "epoch": 0.5321047120418848, + "grad_norm": 2.7437381744384766, + "learning_rate": 2.2849065708989737e-06, + "loss": 5.1492, + "step": 794 + }, + { + "epoch": 0.5327748691099476, + "grad_norm": 2.054792881011963, + "learning_rate": 2.279609256194851e-06, + "loss": 5.16, + "step": 795 + }, + { + "epoch": 0.5334450261780105, + "grad_norm": 2.819891929626465, + "learning_rate": 2.2743129385751736e-06, + "loss": 5.0842, + "step": 796 + }, + { + "epoch": 0.5341151832460733, + "grad_norm": 1.7283567190170288, + "learning_rate": 2.2690176420013663e-06, + "loss": 5.0836, + "step": 797 + }, + { + "epoch": 0.5347853403141362, + "grad_norm": 2.002793550491333, + "learning_rate": 2.263723390430232e-06, + "loss": 5.2371, + "step": 798 + }, + { + "epoch": 0.535455497382199, + "grad_norm": 1.953526496887207, + "learning_rate": 2.2584302078138477e-06, + "loss": 4.928, + "step": 799 + }, + { + "epoch": 0.5361256544502618, + "grad_norm": 2.3337416648864746, + "learning_rate": 2.2531381180994544e-06, + "loss": 5.1453, + "step": 800 + }, + { + "epoch": 0.5367958115183246, + "grad_norm": 2.313509464263916, + "learning_rate": 2.247847145229346e-06, + "loss": 5.4026, + "step": 801 + }, + { + "epoch": 0.5374659685863874, + "grad_norm": 2.409526824951172, + "learning_rate": 2.242557313140766e-06, + "loss": 5.394, + "step": 802 + }, + { + "epoch": 0.5381361256544502, + "grad_norm": 2.319575548171997, + "learning_rate": 2.2372686457657955e-06, + "loss": 5.0763, + "step": 803 + }, + { + "epoch": 0.5388062827225131, + "grad_norm": 2.0233211517333984, + "learning_rate": 2.2319811670312483e-06, + "loss": 4.9404, + "step": 804 + }, + { + "epoch": 0.5394764397905759, + "grad_norm": 2.419832468032837, + "learning_rate": 2.2266949008585593e-06, + "loss": 5.0889, + "step": 805 + }, + { + "epoch": 0.5401465968586388, + "grad_norm": 2.5951621532440186, + "learning_rate": 2.221409871163675e-06, + "loss": 4.7896, + "step": 806 + }, + { + "epoch": 0.5408167539267016, + "grad_norm": 2.0952606201171875, + "learning_rate": 2.2161261018569534e-06, + "loss": 5.1611, + "step": 807 + }, + { + "epoch": 0.5414869109947644, + "grad_norm": 2.287743330001831, + "learning_rate": 2.2108436168430446e-06, + "loss": 5.0981, + "step": 808 + }, + { + "epoch": 0.5421570680628273, + "grad_norm": 2.6961002349853516, + "learning_rate": 2.205562440020792e-06, + "loss": 5.0804, + "step": 809 + }, + { + "epoch": 0.5428272251308901, + "grad_norm": 2.004816770553589, + "learning_rate": 2.200282595283121e-06, + "loss": 5.1105, + "step": 810 + }, + { + "epoch": 0.5434973821989528, + "grad_norm": 2.5916943550109863, + "learning_rate": 2.195004106516927e-06, + "loss": 5.4152, + "step": 811 + }, + { + "epoch": 0.5441675392670157, + "grad_norm": 2.091641664505005, + "learning_rate": 2.1897269976029727e-06, + "loss": 5.3991, + "step": 812 + }, + { + "epoch": 0.5448376963350785, + "grad_norm": 1.8039250373840332, + "learning_rate": 2.1844512924157783e-06, + "loss": 5.1846, + "step": 813 + }, + { + "epoch": 0.5455078534031413, + "grad_norm": 2.0316944122314453, + "learning_rate": 2.1791770148235143e-06, + "loss": 5.3211, + "step": 814 + }, + { + "epoch": 0.5461780104712042, + "grad_norm": 2.534327745437622, + "learning_rate": 2.173904188687891e-06, + "loss": 5.0661, + "step": 815 + }, + { + "epoch": 0.546848167539267, + "grad_norm": 2.272688627243042, + "learning_rate": 2.1686328378640506e-06, + "loss": 5.0007, + "step": 816 + }, + { + "epoch": 0.5475183246073299, + "grad_norm": 2.319639205932617, + "learning_rate": 2.163362986200464e-06, + "loss": 5.2632, + "step": 817 + }, + { + "epoch": 0.5481884816753927, + "grad_norm": 2.070636510848999, + "learning_rate": 2.158094657538818e-06, + "loss": 4.8467, + "step": 818 + }, + { + "epoch": 0.5488586387434555, + "grad_norm": 2.220353364944458, + "learning_rate": 2.152827875713909e-06, + "loss": 5.0162, + "step": 819 + }, + { + "epoch": 0.5495287958115184, + "grad_norm": 2.3918845653533936, + "learning_rate": 2.147562664553537e-06, + "loss": 5.0224, + "step": 820 + }, + { + "epoch": 0.5501989528795812, + "grad_norm": 1.9870012998580933, + "learning_rate": 2.142299047878392e-06, + "loss": 5.0995, + "step": 821 + }, + { + "epoch": 0.5508691099476439, + "grad_norm": 2.232142448425293, + "learning_rate": 2.1370370495019543e-06, + "loss": 5.2807, + "step": 822 + }, + { + "epoch": 0.5515392670157068, + "grad_norm": 2.1024062633514404, + "learning_rate": 2.1317766932303812e-06, + "loss": 5.2492, + "step": 823 + }, + { + "epoch": 0.5522094240837696, + "grad_norm": 2.2490932941436768, + "learning_rate": 2.1265180028624017e-06, + "loss": 5.3037, + "step": 824 + }, + { + "epoch": 0.5528795811518324, + "grad_norm": 2.358008623123169, + "learning_rate": 2.121261002189206e-06, + "loss": 5.0627, + "step": 825 + }, + { + "epoch": 0.5535497382198953, + "grad_norm": 2.569955825805664, + "learning_rate": 2.1160057149943408e-06, + "loss": 5.2377, + "step": 826 + }, + { + "epoch": 0.5542198952879581, + "grad_norm": 2.1548383235931396, + "learning_rate": 2.110752165053602e-06, + "loss": 5.0036, + "step": 827 + }, + { + "epoch": 0.554890052356021, + "grad_norm": 2.5272021293640137, + "learning_rate": 2.105500376134925e-06, + "loss": 4.9154, + "step": 828 + }, + { + "epoch": 0.5555602094240838, + "grad_norm": 2.289073944091797, + "learning_rate": 2.100250371998278e-06, + "loss": 4.9875, + "step": 829 + }, + { + "epoch": 0.5562303664921466, + "grad_norm": 2.1486709117889404, + "learning_rate": 2.0950021763955526e-06, + "loss": 4.9589, + "step": 830 + }, + { + "epoch": 0.5569005235602095, + "grad_norm": 2.581888437271118, + "learning_rate": 2.089755813070462e-06, + "loss": 4.994, + "step": 831 + }, + { + "epoch": 0.5575706806282722, + "grad_norm": 2.5435903072357178, + "learning_rate": 2.0845113057584277e-06, + "loss": 5.399, + "step": 832 + }, + { + "epoch": 0.558240837696335, + "grad_norm": 2.547701835632324, + "learning_rate": 2.079268678186474e-06, + "loss": 4.9896, + "step": 833 + }, + { + "epoch": 0.5589109947643979, + "grad_norm": 2.241283655166626, + "learning_rate": 2.0740279540731234e-06, + "loss": 4.9348, + "step": 834 + }, + { + "epoch": 0.5595811518324607, + "grad_norm": 2.11767315864563, + "learning_rate": 2.0687891571282827e-06, + "loss": 5.0824, + "step": 835 + }, + { + "epoch": 0.5602513089005235, + "grad_norm": 2.2595574855804443, + "learning_rate": 2.0635523110531424e-06, + "loss": 5.0977, + "step": 836 + }, + { + "epoch": 0.5609214659685864, + "grad_norm": 2.1792337894439697, + "learning_rate": 2.0583174395400675e-06, + "loss": 5.1129, + "step": 837 + }, + { + "epoch": 0.5615916230366492, + "grad_norm": 1.6265062093734741, + "learning_rate": 2.0530845662724897e-06, + "loss": 5.2009, + "step": 838 + }, + { + "epoch": 0.5622617801047121, + "grad_norm": 2.008075475692749, + "learning_rate": 2.0478537149247985e-06, + "loss": 5.0015, + "step": 839 + }, + { + "epoch": 0.5629319371727749, + "grad_norm": 2.2118005752563477, + "learning_rate": 2.042624909162237e-06, + "loss": 5.0713, + "step": 840 + }, + { + "epoch": 0.5636020942408377, + "grad_norm": 2.1996700763702393, + "learning_rate": 2.037398172640793e-06, + "loss": 5.0876, + "step": 841 + }, + { + "epoch": 0.5642722513089006, + "grad_norm": 2.2208292484283447, + "learning_rate": 2.032173529007095e-06, + "loss": 5.0323, + "step": 842 + }, + { + "epoch": 0.5649424083769633, + "grad_norm": 2.339991569519043, + "learning_rate": 2.0269510018983003e-06, + "loss": 5.2139, + "step": 843 + }, + { + "epoch": 0.5656125654450261, + "grad_norm": 2.166781187057495, + "learning_rate": 2.0217306149419925e-06, + "loss": 5.0463, + "step": 844 + }, + { + "epoch": 0.566282722513089, + "grad_norm": 2.1559500694274902, + "learning_rate": 2.0165123917560708e-06, + "loss": 5.1012, + "step": 845 + }, + { + "epoch": 0.5669528795811518, + "grad_norm": 2.5520191192626953, + "learning_rate": 2.011296355948647e-06, + "loss": 5.0468, + "step": 846 + }, + { + "epoch": 0.5676230366492147, + "grad_norm": 2.157261848449707, + "learning_rate": 2.006082531117936e-06, + "loss": 5.1208, + "step": 847 + }, + { + "epoch": 0.5682931937172775, + "grad_norm": 2.015791416168213, + "learning_rate": 2.000870940852151e-06, + "loss": 5.0915, + "step": 848 + }, + { + "epoch": 0.5689633507853403, + "grad_norm": 2.281003713607788, + "learning_rate": 1.9956616087293943e-06, + "loss": 4.9733, + "step": 849 + }, + { + "epoch": 0.5696335078534032, + "grad_norm": 2.614745616912842, + "learning_rate": 1.990454558317551e-06, + "loss": 5.0554, + "step": 850 + }, + { + "epoch": 0.570303664921466, + "grad_norm": 1.9381040334701538, + "learning_rate": 1.985249813174186e-06, + "loss": 5.4856, + "step": 851 + }, + { + "epoch": 0.5709738219895288, + "grad_norm": 1.8871889114379883, + "learning_rate": 1.9800473968464333e-06, + "loss": 4.8442, + "step": 852 + }, + { + "epoch": 0.5716439790575917, + "grad_norm": 2.239786148071289, + "learning_rate": 1.974847332870891e-06, + "loss": 4.9804, + "step": 853 + }, + { + "epoch": 0.5723141361256544, + "grad_norm": 2.7184863090515137, + "learning_rate": 1.9696496447735162e-06, + "loss": 5.2111, + "step": 854 + }, + { + "epoch": 0.5729842931937172, + "grad_norm": 1.9862799644470215, + "learning_rate": 1.964454356069514e-06, + "loss": 5.0734, + "step": 855 + }, + { + "epoch": 0.5736544502617801, + "grad_norm": 2.061177968978882, + "learning_rate": 1.9592614902632374e-06, + "loss": 4.8461, + "step": 856 + }, + { + "epoch": 0.5743246073298429, + "grad_norm": 2.305881977081299, + "learning_rate": 1.954071070848076e-06, + "loss": 4.9771, + "step": 857 + }, + { + "epoch": 0.5749947643979058, + "grad_norm": 2.083867311477661, + "learning_rate": 1.948883121306353e-06, + "loss": 4.9486, + "step": 858 + }, + { + "epoch": 0.5756649214659686, + "grad_norm": 2.0957860946655273, + "learning_rate": 1.9436976651092143e-06, + "loss": 5.0704, + "step": 859 + }, + { + "epoch": 0.5763350785340314, + "grad_norm": 2.01287579536438, + "learning_rate": 1.93851472571653e-06, + "loss": 4.7694, + "step": 860 + }, + { + "epoch": 0.5770052356020943, + "grad_norm": 1.8653457164764404, + "learning_rate": 1.9333343265767803e-06, + "loss": 5.2827, + "step": 861 + }, + { + "epoch": 0.5776753926701571, + "grad_norm": 6.047257900238037, + "learning_rate": 1.9281564911269544e-06, + "loss": 5.1922, + "step": 862 + }, + { + "epoch": 0.5783455497382199, + "grad_norm": 1.9761441946029663, + "learning_rate": 1.9229812427924438e-06, + "loss": 5.304, + "step": 863 + }, + { + "epoch": 0.5790157068062827, + "grad_norm": 2.323146343231201, + "learning_rate": 1.9178086049869314e-06, + "loss": 5.2857, + "step": 864 + }, + { + "epoch": 0.5796858638743455, + "grad_norm": 2.059492349624634, + "learning_rate": 1.912638601112295e-06, + "loss": 4.705, + "step": 865 + }, + { + "epoch": 0.5803560209424083, + "grad_norm": 2.288398504257202, + "learning_rate": 1.9074712545584925e-06, + "loss": 5.0554, + "step": 866 + }, + { + "epoch": 0.5810261780104712, + "grad_norm": 2.4416613578796387, + "learning_rate": 1.9023065887034606e-06, + "loss": 5.2227, + "step": 867 + }, + { + "epoch": 0.581696335078534, + "grad_norm": 2.665241241455078, + "learning_rate": 1.8971446269130095e-06, + "loss": 5.1925, + "step": 868 + }, + { + "epoch": 0.5823664921465969, + "grad_norm": 2.535707950592041, + "learning_rate": 1.8919853925407122e-06, + "loss": 5.0185, + "step": 869 + }, + { + "epoch": 0.5830366492146597, + "grad_norm": 1.6442601680755615, + "learning_rate": 1.8868289089278066e-06, + "loss": 5.1138, + "step": 870 + }, + { + "epoch": 0.5837068062827225, + "grad_norm": 2.151127815246582, + "learning_rate": 1.8816751994030825e-06, + "loss": 4.977, + "step": 871 + }, + { + "epoch": 0.5843769633507854, + "grad_norm": 2.169908285140991, + "learning_rate": 1.876524287282781e-06, + "loss": 5.198, + "step": 872 + }, + { + "epoch": 0.5850471204188482, + "grad_norm": 2.1878390312194824, + "learning_rate": 1.8713761958704874e-06, + "loss": 4.8579, + "step": 873 + }, + { + "epoch": 0.585717277486911, + "grad_norm": 2.040858745574951, + "learning_rate": 1.8662309484570237e-06, + "loss": 5.185, + "step": 874 + }, + { + "epoch": 0.5863874345549738, + "grad_norm": 1.8941375017166138, + "learning_rate": 1.8610885683203467e-06, + "loss": 4.9263, + "step": 875 + }, + { + "epoch": 0.5870575916230366, + "grad_norm": 2.8969922065734863, + "learning_rate": 1.8559490787254423e-06, + "loss": 5.1801, + "step": 876 + }, + { + "epoch": 0.5877277486910994, + "grad_norm": 2.199195623397827, + "learning_rate": 1.8508125029242158e-06, + "loss": 5.1154, + "step": 877 + }, + { + "epoch": 0.5883979057591623, + "grad_norm": 2.813744306564331, + "learning_rate": 1.8456788641553947e-06, + "loss": 5.0422, + "step": 878 + }, + { + "epoch": 0.5890680628272251, + "grad_norm": 1.960178256034851, + "learning_rate": 1.840548185644413e-06, + "loss": 5.0718, + "step": 879 + }, + { + "epoch": 0.589738219895288, + "grad_norm": 2.070737361907959, + "learning_rate": 1.835420490603317e-06, + "loss": 5.2001, + "step": 880 + }, + { + "epoch": 0.5904083769633508, + "grad_norm": 2.2630679607391357, + "learning_rate": 1.8302958022306527e-06, + "loss": 5.285, + "step": 881 + }, + { + "epoch": 0.5910785340314136, + "grad_norm": 1.8324917554855347, + "learning_rate": 1.8251741437113646e-06, + "loss": 4.9438, + "step": 882 + }, + { + "epoch": 0.5917486910994765, + "grad_norm": 2.199841260910034, + "learning_rate": 1.8200555382166901e-06, + "loss": 4.932, + "step": 883 + }, + { + "epoch": 0.5924188481675393, + "grad_norm": 2.254441499710083, + "learning_rate": 1.8149400089040519e-06, + "loss": 4.9877, + "step": 884 + }, + { + "epoch": 0.593089005235602, + "grad_norm": 2.3117265701293945, + "learning_rate": 1.809827578916957e-06, + "loss": 5.1466, + "step": 885 + }, + { + "epoch": 0.5937591623036649, + "grad_norm": 2.0028316974639893, + "learning_rate": 1.8047182713848913e-06, + "loss": 5.1821, + "step": 886 + }, + { + "epoch": 0.5944293193717277, + "grad_norm": 2.3601784706115723, + "learning_rate": 1.7996121094232125e-06, + "loss": 5.0561, + "step": 887 + }, + { + "epoch": 0.5950994764397906, + "grad_norm": 1.9924535751342773, + "learning_rate": 1.7945091161330498e-06, + "loss": 5.1431, + "step": 888 + }, + { + "epoch": 0.5957696335078534, + "grad_norm": 2.1438329219818115, + "learning_rate": 1.7894093146011923e-06, + "loss": 5.374, + "step": 889 + }, + { + "epoch": 0.5964397905759162, + "grad_norm": 2.116299629211426, + "learning_rate": 1.7843127278999944e-06, + "loss": 5.1171, + "step": 890 + }, + { + "epoch": 0.5971099476439791, + "grad_norm": 2.721327781677246, + "learning_rate": 1.779219379087262e-06, + "loss": 4.9894, + "step": 891 + }, + { + "epoch": 0.5977801047120419, + "grad_norm": 2.1536357402801514, + "learning_rate": 1.774129291206155e-06, + "loss": 5.2512, + "step": 892 + }, + { + "epoch": 0.5984502617801047, + "grad_norm": 2.7440242767333984, + "learning_rate": 1.7690424872850776e-06, + "loss": 5.3032, + "step": 893 + }, + { + "epoch": 0.5991204188481676, + "grad_norm": 2.2082743644714355, + "learning_rate": 1.7639589903375798e-06, + "loss": 5.1788, + "step": 894 + }, + { + "epoch": 0.5997905759162304, + "grad_norm": 2.348146915435791, + "learning_rate": 1.7588788233622471e-06, + "loss": 4.9295, + "step": 895 + }, + { + "epoch": 0.6004607329842931, + "grad_norm": 2.086449146270752, + "learning_rate": 1.7538020093426028e-06, + "loss": 5.0697, + "step": 896 + }, + { + "epoch": 0.601130890052356, + "grad_norm": 2.2433080673217773, + "learning_rate": 1.7487285712470004e-06, + "loss": 5.1808, + "step": 897 + }, + { + "epoch": 0.6018010471204188, + "grad_norm": 2.728177547454834, + "learning_rate": 1.7436585320285165e-06, + "loss": 5.2392, + "step": 898 + }, + { + "epoch": 0.6024712041884817, + "grad_norm": 2.220369815826416, + "learning_rate": 1.7385919146248542e-06, + "loss": 4.7992, + "step": 899 + }, + { + "epoch": 0.6031413612565445, + "grad_norm": 1.9938991069793701, + "learning_rate": 1.7335287419582359e-06, + "loss": 4.9839, + "step": 900 + }, + { + "epoch": 0.6038115183246073, + "grad_norm": 2.1020009517669678, + "learning_rate": 1.728469036935297e-06, + "loss": 5.3504, + "step": 901 + }, + { + "epoch": 0.6044816753926702, + "grad_norm": 2.0839734077453613, + "learning_rate": 1.7234128224469876e-06, + "loss": 5.1468, + "step": 902 + }, + { + "epoch": 0.605151832460733, + "grad_norm": 1.8644421100616455, + "learning_rate": 1.7183601213684625e-06, + "loss": 4.9848, + "step": 903 + }, + { + "epoch": 0.6058219895287958, + "grad_norm": 2.2959697246551514, + "learning_rate": 1.7133109565589851e-06, + "loss": 5.0542, + "step": 904 + }, + { + "epoch": 0.6064921465968587, + "grad_norm": 2.755241632461548, + "learning_rate": 1.708265350861817e-06, + "loss": 5.2195, + "step": 905 + }, + { + "epoch": 0.6071623036649215, + "grad_norm": 2.443690776824951, + "learning_rate": 1.7032233271041195e-06, + "loss": 5.3558, + "step": 906 + }, + { + "epoch": 0.6078324607329842, + "grad_norm": 2.377474784851074, + "learning_rate": 1.6981849080968495e-06, + "loss": 5.1478, + "step": 907 + }, + { + "epoch": 0.6085026178010471, + "grad_norm": 2.394932985305786, + "learning_rate": 1.6931501166346522e-06, + "loss": 5.1642, + "step": 908 + }, + { + "epoch": 0.6091727748691099, + "grad_norm": 2.0526111125946045, + "learning_rate": 1.6881189754957634e-06, + "loss": 4.9269, + "step": 909 + }, + { + "epoch": 0.6098429319371728, + "grad_norm": 2.377007484436035, + "learning_rate": 1.6830915074419041e-06, + "loss": 5.1605, + "step": 910 + }, + { + "epoch": 0.6105130890052356, + "grad_norm": 2.189647912979126, + "learning_rate": 1.6780677352181781e-06, + "loss": 5.4189, + "step": 911 + }, + { + "epoch": 0.6111832460732984, + "grad_norm": 2.9951906204223633, + "learning_rate": 1.673047681552968e-06, + "loss": 5.0452, + "step": 912 + }, + { + "epoch": 0.6118534031413613, + "grad_norm": 2.1930408477783203, + "learning_rate": 1.6680313691578314e-06, + "loss": 4.9555, + "step": 913 + }, + { + "epoch": 0.6125235602094241, + "grad_norm": 2.541778802871704, + "learning_rate": 1.6630188207274034e-06, + "loss": 5.347, + "step": 914 + }, + { + "epoch": 0.613193717277487, + "grad_norm": 2.424107074737549, + "learning_rate": 1.6580100589392862e-06, + "loss": 4.8499, + "step": 915 + }, + { + "epoch": 0.6138638743455498, + "grad_norm": 2.1423468589782715, + "learning_rate": 1.6530051064539544e-06, + "loss": 4.891, + "step": 916 + }, + { + "epoch": 0.6145340314136125, + "grad_norm": 2.1596438884735107, + "learning_rate": 1.648003985914647e-06, + "loss": 5.2224, + "step": 917 + }, + { + "epoch": 0.6152041884816754, + "grad_norm": 2.459508180618286, + "learning_rate": 1.643006719947266e-06, + "loss": 4.9728, + "step": 918 + }, + { + "epoch": 0.6158743455497382, + "grad_norm": 2.7368266582489014, + "learning_rate": 1.6380133311602745e-06, + "loss": 5.0703, + "step": 919 + }, + { + "epoch": 0.616544502617801, + "grad_norm": 2.2180020809173584, + "learning_rate": 1.6330238421445959e-06, + "loss": 4.8884, + "step": 920 + }, + { + "epoch": 0.6172146596858639, + "grad_norm": 2.030007839202881, + "learning_rate": 1.628038275473511e-06, + "loss": 5.0728, + "step": 921 + }, + { + "epoch": 0.6178848167539267, + "grad_norm": 2.1178250312805176, + "learning_rate": 1.6230566537025539e-06, + "loss": 5.336, + "step": 922 + }, + { + "epoch": 0.6185549738219895, + "grad_norm": 2.596489191055298, + "learning_rate": 1.6180789993694092e-06, + "loss": 5.3047, + "step": 923 + }, + { + "epoch": 0.6192251308900524, + "grad_norm": 2.662217855453491, + "learning_rate": 1.6131053349938167e-06, + "loss": 5.0644, + "step": 924 + }, + { + "epoch": 0.6198952879581152, + "grad_norm": 2.6869699954986572, + "learning_rate": 1.6081356830774625e-06, + "loss": 4.8568, + "step": 925 + }, + { + "epoch": 0.620565445026178, + "grad_norm": 2.0684654712677, + "learning_rate": 1.60317006610388e-06, + "loss": 4.9038, + "step": 926 + }, + { + "epoch": 0.6212356020942409, + "grad_norm": 2.492082357406616, + "learning_rate": 1.5982085065383463e-06, + "loss": 4.9686, + "step": 927 + }, + { + "epoch": 0.6219057591623036, + "grad_norm": 1.890755534172058, + "learning_rate": 1.593251026827785e-06, + "loss": 5.0921, + "step": 928 + }, + { + "epoch": 0.6225759162303665, + "grad_norm": 2.640437126159668, + "learning_rate": 1.5882976494006597e-06, + "loss": 5.0312, + "step": 929 + }, + { + "epoch": 0.6232460732984293, + "grad_norm": 2.528960704803467, + "learning_rate": 1.583348396666875e-06, + "loss": 5.3454, + "step": 930 + }, + { + "epoch": 0.6239162303664921, + "grad_norm": 1.8781567811965942, + "learning_rate": 1.5784032910176767e-06, + "loss": 4.9509, + "step": 931 + }, + { + "epoch": 0.624586387434555, + "grad_norm": 2.1682565212249756, + "learning_rate": 1.5734623548255446e-06, + "loss": 5.0253, + "step": 932 + }, + { + "epoch": 0.6252565445026178, + "grad_norm": 2.2101004123687744, + "learning_rate": 1.5685256104440977e-06, + "loss": 5.0726, + "step": 933 + }, + { + "epoch": 0.6259267015706806, + "grad_norm": 2.197580337524414, + "learning_rate": 1.5635930802079905e-06, + "loss": 5.0462, + "step": 934 + }, + { + "epoch": 0.6265968586387435, + "grad_norm": 2.051706075668335, + "learning_rate": 1.558664786432812e-06, + "loss": 5.0463, + "step": 935 + }, + { + "epoch": 0.6272670157068063, + "grad_norm": 2.4994492530822754, + "learning_rate": 1.5537407514149843e-06, + "loss": 5.2271, + "step": 936 + }, + { + "epoch": 0.6279371727748692, + "grad_norm": 2.1373913288116455, + "learning_rate": 1.5488209974316608e-06, + "loss": 5.1098, + "step": 937 + }, + { + "epoch": 0.6286073298429319, + "grad_norm": 3.135277509689331, + "learning_rate": 1.5439055467406299e-06, + "loss": 5.1169, + "step": 938 + }, + { + "epoch": 0.6292774869109947, + "grad_norm": 2.1849703788757324, + "learning_rate": 1.5389944215802093e-06, + "loss": 5.1895, + "step": 939 + }, + { + "epoch": 0.6299476439790576, + "grad_norm": 2.2958791255950928, + "learning_rate": 1.5340876441691465e-06, + "loss": 5.2696, + "step": 940 + }, + { + "epoch": 0.6306178010471204, + "grad_norm": 2.3132193088531494, + "learning_rate": 1.5291852367065224e-06, + "loss": 5.2749, + "step": 941 + }, + { + "epoch": 0.6312879581151832, + "grad_norm": 3.056617021560669, + "learning_rate": 1.5242872213716426e-06, + "loss": 4.9438, + "step": 942 + }, + { + "epoch": 0.6319581151832461, + "grad_norm": 1.7447956800460815, + "learning_rate": 1.5193936203239452e-06, + "loss": 5.0243, + "step": 943 + }, + { + "epoch": 0.6326282722513089, + "grad_norm": 2.151508092880249, + "learning_rate": 1.514504455702897e-06, + "loss": 5.2879, + "step": 944 + }, + { + "epoch": 0.6332984293193717, + "grad_norm": 2.2350332736968994, + "learning_rate": 1.5096197496278942e-06, + "loss": 5.2136, + "step": 945 + }, + { + "epoch": 0.6339685863874346, + "grad_norm": 2.3778679370880127, + "learning_rate": 1.5047395241981606e-06, + "loss": 5.215, + "step": 946 + }, + { + "epoch": 0.6346387434554974, + "grad_norm": 2.2321999073028564, + "learning_rate": 1.4998638014926476e-06, + "loss": 5.1403, + "step": 947 + }, + { + "epoch": 0.6353089005235603, + "grad_norm": 2.958186388015747, + "learning_rate": 1.4949926035699382e-06, + "loss": 4.9567, + "step": 948 + }, + { + "epoch": 0.635979057591623, + "grad_norm": 1.9767028093338013, + "learning_rate": 1.4901259524681444e-06, + "loss": 5.1582, + "step": 949 + }, + { + "epoch": 0.6366492146596858, + "grad_norm": 5.674414157867432, + "learning_rate": 1.485263870204805e-06, + "loss": 4.9456, + "step": 950 + }, + { + "epoch": 0.6373193717277487, + "grad_norm": 2.754943370819092, + "learning_rate": 1.4804063787767924e-06, + "loss": 5.4178, + "step": 951 + }, + { + "epoch": 0.6379895287958115, + "grad_norm": 2.0564680099487305, + "learning_rate": 1.4755535001602044e-06, + "loss": 5.1593, + "step": 952 + }, + { + "epoch": 0.6386596858638743, + "grad_norm": 2.3718740940093994, + "learning_rate": 1.470705256310275e-06, + "loss": 5.0125, + "step": 953 + }, + { + "epoch": 0.6393298429319372, + "grad_norm": 2.3798463344573975, + "learning_rate": 1.4658616691612653e-06, + "loss": 5.1758, + "step": 954 + }, + { + "epoch": 0.64, + "grad_norm": 2.253955602645874, + "learning_rate": 1.4610227606263718e-06, + "loss": 5.0712, + "step": 955 + }, + { + "epoch": 0.6406701570680629, + "grad_norm": 2.273934841156006, + "learning_rate": 1.4561885525976227e-06, + "loss": 5.1689, + "step": 956 + }, + { + "epoch": 0.6413403141361257, + "grad_norm": 2.539167881011963, + "learning_rate": 1.4513590669457798e-06, + "loss": 5.1739, + "step": 957 + }, + { + "epoch": 0.6420104712041885, + "grad_norm": 2.5597174167633057, + "learning_rate": 1.4465343255202425e-06, + "loss": 5.2156, + "step": 958 + }, + { + "epoch": 0.6426806282722514, + "grad_norm": 2.087078094482422, + "learning_rate": 1.4417143501489428e-06, + "loss": 4.8461, + "step": 959 + }, + { + "epoch": 0.6433507853403141, + "grad_norm": 2.6639723777770996, + "learning_rate": 1.436899162638255e-06, + "loss": 5.0139, + "step": 960 + }, + { + "epoch": 0.6440209424083769, + "grad_norm": 2.5338354110717773, + "learning_rate": 1.4320887847728864e-06, + "loss": 5.381, + "step": 961 + }, + { + "epoch": 0.6446910994764398, + "grad_norm": 2.2862353324890137, + "learning_rate": 1.4272832383157902e-06, + "loss": 5.1397, + "step": 962 + }, + { + "epoch": 0.6453612565445026, + "grad_norm": 2.38358998298645, + "learning_rate": 1.422482545008059e-06, + "loss": 5.1036, + "step": 963 + }, + { + "epoch": 0.6460314136125654, + "grad_norm": 2.191732168197632, + "learning_rate": 1.41768672656883e-06, + "loss": 5.3168, + "step": 964 + }, + { + "epoch": 0.6467015706806283, + "grad_norm": 2.504222869873047, + "learning_rate": 1.4128958046951835e-06, + "loss": 5.0436, + "step": 965 + }, + { + "epoch": 0.6473717277486911, + "grad_norm": 2.3530819416046143, + "learning_rate": 1.4081098010620481e-06, + "loss": 5.3618, + "step": 966 + }, + { + "epoch": 0.648041884816754, + "grad_norm": 2.2972545623779297, + "learning_rate": 1.4033287373221022e-06, + "loss": 5.1409, + "step": 967 + }, + { + "epoch": 0.6487120418848168, + "grad_norm": 2.2007195949554443, + "learning_rate": 1.398552635105675e-06, + "loss": 5.3514, + "step": 968 + }, + { + "epoch": 0.6493821989528796, + "grad_norm": 2.092994213104248, + "learning_rate": 1.3937815160206485e-06, + "loss": 5.1033, + "step": 969 + }, + { + "epoch": 0.6500523560209424, + "grad_norm": 1.8113712072372437, + "learning_rate": 1.3890154016523615e-06, + "loss": 4.8722, + "step": 970 + }, + { + "epoch": 0.6507225130890052, + "grad_norm": 2.0513923168182373, + "learning_rate": 1.3842543135635075e-06, + "loss": 5.2054, + "step": 971 + }, + { + "epoch": 0.651392670157068, + "grad_norm": 2.532233238220215, + "learning_rate": 1.3794982732940444e-06, + "loss": 5.0319, + "step": 972 + }, + { + "epoch": 0.6520628272251309, + "grad_norm": 2.2876851558685303, + "learning_rate": 1.3747473023610885e-06, + "loss": 5.084, + "step": 973 + }, + { + "epoch": 0.6527329842931937, + "grad_norm": 1.9787242412567139, + "learning_rate": 1.3700014222588256e-06, + "loss": 4.9917, + "step": 974 + }, + { + "epoch": 0.6534031413612565, + "grad_norm": 2.3899760246276855, + "learning_rate": 1.3652606544584086e-06, + "loss": 5.1363, + "step": 975 + }, + { + "epoch": 0.6540732984293194, + "grad_norm": 2.2005436420440674, + "learning_rate": 1.3605250204078585e-06, + "loss": 5.0114, + "step": 976 + }, + { + "epoch": 0.6547434554973822, + "grad_norm": 2.094499349594116, + "learning_rate": 1.3557945415319746e-06, + "loss": 5.2157, + "step": 977 + }, + { + "epoch": 0.6554136125654451, + "grad_norm": 2.4904541969299316, + "learning_rate": 1.3510692392322316e-06, + "loss": 5.2516, + "step": 978 + }, + { + "epoch": 0.6560837696335079, + "grad_norm": 1.9249300956726074, + "learning_rate": 1.3463491348866837e-06, + "loss": 5.1889, + "step": 979 + }, + { + "epoch": 0.6567539267015707, + "grad_norm": 2.2775089740753174, + "learning_rate": 1.3416342498498703e-06, + "loss": 5.2225, + "step": 980 + }, + { + "epoch": 0.6574240837696335, + "grad_norm": 2.057075262069702, + "learning_rate": 1.3369246054527152e-06, + "loss": 5.279, + "step": 981 + }, + { + "epoch": 0.6580942408376963, + "grad_norm": 3.090810775756836, + "learning_rate": 1.3322202230024356e-06, + "loss": 5.0438, + "step": 982 + }, + { + "epoch": 0.6587643979057591, + "grad_norm": 2.0964043140411377, + "learning_rate": 1.3275211237824415e-06, + "loss": 5.1803, + "step": 983 + }, + { + "epoch": 0.659434554973822, + "grad_norm": 2.631629228591919, + "learning_rate": 1.3228273290522403e-06, + "loss": 4.9872, + "step": 984 + }, + { + "epoch": 0.6601047120418848, + "grad_norm": 2.3109307289123535, + "learning_rate": 1.3181388600473433e-06, + "loss": 5.0663, + "step": 985 + }, + { + "epoch": 0.6607748691099476, + "grad_norm": 2.0783398151397705, + "learning_rate": 1.3134557379791636e-06, + "loss": 5.0157, + "step": 986 + }, + { + "epoch": 0.6614450261780105, + "grad_norm": 1.9334152936935425, + "learning_rate": 1.308777984034925e-06, + "loss": 4.9186, + "step": 987 + }, + { + "epoch": 0.6621151832460733, + "grad_norm": 2.0614535808563232, + "learning_rate": 1.3041056193775667e-06, + "loss": 5.1108, + "step": 988 + }, + { + "epoch": 0.6627853403141362, + "grad_norm": 2.4771032333374023, + "learning_rate": 1.2994386651456454e-06, + "loss": 5.0904, + "step": 989 + }, + { + "epoch": 0.663455497382199, + "grad_norm": 2.3995018005371094, + "learning_rate": 1.2947771424532399e-06, + "loss": 5.1486, + "step": 990 + }, + { + "epoch": 0.6641256544502617, + "grad_norm": 2.0989136695861816, + "learning_rate": 1.290121072389854e-06, + "loss": 5.1089, + "step": 991 + }, + { + "epoch": 0.6647958115183246, + "grad_norm": 2.0078938007354736, + "learning_rate": 1.2854704760203265e-06, + "loss": 5.327, + "step": 992 + }, + { + "epoch": 0.6654659685863874, + "grad_norm": 2.3669705390930176, + "learning_rate": 1.280825374384728e-06, + "loss": 5.0939, + "step": 993 + }, + { + "epoch": 0.6661361256544502, + "grad_norm": 2.0225884914398193, + "learning_rate": 1.2761857884982748e-06, + "loss": 5.1236, + "step": 994 + }, + { + "epoch": 0.6668062827225131, + "grad_norm": 2.120720863342285, + "learning_rate": 1.2715517393512239e-06, + "loss": 5.0135, + "step": 995 + }, + { + "epoch": 0.6674764397905759, + "grad_norm": 2.4720404148101807, + "learning_rate": 1.2669232479087868e-06, + "loss": 5.202, + "step": 996 + }, + { + "epoch": 0.6681465968586388, + "grad_norm": 2.271608352661133, + "learning_rate": 1.2623003351110302e-06, + "loss": 4.9941, + "step": 997 + }, + { + "epoch": 0.6688167539267016, + "grad_norm": 2.137437343597412, + "learning_rate": 1.2576830218727817e-06, + "loss": 4.9493, + "step": 998 + }, + { + "epoch": 0.6694869109947644, + "grad_norm": 2.634817600250244, + "learning_rate": 1.253071329083537e-06, + "loss": 5.2495, + "step": 999 + }, + { + "epoch": 0.6701570680628273, + "grad_norm": 2.1700284481048584, + "learning_rate": 1.2484652776073613e-06, + "loss": 5.0001, + "step": 1000 + }, + { + "epoch": 0.6708272251308901, + "grad_norm": 2.2349369525909424, + "learning_rate": 1.2438648882827984e-06, + "loss": 5.0308, + "step": 1001 + }, + { + "epoch": 0.6714973821989528, + "grad_norm": 2.265810966491699, + "learning_rate": 1.2392701819227768e-06, + "loss": 5.1102, + "step": 1002 + }, + { + "epoch": 0.6721675392670157, + "grad_norm": 2.1368231773376465, + "learning_rate": 1.2346811793145135e-06, + "loss": 4.8558, + "step": 1003 + }, + { + "epoch": 0.6728376963350785, + "grad_norm": 2.780846118927002, + "learning_rate": 1.2300979012194222e-06, + "loss": 5.0876, + "step": 1004 + }, + { + "epoch": 0.6735078534031413, + "grad_norm": 2.227250814437866, + "learning_rate": 1.2255203683730144e-06, + "loss": 5.2692, + "step": 1005 + }, + { + "epoch": 0.6741780104712042, + "grad_norm": 1.9963433742523193, + "learning_rate": 1.2209486014848138e-06, + "loss": 5.0172, + "step": 1006 + }, + { + "epoch": 0.674848167539267, + "grad_norm": 2.1841299533843994, + "learning_rate": 1.216382621238253e-06, + "loss": 5.1824, + "step": 1007 + }, + { + "epoch": 0.6755183246073299, + "grad_norm": 2.150474786758423, + "learning_rate": 1.2118224482905892e-06, + "loss": 5.2183, + "step": 1008 + }, + { + "epoch": 0.6761884816753927, + "grad_norm": 2.589063882827759, + "learning_rate": 1.2072681032728048e-06, + "loss": 5.0682, + "step": 1009 + }, + { + "epoch": 0.6768586387434555, + "grad_norm": 2.179386615753174, + "learning_rate": 1.2027196067895138e-06, + "loss": 4.9421, + "step": 1010 + }, + { + "epoch": 0.6775287958115184, + "grad_norm": 1.9265069961547852, + "learning_rate": 1.1981769794188735e-06, + "loss": 5.1369, + "step": 1011 + }, + { + "epoch": 0.6781989528795812, + "grad_norm": 2.187723159790039, + "learning_rate": 1.1936402417124865e-06, + "loss": 5.0849, + "step": 1012 + }, + { + "epoch": 0.6788691099476439, + "grad_norm": 2.1362497806549072, + "learning_rate": 1.1891094141953114e-06, + "loss": 5.1315, + "step": 1013 + }, + { + "epoch": 0.6795392670157068, + "grad_norm": 2.778053045272827, + "learning_rate": 1.1845845173655658e-06, + "loss": 5.1386, + "step": 1014 + }, + { + "epoch": 0.6802094240837696, + "grad_norm": 2.422011137008667, + "learning_rate": 1.1800655716946355e-06, + "loss": 5.2709, + "step": 1015 + }, + { + "epoch": 0.6808795811518324, + "grad_norm": 3.0462863445281982, + "learning_rate": 1.1755525976269851e-06, + "loss": 5.0757, + "step": 1016 + }, + { + "epoch": 0.6815497382198953, + "grad_norm": 2.319657564163208, + "learning_rate": 1.1710456155800612e-06, + "loss": 5.2053, + "step": 1017 + }, + { + "epoch": 0.6822198952879581, + "grad_norm": 1.9604647159576416, + "learning_rate": 1.1665446459442014e-06, + "loss": 4.9627, + "step": 1018 + }, + { + "epoch": 0.682890052356021, + "grad_norm": 2.1266629695892334, + "learning_rate": 1.1620497090825434e-06, + "loss": 5.0072, + "step": 1019 + }, + { + "epoch": 0.6835602094240838, + "grad_norm": 2.379615068435669, + "learning_rate": 1.157560825330929e-06, + "loss": 4.9827, + "step": 1020 + }, + { + "epoch": 0.6842303664921466, + "grad_norm": 1.7763530015945435, + "learning_rate": 1.153078014997816e-06, + "loss": 4.9343, + "step": 1021 + }, + { + "epoch": 0.6849005235602095, + "grad_norm": 1.9431225061416626, + "learning_rate": 1.1486012983641858e-06, + "loss": 5.3294, + "step": 1022 + }, + { + "epoch": 0.6855706806282722, + "grad_norm": 2.3577218055725098, + "learning_rate": 1.1441306956834506e-06, + "loss": 4.8451, + "step": 1023 + }, + { + "epoch": 0.686240837696335, + "grad_norm": 2.1077463626861572, + "learning_rate": 1.1396662271813625e-06, + "loss": 5.1231, + "step": 1024 + }, + { + "epoch": 0.6869109947643979, + "grad_norm": 2.119215250015259, + "learning_rate": 1.135207913055919e-06, + "loss": 5.1054, + "step": 1025 + }, + { + "epoch": 0.6875811518324607, + "grad_norm": 1.9343498945236206, + "learning_rate": 1.130755773477278e-06, + "loss": 4.9137, + "step": 1026 + }, + { + "epoch": 0.6882513089005236, + "grad_norm": 1.9753541946411133, + "learning_rate": 1.1263098285876592e-06, + "loss": 5.0259, + "step": 1027 + }, + { + "epoch": 0.6889214659685864, + "grad_norm": 1.7094663381576538, + "learning_rate": 1.1218700985012584e-06, + "loss": 5.3622, + "step": 1028 + }, + { + "epoch": 0.6895916230366492, + "grad_norm": 2.331881046295166, + "learning_rate": 1.1174366033041555e-06, + "loss": 5.1145, + "step": 1029 + }, + { + "epoch": 0.6902617801047121, + "grad_norm": 2.7443439960479736, + "learning_rate": 1.11300936305422e-06, + "loss": 5.2896, + "step": 1030 + }, + { + "epoch": 0.6909319371727749, + "grad_norm": 2.5008018016815186, + "learning_rate": 1.1085883977810247e-06, + "loss": 5.1273, + "step": 1031 + }, + { + "epoch": 0.6916020942408377, + "grad_norm": 2.0488998889923096, + "learning_rate": 1.1041737274857533e-06, + "loss": 5.3134, + "step": 1032 + }, + { + "epoch": 0.6922722513089006, + "grad_norm": 1.9997189044952393, + "learning_rate": 1.099765372141111e-06, + "loss": 4.9215, + "step": 1033 + }, + { + "epoch": 0.6929424083769633, + "grad_norm": 2.278038740158081, + "learning_rate": 1.095363351691231e-06, + "loss": 5.3046, + "step": 1034 + }, + { + "epoch": 0.6936125654450261, + "grad_norm": 2.224224805831909, + "learning_rate": 1.090967686051586e-06, + "loss": 4.8629, + "step": 1035 + }, + { + "epoch": 0.694282722513089, + "grad_norm": 1.8891581296920776, + "learning_rate": 1.0865783951089005e-06, + "loss": 4.8682, + "step": 1036 + }, + { + "epoch": 0.6949528795811518, + "grad_norm": 2.5251624584198, + "learning_rate": 1.0821954987210584e-06, + "loss": 5.1834, + "step": 1037 + }, + { + "epoch": 0.6956230366492147, + "grad_norm": 1.9501714706420898, + "learning_rate": 1.0778190167170145e-06, + "loss": 4.988, + "step": 1038 + }, + { + "epoch": 0.6962931937172775, + "grad_norm": 2.3251214027404785, + "learning_rate": 1.0734489688967001e-06, + "loss": 4.9515, + "step": 1039 + }, + { + "epoch": 0.6969633507853403, + "grad_norm": 2.245398998260498, + "learning_rate": 1.0690853750309417e-06, + "loss": 5.034, + "step": 1040 + }, + { + "epoch": 0.6976335078534032, + "grad_norm": 2.169013261795044, + "learning_rate": 1.0647282548613636e-06, + "loss": 5.1129, + "step": 1041 + }, + { + "epoch": 0.698303664921466, + "grad_norm": 2.124728202819824, + "learning_rate": 1.0603776281003037e-06, + "loss": 4.9932, + "step": 1042 + }, + { + "epoch": 0.6989738219895288, + "grad_norm": 2.4038686752319336, + "learning_rate": 1.0560335144307238e-06, + "loss": 5.0806, + "step": 1043 + }, + { + "epoch": 0.6996439790575916, + "grad_norm": 2.3839521408081055, + "learning_rate": 1.0516959335061157e-06, + "loss": 5.2674, + "step": 1044 + }, + { + "epoch": 0.7003141361256544, + "grad_norm": 2.4349517822265625, + "learning_rate": 1.0473649049504195e-06, + "loss": 5.0355, + "step": 1045 + }, + { + "epoch": 0.7009842931937172, + "grad_norm": 2.201889991760254, + "learning_rate": 1.0430404483579299e-06, + "loss": 4.9743, + "step": 1046 + }, + { + "epoch": 0.7016544502617801, + "grad_norm": 1.9042011499404907, + "learning_rate": 1.03872258329321e-06, + "loss": 4.9538, + "step": 1047 + }, + { + "epoch": 0.7023246073298429, + "grad_norm": 1.9022135734558105, + "learning_rate": 1.0344113292909993e-06, + "loss": 5.0116, + "step": 1048 + }, + { + "epoch": 0.7029947643979058, + "grad_norm": 2.583439588546753, + "learning_rate": 1.0301067058561296e-06, + "loss": 5.1247, + "step": 1049 + }, + { + "epoch": 0.7036649214659686, + "grad_norm": 2.090341329574585, + "learning_rate": 1.025808732463434e-06, + "loss": 5.0901, + "step": 1050 + }, + { + "epoch": 0.7043350785340314, + "grad_norm": 2.315129280090332, + "learning_rate": 1.0215174285576615e-06, + "loss": 5.0314, + "step": 1051 + }, + { + "epoch": 0.7050052356020943, + "grad_norm": 2.396319627761841, + "learning_rate": 1.0172328135533852e-06, + "loss": 5.2357, + "step": 1052 + }, + { + "epoch": 0.7056753926701571, + "grad_norm": 2.5406458377838135, + "learning_rate": 1.0129549068349185e-06, + "loss": 5.1052, + "step": 1053 + }, + { + "epoch": 0.70634554973822, + "grad_norm": 1.9954371452331543, + "learning_rate": 1.008683727756223e-06, + "loss": 5.1479, + "step": 1054 + }, + { + "epoch": 0.7070157068062827, + "grad_norm": 2.01151967048645, + "learning_rate": 1.0044192956408241e-06, + "loss": 5.1476, + "step": 1055 + }, + { + "epoch": 0.7076858638743455, + "grad_norm": 1.9611108303070068, + "learning_rate": 1.0001616297817238e-06, + "loss": 5.0162, + "step": 1056 + }, + { + "epoch": 0.7083560209424083, + "grad_norm": 2.1401920318603516, + "learning_rate": 9.959107494413118e-07, + "loss": 5.1986, + "step": 1057 + }, + { + "epoch": 0.7090261780104712, + "grad_norm": 2.2252449989318848, + "learning_rate": 9.9166667385128e-07, + "loss": 5.2287, + "step": 1058 + }, + { + "epoch": 0.709696335078534, + "grad_norm": 2.3955202102661133, + "learning_rate": 9.874294222125316e-07, + "loss": 5.1653, + "step": 1059 + }, + { + "epoch": 0.7103664921465969, + "grad_norm": 2.327396869659424, + "learning_rate": 9.831990136951003e-07, + "loss": 4.9633, + "step": 1060 + }, + { + "epoch": 0.7110366492146597, + "grad_norm": 1.979533314704895, + "learning_rate": 9.789754674380597e-07, + "loss": 5.1874, + "step": 1061 + }, + { + "epoch": 0.7117068062827225, + "grad_norm": 2.0694804191589355, + "learning_rate": 9.747588025494351e-07, + "loss": 5.1106, + "step": 1062 + }, + { + "epoch": 0.7123769633507854, + "grad_norm": 2.1586427688598633, + "learning_rate": 9.705490381061227e-07, + "loss": 5.7843, + "step": 1063 + }, + { + "epoch": 0.7130471204188482, + "grad_norm": 2.1827874183654785, + "learning_rate": 9.663461931537965e-07, + "loss": 5.0521, + "step": 1064 + }, + { + "epoch": 0.7137172774869109, + "grad_norm": 1.917332649230957, + "learning_rate": 9.621502867068286e-07, + "loss": 4.7877, + "step": 1065 + }, + { + "epoch": 0.7143874345549738, + "grad_norm": 2.061023473739624, + "learning_rate": 9.57961337748198e-07, + "loss": 5.1603, + "step": 1066 + }, + { + "epoch": 0.7150575916230366, + "grad_norm": 2.5405564308166504, + "learning_rate": 9.537793652294096e-07, + "loss": 4.9064, + "step": 1067 + }, + { + "epoch": 0.7157277486910995, + "grad_norm": 2.10152268409729, + "learning_rate": 9.496043880704025e-07, + "loss": 4.9259, + "step": 1068 + }, + { + "epoch": 0.7163979057591623, + "grad_norm": 2.14532732963562, + "learning_rate": 9.454364251594683e-07, + "loss": 4.8689, + "step": 1069 + }, + { + "epoch": 0.7170680628272251, + "grad_norm": 2.5526065826416016, + "learning_rate": 9.412754953531664e-07, + "loss": 5.0766, + "step": 1070 + }, + { + "epoch": 0.717738219895288, + "grad_norm": 2.3036630153656006, + "learning_rate": 9.371216174762365e-07, + "loss": 5.0425, + "step": 1071 + }, + { + "epoch": 0.7184083769633508, + "grad_norm": 2.6235127449035645, + "learning_rate": 9.329748103215153e-07, + "loss": 5.1771, + "step": 1072 + }, + { + "epoch": 0.7190785340314136, + "grad_norm": 2.0794284343719482, + "learning_rate": 9.288350926498474e-07, + "loss": 5.0205, + "step": 1073 + }, + { + "epoch": 0.7197486910994765, + "grad_norm": 2.227818489074707, + "learning_rate": 9.247024831900065e-07, + "loss": 5.1375, + "step": 1074 + }, + { + "epoch": 0.7204188481675393, + "grad_norm": 2.365229606628418, + "learning_rate": 9.205770006386066e-07, + "loss": 4.9339, + "step": 1075 + }, + { + "epoch": 0.721089005235602, + "grad_norm": 2.539462089538574, + "learning_rate": 9.164586636600168e-07, + "loss": 5.1168, + "step": 1076 + }, + { + "epoch": 0.7217591623036649, + "grad_norm": 1.930780053138733, + "learning_rate": 9.123474908862817e-07, + "loss": 4.8905, + "step": 1077 + }, + { + "epoch": 0.7224293193717277, + "grad_norm": 2.19105863571167, + "learning_rate": 9.082435009170298e-07, + "loss": 5.0776, + "step": 1078 + }, + { + "epoch": 0.7230994764397906, + "grad_norm": 2.0612986087799072, + "learning_rate": 9.041467123193967e-07, + "loss": 4.9747, + "step": 1079 + }, + { + "epoch": 0.7237696335078534, + "grad_norm": 2.2077128887176514, + "learning_rate": 9.000571436279363e-07, + "loss": 5.2412, + "step": 1080 + }, + { + "epoch": 0.7244397905759162, + "grad_norm": 2.2451539039611816, + "learning_rate": 8.959748133445401e-07, + "loss": 5.2, + "step": 1081 + }, + { + "epoch": 0.7251099476439791, + "grad_norm": 2.2490241527557373, + "learning_rate": 8.918997399383491e-07, + "loss": 4.871, + "step": 1082 + }, + { + "epoch": 0.7257801047120419, + "grad_norm": 2.010239839553833, + "learning_rate": 8.878319418456738e-07, + "loss": 5.3053, + "step": 1083 + }, + { + "epoch": 0.7264502617801047, + "grad_norm": 2.4550492763519287, + "learning_rate": 8.837714374699116e-07, + "loss": 5.1626, + "step": 1084 + }, + { + "epoch": 0.7271204188481676, + "grad_norm": 2.0976195335388184, + "learning_rate": 8.797182451814603e-07, + "loss": 5.2316, + "step": 1085 + }, + { + "epoch": 0.7277905759162304, + "grad_norm": 1.84067964553833, + "learning_rate": 8.756723833176376e-07, + "loss": 4.9795, + "step": 1086 + }, + { + "epoch": 0.7284607329842931, + "grad_norm": 2.3378286361694336, + "learning_rate": 8.716338701825974e-07, + "loss": 4.9382, + "step": 1087 + }, + { + "epoch": 0.729130890052356, + "grad_norm": 2.4951651096343994, + "learning_rate": 8.676027240472445e-07, + "loss": 4.9807, + "step": 1088 + }, + { + "epoch": 0.7298010471204188, + "grad_norm": 2.410188913345337, + "learning_rate": 8.63578963149157e-07, + "loss": 4.8992, + "step": 1089 + }, + { + "epoch": 0.7304712041884817, + "grad_norm": 2.0807738304138184, + "learning_rate": 8.595626056924983e-07, + "loss": 5.0869, + "step": 1090 + }, + { + "epoch": 0.7311413612565445, + "grad_norm": 2.450817108154297, + "learning_rate": 8.555536698479395e-07, + "loss": 5.0584, + "step": 1091 + }, + { + "epoch": 0.7318115183246073, + "grad_norm": 2.6124391555786133, + "learning_rate": 8.515521737525748e-07, + "loss": 5.0537, + "step": 1092 + }, + { + "epoch": 0.7324816753926702, + "grad_norm": 2.224575996398926, + "learning_rate": 8.47558135509838e-07, + "loss": 5.165, + "step": 1093 + }, + { + "epoch": 0.733151832460733, + "grad_norm": 2.324970006942749, + "learning_rate": 8.435715731894245e-07, + "loss": 5.2625, + "step": 1094 + }, + { + "epoch": 0.7338219895287958, + "grad_norm": 3.057370185852051, + "learning_rate": 8.395925048272069e-07, + "loss": 4.9905, + "step": 1095 + }, + { + "epoch": 0.7344921465968587, + "grad_norm": 2.1883432865142822, + "learning_rate": 8.356209484251521e-07, + "loss": 4.9127, + "step": 1096 + }, + { + "epoch": 0.7351623036649214, + "grad_norm": 2.510040283203125, + "learning_rate": 8.316569219512446e-07, + "loss": 5.1409, + "step": 1097 + }, + { + "epoch": 0.7358324607329843, + "grad_norm": 2.030768394470215, + "learning_rate": 8.27700443339399e-07, + "loss": 4.9366, + "step": 1098 + }, + { + "epoch": 0.7365026178010471, + "grad_norm": 2.413208246231079, + "learning_rate": 8.237515304893845e-07, + "loss": 4.7532, + "step": 1099 + }, + { + "epoch": 0.7371727748691099, + "grad_norm": 2.9401113986968994, + "learning_rate": 8.198102012667409e-07, + "loss": 5.2135, + "step": 1100 + }, + { + "epoch": 0.7378429319371728, + "grad_norm": 2.438283681869507, + "learning_rate": 8.158764735026989e-07, + "loss": 4.8833, + "step": 1101 + }, + { + "epoch": 0.7385130890052356, + "grad_norm": 1.9578114748001099, + "learning_rate": 8.119503649940969e-07, + "loss": 4.7994, + "step": 1102 + }, + { + "epoch": 0.7391832460732984, + "grad_norm": 2.337479591369629, + "learning_rate": 8.080318935033057e-07, + "loss": 5.0201, + "step": 1103 + }, + { + "epoch": 0.7398534031413613, + "grad_norm": 2.590935707092285, + "learning_rate": 8.041210767581415e-07, + "loss": 5.0649, + "step": 1104 + }, + { + "epoch": 0.7405235602094241, + "grad_norm": 2.108517646789551, + "learning_rate": 8.002179324517914e-07, + "loss": 5.0238, + "step": 1105 + }, + { + "epoch": 0.741193717277487, + "grad_norm": 2.358654022216797, + "learning_rate": 7.96322478242732e-07, + "loss": 4.8828, + "step": 1106 + }, + { + "epoch": 0.7418638743455498, + "grad_norm": 2.0274851322174072, + "learning_rate": 7.92434731754645e-07, + "loss": 5.159, + "step": 1107 + }, + { + "epoch": 0.7425340314136125, + "grad_norm": 1.9625911712646484, + "learning_rate": 7.885547105763444e-07, + "loss": 5.2291, + "step": 1108 + }, + { + "epoch": 0.7432041884816754, + "grad_norm": 2.2201850414276123, + "learning_rate": 7.84682432261693e-07, + "loss": 4.7749, + "step": 1109 + }, + { + "epoch": 0.7438743455497382, + "grad_norm": 2.2542552947998047, + "learning_rate": 7.808179143295219e-07, + "loss": 4.7366, + "step": 1110 + }, + { + "epoch": 0.744544502617801, + "grad_norm": 2.288684368133545, + "learning_rate": 7.769611742635555e-07, + "loss": 5.3394, + "step": 1111 + }, + { + "epoch": 0.7452146596858639, + "grad_norm": 1.952508807182312, + "learning_rate": 7.731122295123267e-07, + "loss": 5.0954, + "step": 1112 + }, + { + "epoch": 0.7458848167539267, + "grad_norm": 2.026873826980591, + "learning_rate": 7.692710974891041e-07, + "loss": 5.2031, + "step": 1113 + }, + { + "epoch": 0.7465549738219895, + "grad_norm": 2.2017242908477783, + "learning_rate": 7.654377955718084e-07, + "loss": 5.0217, + "step": 1114 + }, + { + "epoch": 0.7472251308900524, + "grad_norm": 2.8049333095550537, + "learning_rate": 7.616123411029367e-07, + "loss": 5.5266, + "step": 1115 + }, + { + "epoch": 0.7478952879581152, + "grad_norm": 2.2660751342773438, + "learning_rate": 7.577947513894829e-07, + "loss": 5.1222, + "step": 1116 + }, + { + "epoch": 0.7485654450261781, + "grad_norm": 2.282757520675659, + "learning_rate": 7.539850437028584e-07, + "loss": 4.9671, + "step": 1117 + }, + { + "epoch": 0.7492356020942408, + "grad_norm": 2.5191409587860107, + "learning_rate": 7.501832352788147e-07, + "loss": 5.0362, + "step": 1118 + }, + { + "epoch": 0.7499057591623036, + "grad_norm": 2.01081919670105, + "learning_rate": 7.463893433173669e-07, + "loss": 5.1149, + "step": 1119 + }, + { + "epoch": 0.7505759162303665, + "grad_norm": 2.890888214111328, + "learning_rate": 7.426033849827149e-07, + "loss": 4.9724, + "step": 1120 + }, + { + "epoch": 0.7512460732984293, + "grad_norm": 2.2375943660736084, + "learning_rate": 7.388253774031659e-07, + "loss": 5.0092, + "step": 1121 + }, + { + "epoch": 0.7519162303664921, + "grad_norm": 2.171166181564331, + "learning_rate": 7.350553376710534e-07, + "loss": 5.2243, + "step": 1122 + }, + { + "epoch": 0.752586387434555, + "grad_norm": 2.401020050048828, + "learning_rate": 7.312932828426677e-07, + "loss": 5.0838, + "step": 1123 + }, + { + "epoch": 0.7532565445026178, + "grad_norm": 2.0519063472747803, + "learning_rate": 7.275392299381695e-07, + "loss": 4.9773, + "step": 1124 + }, + { + "epoch": 0.7539267015706806, + "grad_norm": 2.1711297035217285, + "learning_rate": 7.237931959415207e-07, + "loss": 4.922, + "step": 1125 + }, + { + "epoch": 0.7545968586387435, + "grad_norm": 2.1511411666870117, + "learning_rate": 7.200551978004039e-07, + "loss": 5.2377, + "step": 1126 + }, + { + "epoch": 0.7552670157068063, + "grad_norm": 2.283114433288574, + "learning_rate": 7.163252524261443e-07, + "loss": 5.052, + "step": 1127 + }, + { + "epoch": 0.7559371727748692, + "grad_norm": 2.2724831104278564, + "learning_rate": 7.126033766936366e-07, + "loss": 4.9807, + "step": 1128 + }, + { + "epoch": 0.7566073298429319, + "grad_norm": 2.3890180587768555, + "learning_rate": 7.088895874412668e-07, + "loss": 5.1586, + "step": 1129 + }, + { + "epoch": 0.7572774869109947, + "grad_norm": 2.5954933166503906, + "learning_rate": 7.051839014708372e-07, + "loss": 5.064, + "step": 1130 + }, + { + "epoch": 0.7579476439790576, + "grad_norm": 2.3010849952697754, + "learning_rate": 7.01486335547488e-07, + "loss": 4.944, + "step": 1131 + }, + { + "epoch": 0.7586178010471204, + "grad_norm": 1.916985034942627, + "learning_rate": 6.977969063996223e-07, + "loss": 4.9986, + "step": 1132 + }, + { + "epoch": 0.7592879581151832, + "grad_norm": 2.1439950466156006, + "learning_rate": 6.941156307188332e-07, + "loss": 5.0494, + "step": 1133 + }, + { + "epoch": 0.7599581151832461, + "grad_norm": 2.3296618461608887, + "learning_rate": 6.904425251598254e-07, + "loss": 5.1706, + "step": 1134 + }, + { + "epoch": 0.7606282722513089, + "grad_norm": 2.1923646926879883, + "learning_rate": 6.867776063403411e-07, + "loss": 4.9433, + "step": 1135 + }, + { + "epoch": 0.7612984293193717, + "grad_norm": 2.6342825889587402, + "learning_rate": 6.831208908410825e-07, + "loss": 5.0178, + "step": 1136 + }, + { + "epoch": 0.7619685863874346, + "grad_norm": 2.1129515171051025, + "learning_rate": 6.794723952056406e-07, + "loss": 5.5518, + "step": 1137 + }, + { + "epoch": 0.7626387434554974, + "grad_norm": 2.1442720890045166, + "learning_rate": 6.758321359404166e-07, + "loss": 4.8999, + "step": 1138 + }, + { + "epoch": 0.7633089005235603, + "grad_norm": 2.1809842586517334, + "learning_rate": 6.722001295145509e-07, + "loss": 4.9485, + "step": 1139 + }, + { + "epoch": 0.763979057591623, + "grad_norm": 2.5500996112823486, + "learning_rate": 6.685763923598457e-07, + "loss": 5.2035, + "step": 1140 + }, + { + "epoch": 0.7646492146596858, + "grad_norm": 2.273024320602417, + "learning_rate": 6.649609408706909e-07, + "loss": 5.169, + "step": 1141 + }, + { + "epoch": 0.7653193717277487, + "grad_norm": 2.008328676223755, + "learning_rate": 6.613537914039916e-07, + "loss": 4.6944, + "step": 1142 + }, + { + "epoch": 0.7659895287958115, + "grad_norm": 1.9586178064346313, + "learning_rate": 6.577549602790936e-07, + "loss": 4.9895, + "step": 1143 + }, + { + "epoch": 0.7666596858638743, + "grad_norm": 2.1195785999298096, + "learning_rate": 6.541644637777089e-07, + "loss": 5.0358, + "step": 1144 + }, + { + "epoch": 0.7673298429319372, + "grad_norm": 1.9128762483596802, + "learning_rate": 6.505823181438414e-07, + "loss": 4.7646, + "step": 1145 + }, + { + "epoch": 0.768, + "grad_norm": 2.1416807174682617, + "learning_rate": 6.470085395837139e-07, + "loss": 4.8051, + "step": 1146 + }, + { + "epoch": 0.7686701570680629, + "grad_norm": 2.2263646125793457, + "learning_rate": 6.434431442656966e-07, + "loss": 4.9021, + "step": 1147 + }, + { + "epoch": 0.7693403141361257, + "grad_norm": 2.5426077842712402, + "learning_rate": 6.398861483202326e-07, + "loss": 4.9737, + "step": 1148 + }, + { + "epoch": 0.7700104712041885, + "grad_norm": 1.969666600227356, + "learning_rate": 6.363375678397638e-07, + "loss": 4.8842, + "step": 1149 + }, + { + "epoch": 0.7706806282722513, + "grad_norm": 1.8267338275909424, + "learning_rate": 6.3279741887866e-07, + "loss": 5.1273, + "step": 1150 + }, + { + "epoch": 0.7713507853403141, + "grad_norm": 2.2450010776519775, + "learning_rate": 6.292657174531449e-07, + "loss": 4.918, + "step": 1151 + }, + { + "epoch": 0.7720209424083769, + "grad_norm": 2.3020267486572266, + "learning_rate": 6.257424795412229e-07, + "loss": 5.1702, + "step": 1152 + }, + { + "epoch": 0.7726910994764398, + "grad_norm": 2.17891788482666, + "learning_rate": 6.222277210826105e-07, + "loss": 4.9306, + "step": 1153 + }, + { + "epoch": 0.7733612565445026, + "grad_norm": 2.244767904281616, + "learning_rate": 6.187214579786602e-07, + "loss": 5.2499, + "step": 1154 + }, + { + "epoch": 0.7740314136125654, + "grad_norm": 2.6995861530303955, + "learning_rate": 6.152237060922916e-07, + "loss": 4.4595, + "step": 1155 + }, + { + "epoch": 0.7747015706806283, + "grad_norm": 2.22466778755188, + "learning_rate": 6.117344812479154e-07, + "loss": 4.7913, + "step": 1156 + }, + { + "epoch": 0.7753717277486911, + "grad_norm": 2.3127236366271973, + "learning_rate": 6.082537992313667e-07, + "loss": 5.1861, + "step": 1157 + }, + { + "epoch": 0.776041884816754, + "grad_norm": 2.3279953002929688, + "learning_rate": 6.04781675789832e-07, + "loss": 4.8803, + "step": 1158 + }, + { + "epoch": 0.7767120418848168, + "grad_norm": 1.9932987689971924, + "learning_rate": 6.013181266317747e-07, + "loss": 4.8846, + "step": 1159 + }, + { + "epoch": 0.7773821989528796, + "grad_norm": 2.2041752338409424, + "learning_rate": 5.978631674268693e-07, + "loss": 5.0406, + "step": 1160 + }, + { + "epoch": 0.7780523560209424, + "grad_norm": 2.7398922443389893, + "learning_rate": 5.944168138059261e-07, + "loss": 4.8435, + "step": 1161 + }, + { + "epoch": 0.7787225130890052, + "grad_norm": 2.2988779544830322, + "learning_rate": 5.909790813608235e-07, + "loss": 4.9794, + "step": 1162 + }, + { + "epoch": 0.779392670157068, + "grad_norm": 2.4860384464263916, + "learning_rate": 5.875499856444358e-07, + "loss": 4.9771, + "step": 1163 + }, + { + "epoch": 0.7800628272251309, + "grad_norm": 2.2307345867156982, + "learning_rate": 5.841295421705642e-07, + "loss": 5.1321, + "step": 1164 + }, + { + "epoch": 0.7807329842931937, + "grad_norm": 1.9979619979858398, + "learning_rate": 5.807177664138644e-07, + "loss": 5.0066, + "step": 1165 + }, + { + "epoch": 0.7814031413612565, + "grad_norm": 2.7662293910980225, + "learning_rate": 5.773146738097773e-07, + "loss": 5.1328, + "step": 1166 + }, + { + "epoch": 0.7820732984293194, + "grad_norm": 1.8403937816619873, + "learning_rate": 5.739202797544619e-07, + "loss": 5.1026, + "step": 1167 + }, + { + "epoch": 0.7827434554973822, + "grad_norm": 2.0250093936920166, + "learning_rate": 5.70534599604722e-07, + "loss": 5.0364, + "step": 1168 + }, + { + "epoch": 0.7834136125654451, + "grad_norm": 2.4427311420440674, + "learning_rate": 5.671576486779398e-07, + "loss": 5.0155, + "step": 1169 + }, + { + "epoch": 0.7840837696335079, + "grad_norm": 2.440207004547119, + "learning_rate": 5.637894422520027e-07, + "loss": 5.0432, + "step": 1170 + }, + { + "epoch": 0.7847539267015706, + "grad_norm": 2.796550750732422, + "learning_rate": 5.604299955652381e-07, + "loss": 5.1718, + "step": 1171 + }, + { + "epoch": 0.7854240837696335, + "grad_norm": 1.7754937410354614, + "learning_rate": 5.570793238163436e-07, + "loss": 5.0406, + "step": 1172 + }, + { + "epoch": 0.7860942408376963, + "grad_norm": 2.1862494945526123, + "learning_rate": 5.537374421643146e-07, + "loss": 4.9814, + "step": 1173 + }, + { + "epoch": 0.7867643979057591, + "grad_norm": 2.3494925498962402, + "learning_rate": 5.504043657283823e-07, + "loss": 5.0955, + "step": 1174 + }, + { + "epoch": 0.787434554973822, + "grad_norm": 2.3137881755828857, + "learning_rate": 5.470801095879382e-07, + "loss": 5.4067, + "step": 1175 + }, + { + "epoch": 0.7881047120418848, + "grad_norm": 2.568342685699463, + "learning_rate": 5.437646887824721e-07, + "loss": 5.2062, + "step": 1176 + }, + { + "epoch": 0.7887748691099477, + "grad_norm": 2.170259952545166, + "learning_rate": 5.404581183114998e-07, + "loss": 5.1373, + "step": 1177 + }, + { + "epoch": 0.7894450261780105, + "grad_norm": 2.1512749195098877, + "learning_rate": 5.371604131344979e-07, + "loss": 5.1238, + "step": 1178 + }, + { + "epoch": 0.7901151832460733, + "grad_norm": 2.1920998096466064, + "learning_rate": 5.338715881708331e-07, + "loss": 4.895, + "step": 1179 + }, + { + "epoch": 0.7907853403141362, + "grad_norm": 2.220663070678711, + "learning_rate": 5.30591658299697e-07, + "loss": 5.0831, + "step": 1180 + }, + { + "epoch": 0.791455497382199, + "grad_norm": 1.8620455265045166, + "learning_rate": 5.273206383600396e-07, + "loss": 4.9863, + "step": 1181 + }, + { + "epoch": 0.7921256544502617, + "grad_norm": 1.9880365133285522, + "learning_rate": 5.240585431505002e-07, + "loss": 4.5929, + "step": 1182 + }, + { + "epoch": 0.7927958115183246, + "grad_norm": 2.086104154586792, + "learning_rate": 5.208053874293406e-07, + "loss": 4.9963, + "step": 1183 + }, + { + "epoch": 0.7934659685863874, + "grad_norm": 2.240558624267578, + "learning_rate": 5.175611859143806e-07, + "loss": 4.7543, + "step": 1184 + }, + { + "epoch": 0.7941361256544502, + "grad_norm": 2.365817070007324, + "learning_rate": 5.143259532829267e-07, + "loss": 5.3058, + "step": 1185 + }, + { + "epoch": 0.7948062827225131, + "grad_norm": 2.296628475189209, + "learning_rate": 5.110997041717116e-07, + "loss": 5.0126, + "step": 1186 + }, + { + "epoch": 0.7954764397905759, + "grad_norm": 2.3187358379364014, + "learning_rate": 5.078824531768228e-07, + "loss": 4.8611, + "step": 1187 + }, + { + "epoch": 0.7961465968586388, + "grad_norm": 2.1453304290771484, + "learning_rate": 5.046742148536404e-07, + "loss": 4.8574, + "step": 1188 + }, + { + "epoch": 0.7968167539267016, + "grad_norm": 2.0868799686431885, + "learning_rate": 5.014750037167698e-07, + "loss": 5.1505, + "step": 1189 + }, + { + "epoch": 0.7974869109947644, + "grad_norm": 1.784549593925476, + "learning_rate": 4.982848342399741e-07, + "loss": 4.8991, + "step": 1190 + }, + { + "epoch": 0.7981570680628273, + "grad_norm": 1.9321773052215576, + "learning_rate": 4.951037208561116e-07, + "loss": 4.928, + "step": 1191 + }, + { + "epoch": 0.7988272251308901, + "grad_norm": 2.221848726272583, + "learning_rate": 4.919316779570707e-07, + "loss": 5.1757, + "step": 1192 + }, + { + "epoch": 0.7994973821989528, + "grad_norm": 2.8202672004699707, + "learning_rate": 4.887687198937002e-07, + "loss": 5.0547, + "step": 1193 + }, + { + "epoch": 0.8001675392670157, + "grad_norm": 2.555051803588867, + "learning_rate": 4.856148609757508e-07, + "loss": 5.2369, + "step": 1194 + }, + { + "epoch": 0.8008376963350785, + "grad_norm": 1.9884748458862305, + "learning_rate": 4.824701154718045e-07, + "loss": 5.1295, + "step": 1195 + }, + { + "epoch": 0.8015078534031413, + "grad_norm": 2.368027925491333, + "learning_rate": 4.793344976092146e-07, + "loss": 4.9066, + "step": 1196 + }, + { + "epoch": 0.8021780104712042, + "grad_norm": 2.2029364109039307, + "learning_rate": 4.762080215740386e-07, + "loss": 5.1582, + "step": 1197 + }, + { + "epoch": 0.802848167539267, + "grad_norm": 2.4221737384796143, + "learning_rate": 4.730907015109759e-07, + "loss": 5.1814, + "step": 1198 + }, + { + "epoch": 0.8035183246073299, + "grad_norm": 3.0611066818237305, + "learning_rate": 4.699825515233003e-07, + "loss": 5.1989, + "step": 1199 + }, + { + "epoch": 0.8041884816753927, + "grad_norm": 1.8770856857299805, + "learning_rate": 4.6688358567280186e-07, + "loss": 4.8958, + "step": 1200 + }, + { + "epoch": 0.8048586387434555, + "grad_norm": 2.4339778423309326, + "learning_rate": 4.637938179797166e-07, + "loss": 5.1851, + "step": 1201 + }, + { + "epoch": 0.8055287958115184, + "grad_norm": 2.19170880317688, + "learning_rate": 4.6071326242266984e-07, + "loss": 5.1224, + "step": 1202 + }, + { + "epoch": 0.8061989528795811, + "grad_norm": 2.668105125427246, + "learning_rate": 4.576419329386081e-07, + "loss": 5.1409, + "step": 1203 + }, + { + "epoch": 0.8068691099476439, + "grad_norm": 2.373500108718872, + "learning_rate": 4.545798434227366e-07, + "loss": 5.1758, + "step": 1204 + }, + { + "epoch": 0.8075392670157068, + "grad_norm": 2.703608989715576, + "learning_rate": 4.5152700772845947e-07, + "loss": 5.1138, + "step": 1205 + }, + { + "epoch": 0.8082094240837696, + "grad_norm": 2.1617355346679688, + "learning_rate": 4.484834396673146e-07, + "loss": 5.1007, + "step": 1206 + }, + { + "epoch": 0.8088795811518324, + "grad_norm": 2.0932817459106445, + "learning_rate": 4.4544915300890915e-07, + "loss": 5.0408, + "step": 1207 + }, + { + "epoch": 0.8095497382198953, + "grad_norm": 2.0439183712005615, + "learning_rate": 4.424241614808636e-07, + "loss": 5.0576, + "step": 1208 + }, + { + "epoch": 0.8102198952879581, + "grad_norm": 2.4854025840759277, + "learning_rate": 4.3940847876874193e-07, + "loss": 4.9091, + "step": 1209 + }, + { + "epoch": 0.810890052356021, + "grad_norm": 1.8081549406051636, + "learning_rate": 4.3640211851599614e-07, + "loss": 5.0549, + "step": 1210 + }, + { + "epoch": 0.8115602094240838, + "grad_norm": 2.7424466609954834, + "learning_rate": 4.3340509432390043e-07, + "loss": 5.025, + "step": 1211 + }, + { + "epoch": 0.8122303664921466, + "grad_norm": 1.8654872179031372, + "learning_rate": 4.3041741975149294e-07, + "loss": 5.0045, + "step": 1212 + }, + { + "epoch": 0.8129005235602095, + "grad_norm": 2.3444607257843018, + "learning_rate": 4.274391083155091e-07, + "loss": 4.8565, + "step": 1213 + }, + { + "epoch": 0.8135706806282722, + "grad_norm": 2.078972339630127, + "learning_rate": 4.244701734903281e-07, + "loss": 5.0893, + "step": 1214 + }, + { + "epoch": 0.814240837696335, + "grad_norm": 1.8032852411270142, + "learning_rate": 4.215106287079043e-07, + "loss": 4.9006, + "step": 1215 + }, + { + "epoch": 0.8149109947643979, + "grad_norm": 2.370866060256958, + "learning_rate": 4.185604873577118e-07, + "loss": 4.6488, + "step": 1216 + }, + { + "epoch": 0.8155811518324607, + "grad_norm": 5.689456939697266, + "learning_rate": 4.1561976278668174e-07, + "loss": 4.8656, + "step": 1217 + }, + { + "epoch": 0.8162513089005236, + "grad_norm": 2.522265672683716, + "learning_rate": 4.1268846829914305e-07, + "loss": 5.106, + "step": 1218 + }, + { + "epoch": 0.8169214659685864, + "grad_norm": 2.0813515186309814, + "learning_rate": 4.0976661715675854e-07, + "loss": 5.1492, + "step": 1219 + }, + { + "epoch": 0.8175916230366492, + "grad_norm": 2.3402888774871826, + "learning_rate": 4.0685422257847134e-07, + "loss": 5.0009, + "step": 1220 + }, + { + "epoch": 0.8182617801047121, + "grad_norm": 2.1948485374450684, + "learning_rate": 4.0395129774043836e-07, + "loss": 4.7593, + "step": 1221 + }, + { + "epoch": 0.8189319371727749, + "grad_norm": 2.4026002883911133, + "learning_rate": 4.010578557759759e-07, + "loss": 4.9038, + "step": 1222 + }, + { + "epoch": 0.8196020942408377, + "grad_norm": 2.156548500061035, + "learning_rate": 3.98173909775498e-07, + "loss": 4.9309, + "step": 1223 + }, + { + "epoch": 0.8202722513089005, + "grad_norm": 2.5898830890655518, + "learning_rate": 3.9529947278645563e-07, + "loss": 4.859, + "step": 1224 + }, + { + "epoch": 0.8209424083769633, + "grad_norm": 2.2762200832366943, + "learning_rate": 3.9243455781328153e-07, + "loss": 4.9816, + "step": 1225 + }, + { + "epoch": 0.8216125654450261, + "grad_norm": 2.2160186767578125, + "learning_rate": 3.8957917781732883e-07, + "loss": 5.0222, + "step": 1226 + }, + { + "epoch": 0.822282722513089, + "grad_norm": 1.7954459190368652, + "learning_rate": 3.867333457168115e-07, + "loss": 4.9616, + "step": 1227 + }, + { + "epoch": 0.8229528795811518, + "grad_norm": 2.0955536365509033, + "learning_rate": 3.8389707438674875e-07, + "loss": 5.2389, + "step": 1228 + }, + { + "epoch": 0.8236230366492147, + "grad_norm": 2.1967079639434814, + "learning_rate": 3.8107037665890386e-07, + "loss": 5.0087, + "step": 1229 + }, + { + "epoch": 0.8242931937172775, + "grad_norm": 2.6518898010253906, + "learning_rate": 3.7825326532172887e-07, + "loss": 4.9795, + "step": 1230 + }, + { + "epoch": 0.8249633507853403, + "grad_norm": 2.064209461212158, + "learning_rate": 3.754457531203048e-07, + "loss": 5.0624, + "step": 1231 + }, + { + "epoch": 0.8256335078534032, + "grad_norm": 1.7200958728790283, + "learning_rate": 3.726478527562846e-07, + "loss": 4.7556, + "step": 1232 + }, + { + "epoch": 0.826303664921466, + "grad_norm": 2.319408655166626, + "learning_rate": 3.698595768878363e-07, + "loss": 4.7299, + "step": 1233 + }, + { + "epoch": 0.8269738219895288, + "grad_norm": 2.6238155364990234, + "learning_rate": 3.6708093812958396e-07, + "loss": 5.0846, + "step": 1234 + }, + { + "epoch": 0.8276439790575916, + "grad_norm": 2.5130534172058105, + "learning_rate": 3.643119490525518e-07, + "loss": 5.0706, + "step": 1235 + }, + { + "epoch": 0.8283141361256544, + "grad_norm": 2.467212438583374, + "learning_rate": 3.615526221841084e-07, + "loss": 4.9499, + "step": 1236 + }, + { + "epoch": 0.8289842931937172, + "grad_norm": 2.0987708568573, + "learning_rate": 3.5880297000790847e-07, + "loss": 5.1668, + "step": 1237 + }, + { + "epoch": 0.8296544502617801, + "grad_norm": 2.1706671714782715, + "learning_rate": 3.5606300496383584e-07, + "loss": 4.9499, + "step": 1238 + }, + { + "epoch": 0.8303246073298429, + "grad_norm": 2.0631673336029053, + "learning_rate": 3.533327394479491e-07, + "loss": 4.9016, + "step": 1239 + }, + { + "epoch": 0.8309947643979058, + "grad_norm": 2.171215772628784, + "learning_rate": 3.506121858124253e-07, + "loss": 5.0346, + "step": 1240 + }, + { + "epoch": 0.8316649214659686, + "grad_norm": 2.081542730331421, + "learning_rate": 3.479013563655012e-07, + "loss": 4.8801, + "step": 1241 + }, + { + "epoch": 0.8323350785340314, + "grad_norm": 2.2110610008239746, + "learning_rate": 3.452002633714219e-07, + "loss": 5.0428, + "step": 1242 + }, + { + "epoch": 0.8330052356020943, + "grad_norm": 1.9400110244750977, + "learning_rate": 3.425089190503808e-07, + "loss": 4.9467, + "step": 1243 + }, + { + "epoch": 0.8336753926701571, + "grad_norm": 2.2724132537841797, + "learning_rate": 3.39827335578469e-07, + "loss": 4.9223, + "step": 1244 + }, + { + "epoch": 0.83434554973822, + "grad_norm": 1.9748417139053345, + "learning_rate": 3.371555250876168e-07, + "loss": 5.0014, + "step": 1245 + }, + { + "epoch": 0.8350157068062827, + "grad_norm": 2.634237766265869, + "learning_rate": 3.3449349966553995e-07, + "loss": 4.7432, + "step": 1246 + }, + { + "epoch": 0.8356858638743455, + "grad_norm": 2.3327419757843018, + "learning_rate": 3.3184127135568596e-07, + "loss": 5.1241, + "step": 1247 + }, + { + "epoch": 0.8363560209424084, + "grad_norm": 2.305849075317383, + "learning_rate": 3.2919885215717626e-07, + "loss": 4.9703, + "step": 1248 + }, + { + "epoch": 0.8370261780104712, + "grad_norm": 2.3055591583251953, + "learning_rate": 3.2656625402475576e-07, + "loss": 5.0756, + "step": 1249 + }, + { + "epoch": 0.837696335078534, + "grad_norm": 2.3185768127441406, + "learning_rate": 3.2394348886873735e-07, + "loss": 5.052, + "step": 1250 + }, + { + "epoch": 0.8383664921465969, + "grad_norm": 2.2942299842834473, + "learning_rate": 3.2133056855494753e-07, + "loss": 5.14, + "step": 1251 + }, + { + "epoch": 0.8390366492146597, + "grad_norm": 2.267603874206543, + "learning_rate": 3.187275049046737e-07, + "loss": 5.2036, + "step": 1252 + }, + { + "epoch": 0.8397068062827225, + "grad_norm": 2.073228120803833, + "learning_rate": 3.1613430969460844e-07, + "loss": 4.7733, + "step": 1253 + }, + { + "epoch": 0.8403769633507854, + "grad_norm": 2.136348009109497, + "learning_rate": 3.135509946568002e-07, + "loss": 5.0415, + "step": 1254 + }, + { + "epoch": 0.8410471204188482, + "grad_norm": 2.260221481323242, + "learning_rate": 3.1097757147859566e-07, + "loss": 5.1981, + "step": 1255 + }, + { + "epoch": 0.8417172774869109, + "grad_norm": 2.2212343215942383, + "learning_rate": 3.084140518025902e-07, + "loss": 4.9098, + "step": 1256 + }, + { + "epoch": 0.8423874345549738, + "grad_norm": 2.0503504276275635, + "learning_rate": 3.058604472265747e-07, + "loss": 5.1323, + "step": 1257 + }, + { + "epoch": 0.8430575916230366, + "grad_norm": 2.308495044708252, + "learning_rate": 3.033167693034808e-07, + "loss": 4.7553, + "step": 1258 + }, + { + "epoch": 0.8437277486910995, + "grad_norm": 2.055201768875122, + "learning_rate": 3.0078302954133166e-07, + "loss": 5.0294, + "step": 1259 + }, + { + "epoch": 0.8443979057591623, + "grad_norm": 2.3220279216766357, + "learning_rate": 2.98259239403188e-07, + "loss": 4.8975, + "step": 1260 + }, + { + "epoch": 0.8450680628272251, + "grad_norm": 1.8509376049041748, + "learning_rate": 2.957454103070978e-07, + "loss": 5.0668, + "step": 1261 + }, + { + "epoch": 0.845738219895288, + "grad_norm": 2.187986373901367, + "learning_rate": 2.932415536260419e-07, + "loss": 4.8605, + "step": 1262 + }, + { + "epoch": 0.8464083769633508, + "grad_norm": 2.2834601402282715, + "learning_rate": 2.907476806878845e-07, + "loss": 5.3201, + "step": 1263 + }, + { + "epoch": 0.8470785340314136, + "grad_norm": 2.043067216873169, + "learning_rate": 2.882638027753232e-07, + "loss": 4.841, + "step": 1264 + }, + { + "epoch": 0.8477486910994765, + "grad_norm": 2.645731210708618, + "learning_rate": 2.8578993112583525e-07, + "loss": 5.2072, + "step": 1265 + }, + { + "epoch": 0.8484188481675393, + "grad_norm": 2.133944511413574, + "learning_rate": 2.8332607693162845e-07, + "loss": 4.9593, + "step": 1266 + }, + { + "epoch": 0.849089005235602, + "grad_norm": 2.55283784866333, + "learning_rate": 2.808722513395901e-07, + "loss": 4.985, + "step": 1267 + }, + { + "epoch": 0.8497591623036649, + "grad_norm": 2.150108575820923, + "learning_rate": 2.784284654512351e-07, + "loss": 5.5307, + "step": 1268 + }, + { + "epoch": 0.8504293193717277, + "grad_norm": 2.09555721282959, + "learning_rate": 2.7599473032265817e-07, + "loss": 5.1041, + "step": 1269 + }, + { + "epoch": 0.8510994764397906, + "grad_norm": 2.0397253036499023, + "learning_rate": 2.7357105696448254e-07, + "loss": 5.3425, + "step": 1270 + }, + { + "epoch": 0.8517696335078534, + "grad_norm": 2.5002498626708984, + "learning_rate": 2.7115745634181097e-07, + "loss": 5.1043, + "step": 1271 + }, + { + "epoch": 0.8524397905759162, + "grad_norm": 2.1711158752441406, + "learning_rate": 2.687539393741739e-07, + "loss": 4.9387, + "step": 1272 + }, + { + "epoch": 0.8531099476439791, + "grad_norm": 1.9462813138961792, + "learning_rate": 2.663605169354833e-07, + "loss": 4.9784, + "step": 1273 + }, + { + "epoch": 0.8537801047120419, + "grad_norm": 2.0433311462402344, + "learning_rate": 2.639771998539806e-07, + "loss": 4.9194, + "step": 1274 + }, + { + "epoch": 0.8544502617801047, + "grad_norm": 2.780001640319824, + "learning_rate": 2.616039989121899e-07, + "loss": 5.0706, + "step": 1275 + }, + { + "epoch": 0.8551204188481676, + "grad_norm": 2.471343517303467, + "learning_rate": 2.5924092484686746e-07, + "loss": 5.118, + "step": 1276 + }, + { + "epoch": 0.8557905759162303, + "grad_norm": 2.0888404846191406, + "learning_rate": 2.5688798834895335e-07, + "loss": 5.0195, + "step": 1277 + }, + { + "epoch": 0.8564607329842931, + "grad_norm": 1.878853440284729, + "learning_rate": 2.545452000635251e-07, + "loss": 5.0919, + "step": 1278 + }, + { + "epoch": 0.857130890052356, + "grad_norm": 1.875441312789917, + "learning_rate": 2.522125705897471e-07, + "loss": 4.9351, + "step": 1279 + }, + { + "epoch": 0.8578010471204188, + "grad_norm": 2.194225788116455, + "learning_rate": 2.498901104808241e-07, + "loss": 5.1229, + "step": 1280 + }, + { + "epoch": 0.8584712041884817, + "grad_norm": 2.382415294647217, + "learning_rate": 2.4757783024395244e-07, + "loss": 5.056, + "step": 1281 + }, + { + "epoch": 0.8591413612565445, + "grad_norm": 2.61612868309021, + "learning_rate": 2.452757403402728e-07, + "loss": 4.9424, + "step": 1282 + }, + { + "epoch": 0.8598115183246073, + "grad_norm": 2.4941701889038086, + "learning_rate": 2.4298385118482345e-07, + "loss": 4.8454, + "step": 1283 + }, + { + "epoch": 0.8604816753926702, + "grad_norm": 2.6090404987335205, + "learning_rate": 2.4070217314649316e-07, + "loss": 5.3559, + "step": 1284 + }, + { + "epoch": 0.861151832460733, + "grad_norm": 2.0773582458496094, + "learning_rate": 2.3843071654797325e-07, + "loss": 5.2277, + "step": 1285 + }, + { + "epoch": 0.8618219895287959, + "grad_norm": 2.038578510284424, + "learning_rate": 2.3616949166571256e-07, + "loss": 5.1219, + "step": 1286 + }, + { + "epoch": 0.8624921465968587, + "grad_norm": 2.4837217330932617, + "learning_rate": 2.3391850872986815e-07, + "loss": 5.0083, + "step": 1287 + }, + { + "epoch": 0.8631623036649214, + "grad_norm": 2.193636655807495, + "learning_rate": 2.3167777792426237e-07, + "loss": 5.0886, + "step": 1288 + }, + { + "epoch": 0.8638324607329843, + "grad_norm": 2.2123429775238037, + "learning_rate": 2.294473093863353e-07, + "loss": 5.1775, + "step": 1289 + }, + { + "epoch": 0.8645026178010471, + "grad_norm": 2.225257158279419, + "learning_rate": 2.2722711320709745e-07, + "loss": 4.8214, + "step": 1290 + }, + { + "epoch": 0.8651727748691099, + "grad_norm": 2.601940870285034, + "learning_rate": 2.250171994310868e-07, + "loss": 5.1491, + "step": 1291 + }, + { + "epoch": 0.8658429319371728, + "grad_norm": 2.152582883834839, + "learning_rate": 2.2281757805632076e-07, + "loss": 5.1693, + "step": 1292 + }, + { + "epoch": 0.8665130890052356, + "grad_norm": 2.2037830352783203, + "learning_rate": 2.2062825903425316e-07, + "loss": 5.0747, + "step": 1293 + }, + { + "epoch": 0.8671832460732984, + "grad_norm": 2.5105886459350586, + "learning_rate": 2.1844925226972847e-07, + "loss": 5.0326, + "step": 1294 + }, + { + "epoch": 0.8678534031413613, + "grad_norm": 2.2535853385925293, + "learning_rate": 2.1628056762093674e-07, + "loss": 4.9301, + "step": 1295 + }, + { + "epoch": 0.8685235602094241, + "grad_norm": 2.282524824142456, + "learning_rate": 2.1412221489936796e-07, + "loss": 4.9776, + "step": 1296 + }, + { + "epoch": 0.869193717277487, + "grad_norm": 2.4529056549072266, + "learning_rate": 2.119742038697689e-07, + "loss": 5.0795, + "step": 1297 + }, + { + "epoch": 0.8698638743455498, + "grad_norm": 2.4816536903381348, + "learning_rate": 2.098365442501002e-07, + "loss": 5.0823, + "step": 1298 + }, + { + "epoch": 0.8705340314136125, + "grad_norm": 2.298016309738159, + "learning_rate": 2.0770924571149014e-07, + "loss": 5.1371, + "step": 1299 + }, + { + "epoch": 0.8712041884816754, + "grad_norm": 1.9586299657821655, + "learning_rate": 2.0559231787819128e-07, + "loss": 4.8267, + "step": 1300 + }, + { + "epoch": 0.8718743455497382, + "grad_norm": 2.62386155128479, + "learning_rate": 2.0348577032753892e-07, + "loss": 4.8385, + "step": 1301 + }, + { + "epoch": 0.872544502617801, + "grad_norm": 1.8399372100830078, + "learning_rate": 2.0138961258990397e-07, + "loss": 5.1666, + "step": 1302 + }, + { + "epoch": 0.8732146596858639, + "grad_norm": 2.1892104148864746, + "learning_rate": 1.9930385414865388e-07, + "loss": 5.0949, + "step": 1303 + }, + { + "epoch": 0.8738848167539267, + "grad_norm": 2.136589527130127, + "learning_rate": 1.9722850444010704e-07, + "loss": 4.9039, + "step": 1304 + }, + { + "epoch": 0.8745549738219895, + "grad_norm": 2.5759475231170654, + "learning_rate": 1.9516357285349202e-07, + "loss": 5.0665, + "step": 1305 + }, + { + "epoch": 0.8752251308900524, + "grad_norm": 2.103248119354248, + "learning_rate": 1.931090687309023e-07, + "loss": 4.8426, + "step": 1306 + }, + { + "epoch": 0.8758952879581152, + "grad_norm": 2.2694711685180664, + "learning_rate": 1.9106500136725742e-07, + "loss": 5.2192, + "step": 1307 + }, + { + "epoch": 0.8765654450261781, + "grad_norm": 1.8070567846298218, + "learning_rate": 1.890313800102589e-07, + "loss": 5.0335, + "step": 1308 + }, + { + "epoch": 0.8772356020942408, + "grad_norm": 1.9571316242218018, + "learning_rate": 1.870082138603485e-07, + "loss": 5.0606, + "step": 1309 + }, + { + "epoch": 0.8779057591623036, + "grad_norm": 2.1865131855010986, + "learning_rate": 1.849955120706673e-07, + "loss": 4.8543, + "step": 1310 + }, + { + "epoch": 0.8785759162303665, + "grad_norm": 2.1236016750335693, + "learning_rate": 1.8299328374701246e-07, + "loss": 4.9756, + "step": 1311 + }, + { + "epoch": 0.8792460732984293, + "grad_norm": 2.2098305225372314, + "learning_rate": 1.8100153794779946e-07, + "loss": 5.0787, + "step": 1312 + }, + { + "epoch": 0.8799162303664921, + "grad_norm": 2.4154696464538574, + "learning_rate": 1.790202836840177e-07, + "loss": 4.9587, + "step": 1313 + }, + { + "epoch": 0.880586387434555, + "grad_norm": 2.911160707473755, + "learning_rate": 1.7704952991919184e-07, + "loss": 4.8429, + "step": 1314 + }, + { + "epoch": 0.8812565445026178, + "grad_norm": 2.913119077682495, + "learning_rate": 1.7508928556934062e-07, + "loss": 5.0933, + "step": 1315 + }, + { + "epoch": 0.8819267015706806, + "grad_norm": 2.209881544113159, + "learning_rate": 1.731395595029353e-07, + "loss": 5.0659, + "step": 1316 + }, + { + "epoch": 0.8825968586387435, + "grad_norm": 1.6870235204696655, + "learning_rate": 1.712003605408624e-07, + "loss": 4.7823, + "step": 1317 + }, + { + "epoch": 0.8832670157068063, + "grad_norm": 2.3089218139648438, + "learning_rate": 1.6927169745638016e-07, + "loss": 4.9966, + "step": 1318 + }, + { + "epoch": 0.8839371727748692, + "grad_norm": 2.4036853313446045, + "learning_rate": 1.6735357897508216e-07, + "loss": 5.3279, + "step": 1319 + }, + { + "epoch": 0.8846073298429319, + "grad_norm": 2.5057334899902344, + "learning_rate": 1.6544601377485648e-07, + "loss": 5.2201, + "step": 1320 + }, + { + "epoch": 0.8852774869109947, + "grad_norm": 2.2471020221710205, + "learning_rate": 1.6354901048584477e-07, + "loss": 5.0172, + "step": 1321 + }, + { + "epoch": 0.8859476439790576, + "grad_norm": 2.294185161590576, + "learning_rate": 1.6166257769040712e-07, + "loss": 5.1012, + "step": 1322 + }, + { + "epoch": 0.8866178010471204, + "grad_norm": 2.232290267944336, + "learning_rate": 1.5978672392307935e-07, + "loss": 4.9362, + "step": 1323 + }, + { + "epoch": 0.8872879581151832, + "grad_norm": 3.2191405296325684, + "learning_rate": 1.5792145767053623e-07, + "loss": 4.772, + "step": 1324 + }, + { + "epoch": 0.8879581151832461, + "grad_norm": 2.6714560985565186, + "learning_rate": 1.560667873715538e-07, + "loss": 4.962, + "step": 1325 + }, + { + "epoch": 0.8886282722513089, + "grad_norm": 2.2959508895874023, + "learning_rate": 1.5422272141696866e-07, + "loss": 5.0707, + "step": 1326 + }, + { + "epoch": 0.8892984293193718, + "grad_norm": 2.202547073364258, + "learning_rate": 1.5238926814964276e-07, + "loss": 5.0551, + "step": 1327 + }, + { + "epoch": 0.8899685863874346, + "grad_norm": 2.037724256515503, + "learning_rate": 1.5056643586442403e-07, + "loss": 4.9772, + "step": 1328 + }, + { + "epoch": 0.8906387434554974, + "grad_norm": 2.147706985473633, + "learning_rate": 1.4875423280810986e-07, + "loss": 4.984, + "step": 1329 + }, + { + "epoch": 0.8913089005235602, + "grad_norm": 2.203908681869507, + "learning_rate": 1.4695266717940887e-07, + "loss": 4.8907, + "step": 1330 + }, + { + "epoch": 0.891979057591623, + "grad_norm": 2.007119655609131, + "learning_rate": 1.4516174712890406e-07, + "loss": 5.0029, + "step": 1331 + }, + { + "epoch": 0.8926492146596858, + "grad_norm": 2.464463949203491, + "learning_rate": 1.4338148075901558e-07, + "loss": 5.1036, + "step": 1332 + }, + { + "epoch": 0.8933193717277487, + "grad_norm": 2.6474461555480957, + "learning_rate": 1.4161187612396543e-07, + "loss": 4.9669, + "step": 1333 + }, + { + "epoch": 0.8939895287958115, + "grad_norm": 2.2933366298675537, + "learning_rate": 1.3985294122973974e-07, + "loss": 5.3269, + "step": 1334 + }, + { + "epoch": 0.8946596858638743, + "grad_norm": 2.7613108158111572, + "learning_rate": 1.3810468403405358e-07, + "loss": 4.5326, + "step": 1335 + }, + { + "epoch": 0.8953298429319372, + "grad_norm": 2.812737464904785, + "learning_rate": 1.3636711244631312e-07, + "loss": 4.9299, + "step": 1336 + }, + { + "epoch": 0.896, + "grad_norm": 2.1628639698028564, + "learning_rate": 1.346402343275824e-07, + "loss": 5.2307, + "step": 1337 + }, + { + "epoch": 0.8966701570680629, + "grad_norm": 2.1737706661224365, + "learning_rate": 1.329240574905452e-07, + "loss": 5.2518, + "step": 1338 + }, + { + "epoch": 0.8973403141361257, + "grad_norm": 2.3390467166900635, + "learning_rate": 1.3121858969947194e-07, + "loss": 5.104, + "step": 1339 + }, + { + "epoch": 0.8980104712041885, + "grad_norm": 2.431335210800171, + "learning_rate": 1.295238386701822e-07, + "loss": 5.0695, + "step": 1340 + }, + { + "epoch": 0.8986806282722513, + "grad_norm": 2.235074043273926, + "learning_rate": 1.2783981207001277e-07, + "loss": 4.969, + "step": 1341 + }, + { + "epoch": 0.8993507853403141, + "grad_norm": 2.1779608726501465, + "learning_rate": 1.2616651751778035e-07, + "loss": 5.036, + "step": 1342 + }, + { + "epoch": 0.9000209424083769, + "grad_norm": 2.16412353515625, + "learning_rate": 1.2450396258374854e-07, + "loss": 5.0072, + "step": 1343 + }, + { + "epoch": 0.9006910994764398, + "grad_norm": 2.2823896408081055, + "learning_rate": 1.2285215478959278e-07, + "loss": 4.9887, + "step": 1344 + }, + { + "epoch": 0.9013612565445026, + "grad_norm": 2.1547162532806396, + "learning_rate": 1.2121110160836697e-07, + "loss": 4.7598, + "step": 1345 + }, + { + "epoch": 0.9020314136125654, + "grad_norm": 2.2301156520843506, + "learning_rate": 1.1958081046446828e-07, + "loss": 4.9489, + "step": 1346 + }, + { + "epoch": 0.9027015706806283, + "grad_norm": 2.637312412261963, + "learning_rate": 1.1796128873360624e-07, + "loss": 5.1715, + "step": 1347 + }, + { + "epoch": 0.9033717277486911, + "grad_norm": 1.954455018043518, + "learning_rate": 1.1635254374276649e-07, + "loss": 4.8564, + "step": 1348 + }, + { + "epoch": 0.904041884816754, + "grad_norm": 2.5579347610473633, + "learning_rate": 1.1475458277018042e-07, + "loss": 4.9343, + "step": 1349 + }, + { + "epoch": 0.9047120418848168, + "grad_norm": 2.069652557373047, + "learning_rate": 1.1316741304528839e-07, + "loss": 5.1306, + "step": 1350 + }, + { + "epoch": 0.9053821989528796, + "grad_norm": 2.0289247035980225, + "learning_rate": 1.1159104174871177e-07, + "loss": 5.1948, + "step": 1351 + }, + { + "epoch": 0.9060523560209424, + "grad_norm": 2.011836528778076, + "learning_rate": 1.100254760122163e-07, + "loss": 5.0701, + "step": 1352 + }, + { + "epoch": 0.9067225130890052, + "grad_norm": 2.4269700050354004, + "learning_rate": 1.084707229186821e-07, + "loss": 5.0405, + "step": 1353 + }, + { + "epoch": 0.907392670157068, + "grad_norm": 1.9985160827636719, + "learning_rate": 1.069267895020723e-07, + "loss": 4.858, + "step": 1354 + }, + { + "epoch": 0.9080628272251309, + "grad_norm": 2.3926138877868652, + "learning_rate": 1.0539368274739753e-07, + "loss": 5.2444, + "step": 1355 + }, + { + "epoch": 0.9087329842931937, + "grad_norm": 2.214724540710449, + "learning_rate": 1.0387140959068898e-07, + "loss": 4.9224, + "step": 1356 + }, + { + "epoch": 0.9094031413612566, + "grad_norm": 2.054424285888672, + "learning_rate": 1.0235997691896398e-07, + "loss": 4.9698, + "step": 1357 + }, + { + "epoch": 0.9100732984293194, + "grad_norm": 2.2345471382141113, + "learning_rate": 1.0085939157019609e-07, + "loss": 5.2301, + "step": 1358 + }, + { + "epoch": 0.9107434554973822, + "grad_norm": 2.892404794692993, + "learning_rate": 9.936966033328277e-08, + "loss": 4.9301, + "step": 1359 + }, + { + "epoch": 0.9114136125654451, + "grad_norm": 2.082559823989868, + "learning_rate": 9.789078994801665e-08, + "loss": 5.0013, + "step": 1360 + }, + { + "epoch": 0.9120837696335079, + "grad_norm": 2.0129618644714355, + "learning_rate": 9.642278710505354e-08, + "loss": 4.9005, + "step": 1361 + }, + { + "epoch": 0.9127539267015706, + "grad_norm": 2.6280977725982666, + "learning_rate": 9.496565844588329e-08, + "loss": 5.0724, + "step": 1362 + }, + { + "epoch": 0.9134240837696335, + "grad_norm": 1.7958965301513672, + "learning_rate": 9.351941056279845e-08, + "loss": 5.0119, + "step": 1363 + }, + { + "epoch": 0.9140942408376963, + "grad_norm": 2.725520372390747, + "learning_rate": 9.208404999886594e-08, + "loss": 5.0868, + "step": 1364 + }, + { + "epoch": 0.9147643979057591, + "grad_norm": 2.4962949752807617, + "learning_rate": 9.065958324789542e-08, + "loss": 4.9649, + "step": 1365 + }, + { + "epoch": 0.915434554973822, + "grad_norm": 1.9726396799087524, + "learning_rate": 8.924601675441207e-08, + "loss": 4.7758, + "step": 1366 + }, + { + "epoch": 0.9161047120418848, + "grad_norm": 1.9447152614593506, + "learning_rate": 8.784335691362638e-08, + "loss": 5.0546, + "step": 1367 + }, + { + "epoch": 0.9167748691099477, + "grad_norm": 1.860182523727417, + "learning_rate": 8.645161007140468e-08, + "loss": 5.1615, + "step": 1368 + }, + { + "epoch": 0.9174450261780105, + "grad_norm": 1.9219021797180176, + "learning_rate": 8.50707825242425e-08, + "loss": 5.3586, + "step": 1369 + }, + { + "epoch": 0.9181151832460733, + "grad_norm": 2.7881152629852295, + "learning_rate": 8.370088051923297e-08, + "loss": 5.122, + "step": 1370 + }, + { + "epoch": 0.9187853403141362, + "grad_norm": 2.0921435356140137, + "learning_rate": 8.234191025404126e-08, + "loss": 4.9688, + "step": 1371 + }, + { + "epoch": 0.919455497382199, + "grad_norm": 1.88462495803833, + "learning_rate": 8.099387787687541e-08, + "loss": 4.6999, + "step": 1372 + }, + { + "epoch": 0.9201256544502617, + "grad_norm": 2.3097009658813477, + "learning_rate": 7.965678948645833e-08, + "loss": 5.2093, + "step": 1373 + }, + { + "epoch": 0.9207958115183246, + "grad_norm": 1.9321610927581787, + "learning_rate": 7.833065113200034e-08, + "loss": 5.155, + "step": 1374 + }, + { + "epoch": 0.9214659685863874, + "grad_norm": 2.0562613010406494, + "learning_rate": 7.701546881317273e-08, + "loss": 4.8406, + "step": 1375 + }, + { + "epoch": 0.9221361256544502, + "grad_norm": 2.438230037689209, + "learning_rate": 7.571124848007954e-08, + "loss": 5.0072, + "step": 1376 + }, + { + "epoch": 0.9228062827225131, + "grad_norm": 2.011482000350952, + "learning_rate": 7.441799603323058e-08, + "loss": 4.9704, + "step": 1377 + }, + { + "epoch": 0.9234764397905759, + "grad_norm": 2.19347882270813, + "learning_rate": 7.313571732351588e-08, + "loss": 4.7632, + "step": 1378 + }, + { + "epoch": 0.9241465968586388, + "grad_norm": 1.9491517543792725, + "learning_rate": 7.186441815217771e-08, + "loss": 5.0813, + "step": 1379 + }, + { + "epoch": 0.9248167539267016, + "grad_norm": 2.243320941925049, + "learning_rate": 7.060410427078473e-08, + "loss": 4.8204, + "step": 1380 + }, + { + "epoch": 0.9254869109947644, + "grad_norm": 2.298475503921509, + "learning_rate": 6.93547813812076e-08, + "loss": 4.7645, + "step": 1381 + }, + { + "epoch": 0.9261570680628273, + "grad_norm": 1.9371122121810913, + "learning_rate": 6.811645513559118e-08, + "loss": 5.1322, + "step": 1382 + }, + { + "epoch": 0.92682722513089, + "grad_norm": 2.0925161838531494, + "learning_rate": 6.688913113632983e-08, + "loss": 5.0049, + "step": 1383 + }, + { + "epoch": 0.9274973821989528, + "grad_norm": 2.1542677879333496, + "learning_rate": 6.567281493604139e-08, + "loss": 5.146, + "step": 1384 + }, + { + "epoch": 0.9281675392670157, + "grad_norm": 2.136967897415161, + "learning_rate": 6.446751203754347e-08, + "loss": 4.8245, + "step": 1385 + }, + { + "epoch": 0.9288376963350785, + "grad_norm": 2.5279605388641357, + "learning_rate": 6.327322789382695e-08, + "loss": 4.8866, + "step": 1386 + }, + { + "epoch": 0.9295078534031413, + "grad_norm": 2.2963929176330566, + "learning_rate": 6.208996790803223e-08, + "loss": 4.6499, + "step": 1387 + }, + { + "epoch": 0.9301780104712042, + "grad_norm": 2.017291784286499, + "learning_rate": 6.091773743342493e-08, + "loss": 5.1354, + "step": 1388 + }, + { + "epoch": 0.930848167539267, + "grad_norm": 2.0148367881774902, + "learning_rate": 5.97565417733706e-08, + "loss": 4.6275, + "step": 1389 + }, + { + "epoch": 0.9315183246073299, + "grad_norm": 2.604938268661499, + "learning_rate": 5.8606386181312147e-08, + "loss": 5.2862, + "step": 1390 + }, + { + "epoch": 0.9321884816753927, + "grad_norm": 1.9097956418991089, + "learning_rate": 5.74672758607453e-08, + "loss": 4.7748, + "step": 1391 + }, + { + "epoch": 0.9328586387434555, + "grad_norm": 2.368349552154541, + "learning_rate": 5.633921596519487e-08, + "loss": 5.1818, + "step": 1392 + }, + { + "epoch": 0.9335287958115184, + "grad_norm": 2.3302204608917236, + "learning_rate": 5.522221159819152e-08, + "loss": 5.0694, + "step": 1393 + }, + { + "epoch": 0.9341989528795811, + "grad_norm": 2.0187971591949463, + "learning_rate": 5.411626781324897e-08, + "loss": 5.2536, + "step": 1394 + }, + { + "epoch": 0.9348691099476439, + "grad_norm": 2.059431314468384, + "learning_rate": 5.302138961384179e-08, + "loss": 4.9142, + "step": 1395 + }, + { + "epoch": 0.9355392670157068, + "grad_norm": 2.138587713241577, + "learning_rate": 5.193758195338072e-08, + "loss": 5.185, + "step": 1396 + }, + { + "epoch": 0.9362094240837696, + "grad_norm": 1.9335336685180664, + "learning_rate": 5.0864849735192676e-08, + "loss": 4.9316, + "step": 1397 + }, + { + "epoch": 0.9368795811518325, + "grad_norm": 1.8651350736618042, + "learning_rate": 4.980319781249687e-08, + "loss": 4.8454, + "step": 1398 + }, + { + "epoch": 0.9375497382198953, + "grad_norm": 2.0417473316192627, + "learning_rate": 4.875263098838345e-08, + "loss": 4.9571, + "step": 1399 + }, + { + "epoch": 0.9382198952879581, + "grad_norm": 1.8700650930404663, + "learning_rate": 4.771315401579185e-08, + "loss": 4.6554, + "step": 1400 + }, + { + "epoch": 0.938890052356021, + "grad_norm": 2.453132390975952, + "learning_rate": 4.668477159748858e-08, + "loss": 4.8134, + "step": 1401 + }, + { + "epoch": 0.9395602094240838, + "grad_norm": 1.904445767402649, + "learning_rate": 4.5667488386047784e-08, + "loss": 5.1193, + "step": 1402 + }, + { + "epoch": 0.9402303664921466, + "grad_norm": 1.8896658420562744, + "learning_rate": 4.466130898382798e-08, + "loss": 4.8977, + "step": 1403 + }, + { + "epoch": 0.9409005235602095, + "grad_norm": 2.006617546081543, + "learning_rate": 4.366623794295199e-08, + "loss": 5.0047, + "step": 1404 + }, + { + "epoch": 0.9415706806282722, + "grad_norm": 1.9613410234451294, + "learning_rate": 4.2682279765287324e-08, + "loss": 5.0321, + "step": 1405 + }, + { + "epoch": 0.942240837696335, + "grad_norm": 2.4916062355041504, + "learning_rate": 4.170943890242501e-08, + "loss": 4.7501, + "step": 1406 + }, + { + "epoch": 0.9429109947643979, + "grad_norm": 2.0068774223327637, + "learning_rate": 4.074771975565911e-08, + "loss": 4.8988, + "step": 1407 + }, + { + "epoch": 0.9435811518324607, + "grad_norm": 2.543067216873169, + "learning_rate": 3.979712667596669e-08, + "loss": 5.1493, + "step": 1408 + }, + { + "epoch": 0.9442513089005236, + "grad_norm": 2.2659168243408203, + "learning_rate": 3.8857663963989825e-08, + "loss": 4.9611, + "step": 1409 + }, + { + "epoch": 0.9449214659685864, + "grad_norm": 2.5053818225860596, + "learning_rate": 3.792933587001418e-08, + "loss": 4.8455, + "step": 1410 + }, + { + "epoch": 0.9455916230366492, + "grad_norm": 2.22271990776062, + "learning_rate": 3.701214659395047e-08, + "loss": 4.995, + "step": 1411 + }, + { + "epoch": 0.9462617801047121, + "grad_norm": 1.9299851655960083, + "learning_rate": 3.6106100285316634e-08, + "loss": 4.7568, + "step": 1412 + }, + { + "epoch": 0.9469319371727749, + "grad_norm": 2.163027763366699, + "learning_rate": 3.5211201043216234e-08, + "loss": 4.8645, + "step": 1413 + }, + { + "epoch": 0.9476020942408377, + "grad_norm": 2.464020252227783, + "learning_rate": 3.432745291632289e-08, + "loss": 4.8882, + "step": 1414 + }, + { + "epoch": 0.9482722513089005, + "grad_norm": 2.6676039695739746, + "learning_rate": 3.3454859902860295e-08, + "loss": 4.9683, + "step": 1415 + }, + { + "epoch": 0.9489424083769633, + "grad_norm": 2.4233171939849854, + "learning_rate": 3.2593425950584736e-08, + "loss": 5.2675, + "step": 1416 + }, + { + "epoch": 0.9496125654450261, + "grad_norm": 1.5752209424972534, + "learning_rate": 3.174315495676705e-08, + "loss": 4.576, + "step": 1417 + }, + { + "epoch": 0.950282722513089, + "grad_norm": 2.3115358352661133, + "learning_rate": 3.0904050768174596e-08, + "loss": 4.9546, + "step": 1418 + }, + { + "epoch": 0.9509528795811518, + "grad_norm": 2.4787373542785645, + "learning_rate": 3.007611718105485e-08, + "loss": 5.0709, + "step": 1419 + }, + { + "epoch": 0.9516230366492147, + "grad_norm": 2.1984169483184814, + "learning_rate": 2.9259357941117117e-08, + "loss": 4.6754, + "step": 1420 + }, + { + "epoch": 0.9522931937172775, + "grad_norm": 2.377700090408325, + "learning_rate": 2.8453776743516127e-08, + "loss": 5.0916, + "step": 1421 + }, + { + "epoch": 0.9529633507853403, + "grad_norm": 1.8702716827392578, + "learning_rate": 2.7659377232835683e-08, + "loss": 4.9365, + "step": 1422 + }, + { + "epoch": 0.9536335078534032, + "grad_norm": 1.924991250038147, + "learning_rate": 2.687616300307172e-08, + "loss": 4.8889, + "step": 1423 + }, + { + "epoch": 0.954303664921466, + "grad_norm": 2.0711193084716797, + "learning_rate": 2.6104137597615375e-08, + "loss": 4.8053, + "step": 1424 + }, + { + "epoch": 0.9549738219895288, + "grad_norm": 1.9230101108551025, + "learning_rate": 2.5343304509238552e-08, + "loss": 5.077, + "step": 1425 + }, + { + "epoch": 0.9556439790575916, + "grad_norm": 1.9874987602233887, + "learning_rate": 2.4593667180076996e-08, + "loss": 4.8518, + "step": 1426 + }, + { + "epoch": 0.9563141361256544, + "grad_norm": 2.0824201107025146, + "learning_rate": 2.3855229001614744e-08, + "loss": 5.1043, + "step": 1427 + }, + { + "epoch": 0.9569842931937173, + "grad_norm": 2.187917947769165, + "learning_rate": 2.3127993314669138e-08, + "loss": 4.8822, + "step": 1428 + }, + { + "epoch": 0.9576544502617801, + "grad_norm": 1.7651984691619873, + "learning_rate": 2.241196340937557e-08, + "loss": 5.0213, + "step": 1429 + }, + { + "epoch": 0.9583246073298429, + "grad_norm": 2.359253168106079, + "learning_rate": 2.170714252517275e-08, + "loss": 5.1379, + "step": 1430 + }, + { + "epoch": 0.9589947643979058, + "grad_norm": 1.8880469799041748, + "learning_rate": 2.1013533850787747e-08, + "loss": 4.7338, + "step": 1431 + }, + { + "epoch": 0.9596649214659686, + "grad_norm": 2.318695068359375, + "learning_rate": 2.033114052422208e-08, + "loss": 4.7872, + "step": 1432 + }, + { + "epoch": 0.9603350785340314, + "grad_norm": 2.3342957496643066, + "learning_rate": 1.96599656327362e-08, + "loss": 4.7796, + "step": 1433 + }, + { + "epoch": 0.9610052356020943, + "grad_norm": 1.8491644859313965, + "learning_rate": 1.9000012212837537e-08, + "loss": 4.8163, + "step": 1434 + }, + { + "epoch": 0.9616753926701571, + "grad_norm": 2.1779308319091797, + "learning_rate": 1.835128325026525e-08, + "loss": 5.0043, + "step": 1435 + }, + { + "epoch": 0.9623455497382198, + "grad_norm": 2.096520185470581, + "learning_rate": 1.771378167997745e-08, + "loss": 4.9094, + "step": 1436 + }, + { + "epoch": 0.9630157068062827, + "grad_norm": 2.06124210357666, + "learning_rate": 1.7087510386137042e-08, + "loss": 5.0354, + "step": 1437 + }, + { + "epoch": 0.9636858638743455, + "grad_norm": 2.6854307651519775, + "learning_rate": 1.6472472202100074e-08, + "loss": 5.0915, + "step": 1438 + }, + { + "epoch": 0.9643560209424084, + "grad_norm": 2.3049659729003906, + "learning_rate": 1.586866991040187e-08, + "loss": 5.0065, + "step": 1439 + }, + { + "epoch": 0.9650261780104712, + "grad_norm": 2.1497740745544434, + "learning_rate": 1.5276106242744225e-08, + "loss": 4.9907, + "step": 1440 + }, + { + "epoch": 0.965696335078534, + "grad_norm": 2.1462044715881348, + "learning_rate": 1.4694783879984076e-08, + "loss": 4.8249, + "step": 1441 + }, + { + "epoch": 0.9663664921465969, + "grad_norm": 2.2082231044769287, + "learning_rate": 1.4124705452120691e-08, + "loss": 5.104, + "step": 1442 + }, + { + "epoch": 0.9670366492146597, + "grad_norm": 2.118767023086548, + "learning_rate": 1.3565873538283758e-08, + "loss": 4.9168, + "step": 1443 + }, + { + "epoch": 0.9677068062827225, + "grad_norm": 1.9635920524597168, + "learning_rate": 1.3018290666721723e-08, + "loss": 5.0158, + "step": 1444 + }, + { + "epoch": 0.9683769633507854, + "grad_norm": 2.3437256813049316, + "learning_rate": 1.2481959314791514e-08, + "loss": 5.1937, + "step": 1445 + }, + { + "epoch": 0.9690471204188482, + "grad_norm": 1.8330011367797852, + "learning_rate": 1.195688190894495e-08, + "loss": 4.8805, + "step": 1446 + }, + { + "epoch": 0.969717277486911, + "grad_norm": 2.258427858352661, + "learning_rate": 1.1443060824719576e-08, + "loss": 4.9438, + "step": 1447 + }, + { + "epoch": 0.9703874345549738, + "grad_norm": 2.3859939575195312, + "learning_rate": 1.0940498386728116e-08, + "loss": 5.1079, + "step": 1448 + }, + { + "epoch": 0.9710575916230366, + "grad_norm": 1.903747797012329, + "learning_rate": 1.0449196868646261e-08, + "loss": 5.1166, + "step": 1449 + }, + { + "epoch": 0.9717277486910995, + "grad_norm": 2.1217775344848633, + "learning_rate": 9.969158493204067e-09, + "loss": 4.7963, + "step": 1450 + }, + { + "epoch": 0.9723979057591623, + "grad_norm": 2.065089464187622, + "learning_rate": 9.500385432175684e-09, + "loss": 5.1951, + "step": 1451 + }, + { + "epoch": 0.9730680628272251, + "grad_norm": 2.248767852783203, + "learning_rate": 9.0428798063677e-09, + "loss": 4.9754, + "step": 1452 + }, + { + "epoch": 0.973738219895288, + "grad_norm": 2.0079345703125, + "learning_rate": 8.596643685612194e-09, + "loss": 4.9236, + "step": 1453 + }, + { + "epoch": 0.9744083769633508, + "grad_norm": 2.086193799972534, + "learning_rate": 8.161679088755925e-09, + "loss": 5.1244, + "step": 1454 + }, + { + "epoch": 0.9750785340314136, + "grad_norm": 2.2422029972076416, + "learning_rate": 7.737987983650607e-09, + "loss": 5.0332, + "step": 1455 + }, + { + "epoch": 0.9757486910994765, + "grad_norm": 2.0613884925842285, + "learning_rate": 7.325572287146254e-09, + "loss": 4.8539, + "step": 1456 + }, + { + "epoch": 0.9764188481675393, + "grad_norm": 2.1857516765594482, + "learning_rate": 6.9244338650792385e-09, + "loss": 4.835, + "step": 1457 + }, + { + "epoch": 0.977089005235602, + "grad_norm": 2.3052470684051514, + "learning_rate": 6.534574532267302e-09, + "loss": 4.8526, + "step": 1458 + }, + { + "epoch": 0.9777591623036649, + "grad_norm": 2.3822693824768066, + "learning_rate": 6.155996052499002e-09, + "loss": 5.2308, + "step": 1459 + }, + { + "epoch": 0.9784293193717277, + "grad_norm": 2.422942876815796, + "learning_rate": 5.788700138525949e-09, + "loss": 4.9636, + "step": 1460 + }, + { + "epoch": 0.9790994764397906, + "grad_norm": 1.6775134801864624, + "learning_rate": 5.432688452056689e-09, + "loss": 5.1564, + "step": 1461 + }, + { + "epoch": 0.9797696335078534, + "grad_norm": 2.2838222980499268, + "learning_rate": 5.087962603746999e-09, + "loss": 5.0561, + "step": 1462 + }, + { + "epoch": 0.9804397905759162, + "grad_norm": 2.447178363800049, + "learning_rate": 4.754524153194051e-09, + "loss": 5.1673, + "step": 1463 + }, + { + "epoch": 0.9811099476439791, + "grad_norm": 2.1302945613861084, + "learning_rate": 4.432374608929202e-09, + "loss": 4.8763, + "step": 1464 + }, + { + "epoch": 0.9817801047120419, + "grad_norm": 1.8921369314193726, + "learning_rate": 4.121515428410217e-09, + "loss": 5.0894, + "step": 1465 + }, + { + "epoch": 0.9824502617801047, + "grad_norm": 1.892519235610962, + "learning_rate": 3.821948018016552e-09, + "loss": 4.8003, + "step": 1466 + }, + { + "epoch": 0.9831204188481676, + "grad_norm": 1.7763696908950806, + "learning_rate": 3.533673733040477e-09, + "loss": 4.9118, + "step": 1467 + }, + { + "epoch": 0.9837905759162303, + "grad_norm": 2.504063844680786, + "learning_rate": 3.2566938776831814e-09, + "loss": 5.0741, + "step": 1468 + }, + { + "epoch": 0.9844607329842932, + "grad_norm": 2.338676929473877, + "learning_rate": 2.9910097050472877e-09, + "loss": 4.751, + "step": 1469 + }, + { + "epoch": 0.985130890052356, + "grad_norm": 2.104109048843384, + "learning_rate": 2.736622417132684e-09, + "loss": 5.079, + "step": 1470 + }, + { + "epoch": 0.9858010471204188, + "grad_norm": 1.9825674295425415, + "learning_rate": 2.4935331648298644e-09, + "loss": 4.7784, + "step": 1471 + }, + { + "epoch": 0.9864712041884817, + "grad_norm": 2.10986328125, + "learning_rate": 2.2617430479149327e-09, + "loss": 4.8066, + "step": 1472 + }, + { + "epoch": 0.9871413612565445, + "grad_norm": 2.6451609134674072, + "learning_rate": 2.0412531150448833e-09, + "loss": 5.0877, + "step": 1473 + }, + { + "epoch": 0.9878115183246073, + "grad_norm": 2.6569290161132812, + "learning_rate": 1.832064363753161e-09, + "loss": 4.9329, + "step": 1474 + }, + { + "epoch": 0.9884816753926702, + "grad_norm": 2.2146356105804443, + "learning_rate": 1.634177740444387e-09, + "loss": 5.0452, + "step": 1475 + }, + { + "epoch": 0.989151832460733, + "grad_norm": 2.198579788208008, + "learning_rate": 1.4475941403904737e-09, + "loss": 4.8974, + "step": 1476 + }, + { + "epoch": 0.9898219895287959, + "grad_norm": 2.7071197032928467, + "learning_rate": 1.272314407727293e-09, + "loss": 4.9834, + "step": 1477 + }, + { + "epoch": 0.9904921465968587, + "grad_norm": 1.946290135383606, + "learning_rate": 1.1083393354488492e-09, + "loss": 4.9904, + "step": 1478 + }, + { + "epoch": 0.9911623036649214, + "grad_norm": 2.483842611312866, + "learning_rate": 9.556696654058894e-10, + "loss": 4.8588, + "step": 1479 + }, + { + "epoch": 0.9918324607329843, + "grad_norm": 2.2516233921051025, + "learning_rate": 8.143060883017418e-10, + "loss": 4.8331, + "step": 1480 + }, + { + "epoch": 0.9925026178010471, + "grad_norm": 2.159393787384033, + "learning_rate": 6.842492436887061e-10, + "loss": 5.2052, + "step": 1481 + }, + { + "epoch": 0.9931727748691099, + "grad_norm": 2.3281965255737305, + "learning_rate": 5.654997199652789e-10, + "loss": 5.2328, + "step": 1482 + }, + { + "epoch": 0.9938429319371728, + "grad_norm": 2.1843552589416504, + "learning_rate": 4.5805805437448834e-10, + "loss": 4.9036, + "step": 1483 + }, + { + "epoch": 0.9945130890052356, + "grad_norm": 2.202324867248535, + "learning_rate": 3.6192473300000754e-10, + "loss": 4.9878, + "step": 1484 + }, + { + "epoch": 0.9951832460732984, + "grad_norm": 2.2752175331115723, + "learning_rate": 2.7710019076532257e-10, + "loss": 4.9163, + "step": 1485 + }, + { + "epoch": 0.9958534031413613, + "grad_norm": 2.1237142086029053, + "learning_rate": 2.0358481143067932e-10, + "loss": 4.9672, + "step": 1486 + }, + { + "epoch": 0.9965235602094241, + "grad_norm": 1.9874749183654785, + "learning_rate": 1.4137892759169546e-10, + "loss": 4.8258, + "step": 1487 + }, + { + "epoch": 0.997193717277487, + "grad_norm": 2.537884473800659, + "learning_rate": 9.048282067825043e-11, + "loss": 5.03, + "step": 1488 + }, + { + "epoch": 0.9978638743455497, + "grad_norm": 1.966549277305603, + "learning_rate": 5.0896720952819945e-11, + "loss": 5.0325, + "step": 1489 + }, + { + "epoch": 0.9985340314136125, + "grad_norm": 2.476804733276367, + "learning_rate": 2.262080750964346e-11, + "loss": 5.1559, + "step": 1490 + }, + { + "epoch": 0.9992041884816754, + "grad_norm": 1.8347021341323853, + "learning_rate": 5.655208273613877e-12, + "loss": 5.0006, + "step": 1491 + }, + { + "epoch": 0.9998743455497382, + "grad_norm": 2.3418211936950684, + "learning_rate": 0.0, + "loss": 5.0768, + "step": 1492 + }, + { + "epoch": 0.9998743455497382, + "step": 1492, + "total_flos": 2.186841498544033e+19, + "train_loss": 5.3191524389282305, + "train_runtime": 50561.1303, + "train_samples_per_second": 7.555, + "train_steps_per_second": 0.03 + } + ], + "logging_steps": 1, + "max_steps": 1492, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.186841498544033e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}