Veronica / trainer_state.json
Veronica-Polymorphic 551M — Pretrained v1
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999925976919604,
"eval_steps": 1000,
"global_step": 33773,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002960923215858705,
"grad_norm": 41.32149887084961,
"learning_rate": 1.98e-06,
"loss": 87.7468,
"step": 100
},
{
"epoch": 0.00592184643171741,
"grad_norm": 39.51910400390625,
"learning_rate": 3.98e-06,
"loss": 85.766,
"step": 200
},
{
"epoch": 0.008882769647576115,
"grad_norm": 26.557823181152344,
"learning_rate": 5.98e-06,
"loss": 80.1376,
"step": 300
},
{
"epoch": 0.01184369286343482,
"grad_norm": 21.80652618408203,
"learning_rate": 7.98e-06,
"loss": 74.3306,
"step": 400
},
{
"epoch": 0.014804616079293524,
"grad_norm": 16.2612247467041,
"learning_rate": 9.980000000000001e-06,
"loss": 72.3247,
"step": 500
},
{
"epoch": 0.01776553929515223,
"grad_norm": 17.281190872192383,
"learning_rate": 1.198e-05,
"loss": 71.0703,
"step": 600
},
{
"epoch": 0.020726462511010933,
"grad_norm": 16.140579223632812,
"learning_rate": 1.3980000000000002e-05,
"loss": 69.5824,
"step": 700
},
{
"epoch": 0.02368738572686964,
"grad_norm": 13.456184387207031,
"learning_rate": 1.598e-05,
"loss": 67.2135,
"step": 800
},
{
"epoch": 0.026648308942728342,
"grad_norm": 11.778711318969727,
"learning_rate": 1.798e-05,
"loss": 65.1397,
"step": 900
},
{
"epoch": 0.02960923215858705,
"grad_norm": 11.965922355651855,
"learning_rate": 1.9980000000000002e-05,
"loss": 63.204,
"step": 1000
},
{
"epoch": 0.02960923215858705,
"eval_loss": 7.691287040710449,
"eval_runtime": 37.9485,
"eval_samples_per_second": 28.486,
"eval_steps_per_second": 7.141,
"step": 1000
},
{
"epoch": 0.032570155374445756,
"grad_norm": 8.482980728149414,
"learning_rate": 2.198e-05,
"loss": 60.9402,
"step": 1100
},
{
"epoch": 0.03553107859030446,
"grad_norm": 49.948341369628906,
"learning_rate": 2.398e-05,
"loss": 58.9346,
"step": 1200
},
{
"epoch": 0.03849200180616316,
"grad_norm": 10.039616584777832,
"learning_rate": 2.5980000000000002e-05,
"loss": 57.309,
"step": 1300
},
{
"epoch": 0.041452925022021865,
"grad_norm": 8.930785179138184,
"learning_rate": 2.798e-05,
"loss": 56.3741,
"step": 1400
},
{
"epoch": 0.04441384823788057,
"grad_norm": 8.14844036102295,
"learning_rate": 2.998e-05,
"loss": 55.8969,
"step": 1500
},
{
"epoch": 0.04737477145373928,
"grad_norm": 16.170246124267578,
"learning_rate": 3.198e-05,
"loss": 55.12,
"step": 1600
},
{
"epoch": 0.05033569466959798,
"grad_norm": 10.525145530700684,
"learning_rate": 3.398e-05,
"loss": 54.5077,
"step": 1700
},
{
"epoch": 0.053296617885456685,
"grad_norm": 29.427160263061523,
"learning_rate": 3.5980000000000004e-05,
"loss": 54.2087,
"step": 1800
},
{
"epoch": 0.05625754110131539,
"grad_norm": 47.125083923339844,
"learning_rate": 3.798e-05,
"loss": 53.4388,
"step": 1900
},
{
"epoch": 0.0592184643171741,
"grad_norm": 10.07633113861084,
"learning_rate": 3.998e-05,
"loss": 53.5798,
"step": 2000
},
{
"epoch": 0.0592184643171741,
"eval_loss": 6.5744218826293945,
"eval_runtime": 38.8407,
"eval_samples_per_second": 27.832,
"eval_steps_per_second": 6.977,
"step": 2000
},
{
"epoch": 0.0621793875330328,
"grad_norm": 10.918025970458984,
"learning_rate": 4.198e-05,
"loss": 52.5816,
"step": 2100
},
{
"epoch": 0.06514031074889151,
"grad_norm": 12.68106460571289,
"learning_rate": 4.398e-05,
"loss": 52.6681,
"step": 2200
},
{
"epoch": 0.06810123396475021,
"grad_norm": 15.503605842590332,
"learning_rate": 4.598e-05,
"loss": 52.5443,
"step": 2300
},
{
"epoch": 0.07106215718060892,
"grad_norm": 10.995290756225586,
"learning_rate": 4.798e-05,
"loss": 51.4841,
"step": 2400
},
{
"epoch": 0.07402308039646761,
"grad_norm": 22.71038055419922,
"learning_rate": 4.998e-05,
"loss": 51.05,
"step": 2500
},
{
"epoch": 0.07698400361232632,
"grad_norm": 11.035310745239258,
"learning_rate": 5.198e-05,
"loss": 50.8273,
"step": 2600
},
{
"epoch": 0.07994492682818503,
"grad_norm": 14.107246398925781,
"learning_rate": 5.398e-05,
"loss": 50.9043,
"step": 2700
},
{
"epoch": 0.08290585004404373,
"grad_norm": 11.570377349853516,
"learning_rate": 5.598e-05,
"loss": 50.4862,
"step": 2800
},
{
"epoch": 0.08586677325990244,
"grad_norm": 16.12681770324707,
"learning_rate": 5.7980000000000004e-05,
"loss": 50.0248,
"step": 2900
},
{
"epoch": 0.08882769647576114,
"grad_norm": 14.925129890441895,
"learning_rate": 5.9980000000000005e-05,
"loss": 49.789,
"step": 3000
},
{
"epoch": 0.08882769647576114,
"eval_loss": 6.141844272613525,
"eval_runtime": 38.2773,
"eval_samples_per_second": 28.241,
"eval_steps_per_second": 7.08,
"step": 3000
},
{
"epoch": 0.09178861969161985,
"grad_norm": 16.748519897460938,
"learning_rate": 6.198e-05,
"loss": 49.4085,
"step": 3100
},
{
"epoch": 0.09474954290747856,
"grad_norm": 12.9891939163208,
"learning_rate": 6.398000000000001e-05,
"loss": 49.1003,
"step": 3200
},
{
"epoch": 0.09771046612333725,
"grad_norm": 11.456365585327148,
"learning_rate": 6.598e-05,
"loss": 48.6685,
"step": 3300
},
{
"epoch": 0.10067138933919596,
"grad_norm": 19.153154373168945,
"learning_rate": 6.798e-05,
"loss": 48.3342,
"step": 3400
},
{
"epoch": 0.10363231255505466,
"grad_norm": 13.501580238342285,
"learning_rate": 6.998e-05,
"loss": 47.7641,
"step": 3500
},
{
"epoch": 0.10659323577091337,
"grad_norm": 14.034686088562012,
"learning_rate": 7.198e-05,
"loss": 47.8053,
"step": 3600
},
{
"epoch": 0.10955415898677208,
"grad_norm": 14.023058891296387,
"learning_rate": 7.398e-05,
"loss": 47.4925,
"step": 3700
},
{
"epoch": 0.11251508220263078,
"grad_norm": 16.410221099853516,
"learning_rate": 7.598e-05,
"loss": 47.1501,
"step": 3800
},
{
"epoch": 0.11547600541848949,
"grad_norm": 17.547571182250977,
"learning_rate": 7.798000000000001e-05,
"loss": 47.3132,
"step": 3900
},
{
"epoch": 0.1184369286343482,
"grad_norm": 11.753161430358887,
"learning_rate": 7.998e-05,
"loss": 47.071,
"step": 4000
},
{
"epoch": 0.1184369286343482,
"eval_loss": 5.719655990600586,
"eval_runtime": 38.3575,
"eval_samples_per_second": 28.182,
"eval_steps_per_second": 7.065,
"step": 4000
},
{
"epoch": 0.12139785185020689,
"grad_norm": 51.37761306762695,
"learning_rate": 8.198000000000001e-05,
"loss": 46.1728,
"step": 4100
},
{
"epoch": 0.1243587750660656,
"grad_norm": 12.61581802368164,
"learning_rate": 8.398e-05,
"loss": 45.9941,
"step": 4200
},
{
"epoch": 0.1273196982819243,
"grad_norm": 12.667135238647461,
"learning_rate": 8.598e-05,
"loss": 46.1649,
"step": 4300
},
{
"epoch": 0.13028062149778302,
"grad_norm": 10.368157386779785,
"learning_rate": 8.798e-05,
"loss": 45.9172,
"step": 4400
},
{
"epoch": 0.13324154471364172,
"grad_norm": 12.5702543258667,
"learning_rate": 8.998e-05,
"loss": 45.5541,
"step": 4500
},
{
"epoch": 0.13620246792950041,
"grad_norm": 10.535380363464355,
"learning_rate": 9.198e-05,
"loss": 44.45,
"step": 4600
},
{
"epoch": 0.1391633911453591,
"grad_norm": 11.035446166992188,
"learning_rate": 9.398e-05,
"loss": 44.3243,
"step": 4700
},
{
"epoch": 0.14212431436121784,
"grad_norm": 10.718255043029785,
"learning_rate": 9.598e-05,
"loss": 43.8677,
"step": 4800
},
{
"epoch": 0.14508523757707653,
"grad_norm": 9.816108703613281,
"learning_rate": 9.798000000000001e-05,
"loss": 43.6948,
"step": 4900
},
{
"epoch": 0.14804616079293523,
"grad_norm": 9.837696075439453,
"learning_rate": 9.998000000000002e-05,
"loss": 43.6361,
"step": 5000
},
{
"epoch": 0.14804616079293523,
"eval_loss": 5.329010486602783,
"eval_runtime": 38.0693,
"eval_samples_per_second": 28.396,
"eval_steps_per_second": 7.119,
"step": 5000
},
{
"epoch": 0.15100708400879395,
"grad_norm": 10.140490531921387,
"learning_rate": 0.00010198,
"loss": 42.8106,
"step": 5100
},
{
"epoch": 0.15396800722465265,
"grad_norm": 9.330647468566895,
"learning_rate": 0.00010398,
"loss": 42.4439,
"step": 5200
},
{
"epoch": 0.15692893044051134,
"grad_norm": 9.038117408752441,
"learning_rate": 0.00010598,
"loss": 41.8943,
"step": 5300
},
{
"epoch": 0.15988985365637007,
"grad_norm": 10.28738021850586,
"learning_rate": 0.00010798,
"loss": 41.5117,
"step": 5400
},
{
"epoch": 0.16285077687222876,
"grad_norm": 9.869328498840332,
"learning_rate": 0.00010998,
"loss": 41.3489,
"step": 5500
},
{
"epoch": 0.16581170008808746,
"grad_norm": 10.985088348388672,
"learning_rate": 0.00011198000000000001,
"loss": 40.7585,
"step": 5600
},
{
"epoch": 0.16877262330394618,
"grad_norm": 11.46516227722168,
"learning_rate": 0.00011398,
"loss": 40.2893,
"step": 5700
},
{
"epoch": 0.17173354651980488,
"grad_norm": 9.691688537597656,
"learning_rate": 0.00011598000000000001,
"loss": 40.0513,
"step": 5800
},
{
"epoch": 0.17469446973566358,
"grad_norm": 10.819178581237793,
"learning_rate": 0.00011798,
"loss": 39.986,
"step": 5900
},
{
"epoch": 0.17765539295152227,
"grad_norm": 9.694029808044434,
"learning_rate": 0.00011998,
"loss": 39.3918,
"step": 6000
},
{
"epoch": 0.17765539295152227,
"eval_loss": 4.900777339935303,
"eval_runtime": 38.052,
"eval_samples_per_second": 28.409,
"eval_steps_per_second": 7.122,
"step": 6000
},
{
"epoch": 0.180616316167381,
"grad_norm": 9.988055229187012,
"learning_rate": 0.00011999900481764066,
"loss": 39.336,
"step": 6100
},
{
"epoch": 0.1835772393832397,
"grad_norm": 13.242379188537598,
"learning_rate": 0.00011999597899343296,
"loss": 39.0612,
"step": 6200
},
{
"epoch": 0.1865381625990984,
"grad_norm": 13.935639381408691,
"learning_rate": 0.00011999092252825071,
"loss": 38.585,
"step": 6300
},
{
"epoch": 0.1894990858149571,
"grad_norm": 9.50368881225586,
"learning_rate": 0.00011998383559323646,
"loss": 38.3112,
"step": 6400
},
{
"epoch": 0.1924600090308158,
"grad_norm": 10.878887176513672,
"learning_rate": 0.00011997471842825661,
"loss": 38.3219,
"step": 6500
},
{
"epoch": 0.1954209322466745,
"grad_norm": 10.517402648925781,
"learning_rate": 0.00011996357134189334,
"loss": 37.8246,
"step": 6600
},
{
"epoch": 0.19838185546253323,
"grad_norm": 10.922290802001953,
"learning_rate": 0.0001199503947114341,
"loss": 37.6387,
"step": 6700
},
{
"epoch": 0.20134277867839193,
"grad_norm": 11.845630645751953,
"learning_rate": 0.00011993518898285887,
"loss": 37.8343,
"step": 6800
},
{
"epoch": 0.20430370189425062,
"grad_norm": 8.628484725952148,
"learning_rate": 0.00011991795467082508,
"loss": 37.5011,
"step": 6900
},
{
"epoch": 0.20726462511010932,
"grad_norm": 9.489052772521973,
"learning_rate": 0.00011989869235865012,
"loss": 37.132,
"step": 7000
},
{
"epoch": 0.20726462511010932,
"eval_loss": 4.595886707305908,
"eval_runtime": 38.0814,
"eval_samples_per_second": 28.387,
"eval_steps_per_second": 7.116,
"step": 7000
},
{
"epoch": 0.21022554832596804,
"grad_norm": 9.687568664550781,
"learning_rate": 0.00011987740269829175,
"loss": 36.9362,
"step": 7100
},
{
"epoch": 0.21318647154182674,
"grad_norm": 8.676931381225586,
"learning_rate": 0.0001198540864103258,
"loss": 37.0267,
"step": 7200
},
{
"epoch": 0.21614739475768543,
"grad_norm": 9.232645988464355,
"learning_rate": 0.00011982874428392204,
"loss": 36.5181,
"step": 7300
},
{
"epoch": 0.21910831797354416,
"grad_norm": 8.917469024658203,
"learning_rate": 0.00011980137717681727,
"loss": 36.5812,
"step": 7400
},
{
"epoch": 0.22206924118940286,
"grad_norm": 8.593257904052734,
"learning_rate": 0.0001197719860152864,
"loss": 36.0672,
"step": 7500
},
{
"epoch": 0.22503016440526155,
"grad_norm": 10.630696296691895,
"learning_rate": 0.00011974057179411103,
"loss": 36.2405,
"step": 7600
},
{
"epoch": 0.22799108762112028,
"grad_norm": 9.975415229797363,
"learning_rate": 0.00011970713557654582,
"loss": 35.9903,
"step": 7700
},
{
"epoch": 0.23095201083697897,
"grad_norm": 8.622698783874512,
"learning_rate": 0.00011967167849428251,
"loss": 35.8196,
"step": 7800
},
{
"epoch": 0.23391293405283767,
"grad_norm": 14.828067779541016,
"learning_rate": 0.00011963420174741161,
"loss": 35.7946,
"step": 7900
},
{
"epoch": 0.2368738572686964,
"grad_norm": 9.303028106689453,
"learning_rate": 0.00011959470660438173,
"loss": 35.5493,
"step": 8000
},
{
"epoch": 0.2368738572686964,
"eval_loss": 4.408100128173828,
"eval_runtime": 37.807,
"eval_samples_per_second": 28.593,
"eval_steps_per_second": 7.168,
"step": 8000
},
{
"epoch": 0.2398347804845551,
"grad_norm": 11.987268447875977,
"learning_rate": 0.00011955319440195674,
"loss": 35.6014,
"step": 8100
},
{
"epoch": 0.24279570370041378,
"grad_norm": 10.032620429992676,
"learning_rate": 0.00011950966654517043,
"loss": 35.5302,
"step": 8200
},
{
"epoch": 0.24575662691627248,
"grad_norm": 9.362653732299805,
"learning_rate": 0.00011946412450727906,
"loss": 35.2124,
"step": 8300
},
{
"epoch": 0.2487175501321312,
"grad_norm": 9.706056594848633,
"learning_rate": 0.00011941656982971138,
"loss": 34.9229,
"step": 8400
},
{
"epoch": 0.25167847334798993,
"grad_norm": 10.424148559570312,
"learning_rate": 0.00011936700412201653,
"loss": 35.1602,
"step": 8500
},
{
"epoch": 0.2546393965638486,
"grad_norm": 10.900792121887207,
"learning_rate": 0.00011931542906180957,
"loss": 34.9212,
"step": 8600
},
{
"epoch": 0.2576003197797073,
"grad_norm": 10.541563034057617,
"learning_rate": 0.00011926184639471465,
"loss": 34.8347,
"step": 8700
},
{
"epoch": 0.26056124299556604,
"grad_norm": 8.576896667480469,
"learning_rate": 0.00011920625793430596,
"loss": 34.9933,
"step": 8800
},
{
"epoch": 0.2635221662114247,
"grad_norm": 10.162493705749512,
"learning_rate": 0.00011914866556204637,
"loss": 34.3925,
"step": 8900
},
{
"epoch": 0.26648308942728344,
"grad_norm": 11.247607231140137,
"learning_rate": 0.0001190890712272237,
"loss": 34.4828,
"step": 9000
},
{
"epoch": 0.26648308942728344,
"eval_loss": 4.2549567222595215,
"eval_runtime": 37.961,
"eval_samples_per_second": 28.477,
"eval_steps_per_second": 7.139,
"step": 9000
},
{
"epoch": 0.2694440126431421,
"grad_norm": 9.189545631408691,
"learning_rate": 0.00011902747694688472,
"loss": 34.3655,
"step": 9100
},
{
"epoch": 0.27240493585900083,
"grad_norm": 11.199912071228027,
"learning_rate": 0.000118963884805767,
"loss": 34.4358,
"step": 9200
},
{
"epoch": 0.27536585907485955,
"grad_norm": 9.673705101013184,
"learning_rate": 0.00011889829695622823,
"loss": 34.3689,
"step": 9300
},
{
"epoch": 0.2783267822907182,
"grad_norm": 10.03848934173584,
"learning_rate": 0.00011883071561817344,
"loss": 33.9158,
"step": 9400
},
{
"epoch": 0.28128770550657695,
"grad_norm": 11.581180572509766,
"learning_rate": 0.00011876114307897981,
"loss": 33.992,
"step": 9500
},
{
"epoch": 0.28424862872243567,
"grad_norm": 10.81711483001709,
"learning_rate": 0.00011868958169341929,
"loss": 34.1195,
"step": 9600
},
{
"epoch": 0.28720955193829434,
"grad_norm": 9.648648262023926,
"learning_rate": 0.00011861603388357893,
"loss": 34.1664,
"step": 9700
},
{
"epoch": 0.29017047515415306,
"grad_norm": 11.37558364868164,
"learning_rate": 0.00011854050213877877,
"loss": 33.9937,
"step": 9800
},
{
"epoch": 0.2931313983700118,
"grad_norm": 9.346961975097656,
"learning_rate": 0.0001184629890154878,
"loss": 33.6917,
"step": 9900
},
{
"epoch": 0.29609232158587045,
"grad_norm": 11.31644058227539,
"learning_rate": 0.0001183834971372372,
"loss": 33.7808,
"step": 10000
},
{
"epoch": 0.29609232158587045,
"eval_loss": 4.157764434814453,
"eval_runtime": 39.9196,
"eval_samples_per_second": 27.079,
"eval_steps_per_second": 6.789,
"step": 10000
},
{
"epoch": 0.2990532448017292,
"grad_norm": 18.920991897583008,
"learning_rate": 0.00011180531798567065,
"loss": 44.0571,
"step": 10100
},
{
"epoch": 0.3020141680175879,
"grad_norm": 15.393646240234375,
"learning_rate": 0.0001116461207502148,
"loss": 39.8888,
"step": 10200
},
{
"epoch": 0.30497509123344657,
"grad_norm": 13.2774076461792,
"learning_rate": 0.00011148550761026972,
"loss": 38.6529,
"step": 10300
},
{
"epoch": 0.3079360144493053,
"grad_norm": 13.391098976135254,
"learning_rate": 0.00011132348296912578,
"loss": 37.759,
"step": 10400
},
{
"epoch": 0.310896937665164,
"grad_norm": 11.917950630187988,
"learning_rate": 0.00011116005126877037,
"loss": 37.1968,
"step": 10500
},
{
"epoch": 0.3138578608810227,
"grad_norm": 11.100213050842285,
"learning_rate": 0.0001109952169897661,
"loss": 37.213,
"step": 10600
},
{
"epoch": 0.3168187840968814,
"grad_norm": 14.579487800598145,
"learning_rate": 0.00011082898465112802,
"loss": 36.7415,
"step": 10700
},
{
"epoch": 0.31977970731274014,
"grad_norm": 11.359614372253418,
"learning_rate": 0.00011066135881019965,
"loss": 36.361,
"step": 10800
},
{
"epoch": 0.3227406305285988,
"grad_norm": 12.316486358642578,
"learning_rate": 0.00011049234406252809,
"loss": 36.0591,
"step": 10900
},
{
"epoch": 0.32570155374445753,
"grad_norm": 13.45693588256836,
"learning_rate": 0.00011032194504173804,
"loss": 35.6357,
"step": 11000
},
{
"epoch": 0.32570155374445753,
"eval_loss": 4.41249418258667,
"eval_runtime": 39.8412,
"eval_samples_per_second": 27.133,
"eval_steps_per_second": 6.802,
"step": 11000
},
{
"epoch": 0.32866247696031625,
"grad_norm": 10.018808364868164,
"learning_rate": 0.00011412090151135696,
"loss": 33.7098,
"step": 11100
},
{
"epoch": 0.3316234001761749,
"grad_norm": 10.30320930480957,
"learning_rate": 0.00011397731809339621,
"loss": 33.7831,
"step": 11200
},
{
"epoch": 0.33458432339203364,
"grad_norm": 9.650611877441406,
"learning_rate": 0.0001138320949911399,
"loss": 33.5415,
"step": 11300
},
{
"epoch": 0.33754524660789237,
"grad_norm": 8.77065372467041,
"learning_rate": 0.0001136852366160714,
"loss": 33.2261,
"step": 11400
},
{
"epoch": 0.34050616982375104,
"grad_norm": 12.062385559082031,
"learning_rate": 0.00011353674742934919,
"loss": 33.0819,
"step": 11500
},
{
"epoch": 0.34346709303960976,
"grad_norm": 10.947739601135254,
"learning_rate": 0.00011338663194167138,
"loss": 33.3451,
"step": 11600
},
{
"epoch": 0.34642801625546843,
"grad_norm": 9.377535820007324,
"learning_rate": 0.00011323489471313875,
"loss": 32.8928,
"step": 11700
},
{
"epoch": 0.34938893947132715,
"grad_norm": 8.902270317077637,
"learning_rate": 0.00011308154035311608,
"loss": 33.1756,
"step": 11800
},
{
"epoch": 0.3523498626871859,
"grad_norm": 10.434513092041016,
"learning_rate": 0.00011292657352009224,
"loss": 33.1595,
"step": 11900
},
{
"epoch": 0.35531078590304455,
"grad_norm": 11.084539413452148,
"learning_rate": 0.00011276999892153867,
"loss": 33.359,
"step": 12000
},
{
"epoch": 0.35531078590304455,
"eval_loss": 4.073917865753174,
"eval_runtime": 38.304,
"eval_samples_per_second": 28.222,
"eval_steps_per_second": 7.075,
"step": 12000
},
{
"epoch": 0.35827170911890327,
"grad_norm": 7.943862438201904,
"learning_rate": 9.143653002276282e-05,
"loss": 32.5648,
"step": 12100
},
{
"epoch": 0.361232632334762,
"grad_norm": 8.098073959350586,
"learning_rate": 9.09346201340685e-05,
"loss": 32.1551,
"step": 12200
},
{
"epoch": 0.36419355555062066,
"grad_norm": 7.46992826461792,
"learning_rate": 9.042974429385753e-05,
"loss": 32.3569,
"step": 12300
},
{
"epoch": 0.3671544787664794,
"grad_norm": 7.480947971343994,
"learning_rate": 8.992195090864853e-05,
"loss": 32.4467,
"step": 12400
},
{
"epoch": 0.3701154019823381,
"grad_norm": 7.488786220550537,
"learning_rate": 8.941128866468864e-05,
"loss": 32.4447,
"step": 12500
},
{
"epoch": 0.3730763251981968,
"grad_norm": 8.124217987060547,
"learning_rate": 8.889780652328559e-05,
"loss": 32.3657,
"step": 12600
},
{
"epoch": 0.3760372484140555,
"grad_norm": 8.322397232055664,
"learning_rate": 8.83815537161135e-05,
"loss": 31.9431,
"step": 12700
},
{
"epoch": 0.3789981716299142,
"grad_norm": 8.59915828704834,
"learning_rate": 8.786257974049245e-05,
"loss": 31.9211,
"step": 12800
},
{
"epoch": 0.3819590948457729,
"grad_norm": 8.048558235168457,
"learning_rate": 8.734093435464301e-05,
"loss": 32.437,
"step": 12900
},
{
"epoch": 0.3849200180616316,
"grad_norm": 7.816276550292969,
"learning_rate": 8.681666757291531e-05,
"loss": 32.0396,
"step": 13000
},
{
"epoch": 0.3849200180616316,
"eval_loss": 3.9447479248046875,
"eval_runtime": 112.3499,
"eval_samples_per_second": 9.622,
"eval_steps_per_second": 2.412,
"step": 13000
},
{
"epoch": 0.38788094127749034,
"grad_norm": 8.613288879394531,
"learning_rate": 8.628982966099388e-05,
"loss": 31.874,
"step": 13100
},
{
"epoch": 0.390841864493349,
"grad_norm": 7.478573799133301,
"learning_rate": 8.576047113107821e-05,
"loss": 31.7233,
"step": 13200
},
{
"epoch": 0.39380278770920774,
"grad_norm": 7.845474720001221,
"learning_rate": 8.52286427370398e-05,
"loss": 31.628,
"step": 13300
},
{
"epoch": 0.39676371092506646,
"grad_norm": 7.7132158279418945,
"learning_rate": 8.469439546955592e-05,
"loss": 31.8516,
"step": 13400
},
{
"epoch": 0.39972463414092513,
"grad_norm": 9.245190620422363,
"learning_rate": 8.415778055122073e-05,
"loss": 31.8406,
"step": 13500
},
{
"epoch": 0.40268555735678385,
"grad_norm": 8.426488876342773,
"learning_rate": 8.361884943163423e-05,
"loss": 31.7148,
"step": 13600
},
{
"epoch": 0.4056464805726426,
"grad_norm": 7.879675388336182,
"learning_rate": 8.307765378246925e-05,
"loss": 31.9798,
"step": 13700
},
{
"epoch": 0.40860740378850124,
"grad_norm": 8.469719886779785,
"learning_rate": 8.253424549251735e-05,
"loss": 31.6741,
"step": 13800
},
{
"epoch": 0.41156832700435997,
"grad_norm": 8.198810577392578,
"learning_rate": 8.198867666271385e-05,
"loss": 31.6722,
"step": 13900
},
{
"epoch": 0.41452925022021864,
"grad_norm": 7.881684303283691,
"learning_rate": 8.144099960114239e-05,
"loss": 31.8682,
"step": 14000
},
{
"epoch": 0.41452925022021864,
"eval_loss": 3.904888153076172,
"eval_runtime": 110.4703,
"eval_samples_per_second": 9.785,
"eval_steps_per_second": 2.453,
"step": 14000
},
{
"epoch": 0.41749017343607736,
"grad_norm": 7.772391319274902,
"learning_rate": 8.089126681801981e-05,
"loss": 32.0349,
"step": 14100
},
{
"epoch": 0.4204510966519361,
"grad_norm": 8.459504127502441,
"learning_rate": 8.033953102066161e-05,
"loss": 31.5844,
"step": 14200
},
{
"epoch": 0.42341201986779475,
"grad_norm": 7.765544414520264,
"learning_rate": 7.978584510842833e-05,
"loss": 31.6879,
"step": 14300
},
{
"epoch": 0.4263729430836535,
"grad_norm": 8.06749153137207,
"learning_rate": 7.923026216765381e-05,
"loss": 31.5893,
"step": 14400
},
{
"epoch": 0.4293338662995122,
"grad_norm": 8.966425895690918,
"learning_rate": 7.86728354665553e-05,
"loss": 31.392,
"step": 14500
},
{
"epoch": 0.43229478951537087,
"grad_norm": 8.47319221496582,
"learning_rate": 7.81136184501262e-05,
"loss": 31.3068,
"step": 14600
},
{
"epoch": 0.4352557127312296,
"grad_norm": 8.642230033874512,
"learning_rate": 7.755266473501193e-05,
"loss": 31.5877,
"step": 14700
},
{
"epoch": 0.4382166359470883,
"grad_norm": 8.412428855895996,
"learning_rate": 7.699002810436915e-05,
"loss": 31.6239,
"step": 14800
},
{
"epoch": 0.441177559162947,
"grad_norm": 6.971558094024658,
"learning_rate": 7.642576250270929e-05,
"loss": 31.7946,
"step": 14900
},
{
"epoch": 0.4441384823788057,
"grad_norm": 7.922480583190918,
"learning_rate": 7.585992203072628e-05,
"loss": 31.4474,
"step": 15000
},
{
"epoch": 0.4441384823788057,
"eval_loss": 3.875948667526245,
"eval_runtime": 109.3049,
"eval_samples_per_second": 9.89,
"eval_steps_per_second": 2.479,
"step": 15000
},
{
"epoch": 0.44709940559466443,
"grad_norm": 8.747485160827637,
"learning_rate": 7.529256094010965e-05,
"loss": 31.6016,
"step": 15100
},
{
"epoch": 0.4500603288105231,
"grad_norm": 8.723346710205078,
"learning_rate": 7.472373362834283e-05,
"loss": 31.2744,
"step": 15200
},
{
"epoch": 0.4530212520263818,
"grad_norm": 8.310611724853516,
"learning_rate": 7.415349463348775e-05,
"loss": 31.7448,
"step": 15300
},
{
"epoch": 0.45598217524224055,
"grad_norm": 8.236388206481934,
"learning_rate": 7.358189862895577e-05,
"loss": 30.9859,
"step": 15400
},
{
"epoch": 0.4589430984580992,
"grad_norm": 8.104386329650879,
"learning_rate": 7.300900041826566e-05,
"loss": 31.1935,
"step": 15500
},
{
"epoch": 0.46190402167395794,
"grad_norm": 8.219923973083496,
"learning_rate": 7.243485492978928e-05,
"loss": 30.9099,
"step": 15600
},
{
"epoch": 0.46486494488981667,
"grad_norm": 8.872945785522461,
"learning_rate": 7.185951721148502e-05,
"loss": 31.3423,
"step": 15700
},
{
"epoch": 0.46782586810567534,
"grad_norm": 8.087647438049316,
"learning_rate": 7.128304242561999e-05,
"loss": 31.1816,
"step": 15800
},
{
"epoch": 0.47078679132153406,
"grad_norm": 8.805392265319824,
"learning_rate": 7.070548584348108e-05,
"loss": 31.0977,
"step": 15900
},
{
"epoch": 0.4737477145373928,
"grad_norm": 8.469452857971191,
"learning_rate": 7.012690284007577e-05,
"loss": 31.5828,
"step": 16000
},
{
"epoch": 0.4737477145373928,
"eval_loss": 3.8530030250549316,
"eval_runtime": 109.275,
"eval_samples_per_second": 9.892,
"eval_steps_per_second": 2.48,
"step": 16000
},
{
"epoch": 0.47670863775325145,
"grad_norm": 8.871159553527832,
"learning_rate": 6.954734888882281e-05,
"loss": 30.9753,
"step": 16100
},
{
"epoch": 0.4796695609691102,
"grad_norm": 8.81116008758545,
"learning_rate": 6.896687955623357e-05,
"loss": 31.2067,
"step": 16200
},
{
"epoch": 0.4826304841849689,
"grad_norm": 7.77982759475708,
"learning_rate": 6.838555049658432e-05,
"loss": 31.089,
"step": 16300
},
{
"epoch": 0.48559140740082757,
"grad_norm": 8.370245933532715,
"learning_rate": 6.780341744658044e-05,
"loss": 30.9776,
"step": 16400
},
{
"epoch": 0.4885523306166863,
"grad_norm": 8.41613483428955,
"learning_rate": 6.722053622001221e-05,
"loss": 31.1095,
"step": 16500
},
{
"epoch": 0.49151325383254496,
"grad_norm": 7.951696395874023,
"learning_rate": 6.663696270240373e-05,
"loss": 31.1532,
"step": 16600
},
{
"epoch": 0.4944741770484037,
"grad_norm": 9.02717113494873,
"learning_rate": 6.60527528456546e-05,
"loss": 31.0777,
"step": 16700
},
{
"epoch": 0.4974351002642624,
"grad_norm": 8.57259750366211,
"learning_rate": 6.546796266267535e-05,
"loss": 31.3509,
"step": 16800
},
{
"epoch": 0.5003960234801211,
"grad_norm": 9.129491806030273,
"learning_rate": 6.488264822201711e-05,
"loss": 30.7844,
"step": 16900
},
{
"epoch": 0.5033569466959799,
"grad_norm": 8.600064277648926,
"learning_rate": 6.429686564249579e-05,
"loss": 31.1164,
"step": 17000
},
{
"epoch": 0.5033569466959799,
"eval_loss": 3.836409091949463,
"eval_runtime": 109.0903,
"eval_samples_per_second": 9.909,
"eval_steps_per_second": 2.484,
"step": 17000
},
{
"epoch": 0.5063178699118385,
"grad_norm": 8.62096881866455,
"learning_rate": 6.371067108781158e-05,
"loss": 31.1944,
"step": 17100
},
{
"epoch": 0.5092787931276972,
"grad_norm": 8.052851676940918,
"learning_rate": 6.312412076116401e-05,
"loss": 31.0126,
"step": 17200
},
{
"epoch": 0.5122397163435559,
"grad_norm": 8.32268238067627,
"learning_rate": 6.253727089986337e-05,
"loss": 31.0692,
"step": 17300
},
{
"epoch": 0.5152006395594146,
"grad_norm": 8.130902290344238,
"learning_rate": 6.195017776993876e-05,
"loss": 30.9143,
"step": 17400
},
{
"epoch": 0.5181615627752734,
"grad_norm": 9.245232582092285,
"learning_rate": 6.136289766074334e-05,
"loss": 31.0029,
"step": 17500
},
{
"epoch": 0.5211224859911321,
"grad_norm": 8.296626091003418,
"learning_rate": 6.077548687955759e-05,
"loss": 31.0624,
"step": 17600
},
{
"epoch": 0.5240834092069907,
"grad_norm": 8.933104515075684,
"learning_rate": 6.018800174619048e-05,
"loss": 31.0619,
"step": 17700
},
{
"epoch": 0.5270443324228494,
"grad_norm": 7.37945032119751,
"learning_rate": 5.960049858757974e-05,
"loss": 31.3181,
"step": 17800
},
{
"epoch": 0.5300052556387082,
"grad_norm": 8.817550659179688,
"learning_rate": 5.901303373239133e-05,
"loss": 30.8424,
"step": 17900
},
{
"epoch": 0.5329661788545669,
"grad_norm": 7.71854305267334,
"learning_rate": 5.842566350561879e-05,
"loss": 31.0376,
"step": 18000
},
{
"epoch": 0.5329661788545669,
"eval_loss": 3.822613477706909,
"eval_runtime": 112.0979,
"eval_samples_per_second": 9.643,
"eval_steps_per_second": 2.418,
"step": 18000
},
{
"epoch": 0.5359271020704256,
"grad_norm": 8.84870719909668,
"learning_rate": 5.7838444223182826e-05,
"loss": 30.8901,
"step": 18100
},
{
"epoch": 0.5388880252862842,
"grad_norm": 7.48129415512085,
"learning_rate": 5.725143218653187e-05,
"loss": 31.0275,
"step": 18200
},
{
"epoch": 0.5418489485021429,
"grad_norm": 8.218484878540039,
"learning_rate": 5.666468367724412e-05,
"loss": 31.1443,
"step": 18300
},
{
"epoch": 0.5448098717180017,
"grad_norm": 9.589841842651367,
"learning_rate": 5.607825495163119e-05,
"loss": 30.9756,
"step": 18400
},
{
"epoch": 0.5477707949338604,
"grad_norm": 8.583683013916016,
"learning_rate": 5.549220223534451e-05,
"loss": 31.0641,
"step": 18500
},
{
"epoch": 0.5507317181497191,
"grad_norm": 7.978188991546631,
"learning_rate": 5.490658171798439e-05,
"loss": 30.8899,
"step": 18600
},
{
"epoch": 0.5536926413655778,
"grad_norm": 8.130802154541016,
"learning_rate": 5.432144954771287e-05,
"loss": 31.0812,
"step": 18700
},
{
"epoch": 0.5566535645814364,
"grad_norm": 8.981709480285645,
"learning_rate": 5.37368618258701e-05,
"loss": 31.0612,
"step": 18800
},
{
"epoch": 0.5596144877972952,
"grad_norm": 7.87661075592041,
"learning_rate": 5.315287460159561e-05,
"loss": 30.8581,
"step": 18900
},
{
"epoch": 0.5625754110131539,
"grad_norm": 8.329483032226562,
"learning_rate": 5.256954386645438e-05,
"loss": 31.1805,
"step": 19000
},
{
"epoch": 0.5625754110131539,
"eval_loss": 3.8131661415100098,
"eval_runtime": 111.5683,
"eval_samples_per_second": 9.689,
"eval_steps_per_second": 2.429,
"step": 19000
},
{
"epoch": 0.5655363342290126,
"grad_norm": 8.833015441894531,
"learning_rate": 5.198692554906851e-05,
"loss": 30.9231,
"step": 19100
},
{
"epoch": 0.5684972574448713,
"grad_norm": 7.966989994049072,
"learning_rate": 5.1405075509754834e-05,
"loss": 31.0225,
"step": 19200
},
{
"epoch": 0.5714581806607301,
"grad_norm": 8.791169166564941,
"learning_rate": 5.0824049535169166e-05,
"loss": 31.1551,
"step": 19300
},
{
"epoch": 0.5744191038765887,
"grad_norm": 7.9680023193359375,
"learning_rate": 5.024390333295761e-05,
"loss": 31.0498,
"step": 19400
},
{
"epoch": 0.5773800270924474,
"grad_norm": 8.603718757629395,
"learning_rate": 4.966469252641538e-05,
"loss": 30.9017,
"step": 19500
},
{
"epoch": 0.5803409503083061,
"grad_norm": 12.401627540588379,
"learning_rate": 4.908647264915378e-05,
"loss": 30.9988,
"step": 19600
},
{
"epoch": 0.5833018735241648,
"grad_norm": 8.433266639709473,
"learning_rate": 4.8509299139775734e-05,
"loss": 30.9905,
"step": 19700
},
{
"epoch": 0.5862627967400236,
"grad_norm": 7.99282693862915,
"learning_rate": 4.7933227336560414e-05,
"loss": 31.0604,
"step": 19800
},
{
"epoch": 0.5892237199558823,
"grad_norm": 8.011063575744629,
"learning_rate": 4.735831247215753e-05,
"loss": 30.7471,
"step": 19900
},
{
"epoch": 0.5921846431717409,
"grad_norm": 9.603862762451172,
"learning_rate": 4.67846096682918e-05,
"loss": 30.8428,
"step": 20000
},
{
"epoch": 0.5921846431717409,
"eval_loss": 3.8060901165008545,
"eval_runtime": 112.6154,
"eval_samples_per_second": 9.599,
"eval_steps_per_second": 2.406,
"step": 20000
},
{
"epoch": 0.5951455663875996,
"grad_norm": 8.427188873291016,
"learning_rate": 4.6212173930477874e-05,
"loss": 30.8438,
"step": 20100
},
{
"epoch": 0.5981064896034584,
"grad_norm": 7.692320346832275,
"learning_rate": 4.5641060142746556e-05,
"loss": 30.7664,
"step": 20200
},
{
"epoch": 0.6010674128193171,
"grad_norm": 8.596179962158203,
"learning_rate": 4.507132306238262e-05,
"loss": 30.9387,
"step": 20300
},
{
"epoch": 0.6040283360351758,
"grad_norm": 8.076534271240234,
"learning_rate": 4.450301731467488e-05,
"loss": 30.851,
"step": 20400
},
{
"epoch": 0.6069892592510344,
"grad_norm": 9.05728816986084,
"learning_rate": 4.3936197387678665e-05,
"loss": 30.7486,
"step": 20500
},
{
"epoch": 0.6099501824668931,
"grad_norm": 8.477595329284668,
"learning_rate": 4.3370917626991706e-05,
"loss": 30.6843,
"step": 20600
},
{
"epoch": 0.6129111056827519,
"grad_norm": 8.171915054321289,
"learning_rate": 4.2807232230543625e-05,
"loss": 30.9551,
"step": 20700
},
{
"epoch": 0.6158720288986106,
"grad_norm": 8.333806991577148,
"learning_rate": 4.22451952433994e-05,
"loss": 30.8566,
"step": 20800
},
{
"epoch": 0.6188329521144693,
"grad_norm": 7.9477715492248535,
"learning_rate": 4.168486055257777e-05,
"loss": 30.8577,
"step": 20900
},
{
"epoch": 0.621793875330328,
"grad_norm": 8.560218811035156,
"learning_rate": 4.112628188188457e-05,
"loss": 30.7203,
"step": 21000
},
{
"epoch": 0.621793875330328,
"eval_loss": 3.7986109256744385,
"eval_runtime": 109.4771,
"eval_samples_per_second": 9.874,
"eval_steps_per_second": 2.475,
"step": 21000
},
{
"epoch": 0.6247547985461867,
"grad_norm": 8.963776588439941,
"learning_rate": 4.056951278676187e-05,
"loss": 30.9418,
"step": 21100
},
{
"epoch": 0.6277157217620454,
"grad_norm": 8.338837623596191,
"learning_rate": 4.001460664915308e-05,
"loss": 30.756,
"step": 21200
},
{
"epoch": 0.6306766449779041,
"grad_norm": 8.323155403137207,
"learning_rate": 3.946161667238485e-05,
"loss": 30.6959,
"step": 21300
},
{
"epoch": 0.6336375681937628,
"grad_norm": 9.881996154785156,
"learning_rate": 3.8910595876066085e-05,
"loss": 30.9333,
"step": 21400
},
{
"epoch": 0.6365984914096215,
"grad_norm": 8.089996337890625,
"learning_rate": 3.836159709100446e-05,
"loss": 30.6899,
"step": 21500
},
{
"epoch": 0.6395594146254803,
"grad_norm": 7.9427289962768555,
"learning_rate": 3.7814672954141055e-05,
"loss": 30.8046,
"step": 21600
},
{
"epoch": 0.6425203378413389,
"grad_norm": 8.468146324157715,
"learning_rate": 3.7269875903503826e-05,
"loss": 31.2292,
"step": 21700
},
{
"epoch": 0.6454812610571976,
"grad_norm": 8.63842487335205,
"learning_rate": 3.672725817317973e-05,
"loss": 30.7721,
"step": 21800
},
{
"epoch": 0.6484421842730563,
"grad_norm": 8.145241737365723,
"learning_rate": 3.6186871788306674e-05,
"loss": 30.5881,
"step": 21900
},
{
"epoch": 0.6514031074889151,
"grad_norm": 8.194993019104004,
"learning_rate": 3.5648768560085604e-05,
"loss": 30.9425,
"step": 22000
},
{
"epoch": 0.6514031074889151,
"eval_loss": 3.7950870990753174,
"eval_runtime": 109.4264,
"eval_samples_per_second": 9.879,
"eval_steps_per_second": 2.477,
"step": 22000
},
{
"epoch": 0.6543640307047738,
"grad_norm": 9.304323196411133,
"learning_rate": 3.511300008081273e-05,
"loss": 30.722,
"step": 22100
},
{
"epoch": 0.6573249539206325,
"grad_norm": 7.82930850982666,
"learning_rate": 3.4579617718933054e-05,
"loss": 30.7943,
"step": 22200
},
{
"epoch": 0.6602858771364911,
"grad_norm": 7.912548542022705,
"learning_rate": 3.4048672614115294e-05,
"loss": 30.8451,
"step": 22300
},
{
"epoch": 0.6632468003523498,
"grad_norm": 8.46181583404541,
"learning_rate": 3.352021567234869e-05,
"loss": 30.9009,
"step": 22400
},
{
"epoch": 0.6662077235682086,
"grad_norm": 7.727646827697754,
"learning_rate": 3.299429756106215e-05,
"loss": 30.8281,
"step": 22500
},
{
"epoch": 0.6691686467840673,
"grad_norm": 8.119136810302734,
"learning_rate": 3.247096870426649e-05,
"loss": 30.7757,
"step": 22600
},
{
"epoch": 0.672129569999926,
"grad_norm": 8.091607093811035,
"learning_rate": 3.195027927771982e-05,
"loss": 30.8661,
"step": 22700
},
{
"epoch": 0.6750904932157847,
"grad_norm": 7.598474979400635,
"learning_rate": 3.1432279204116776e-05,
"loss": 30.6257,
"step": 22800
},
{
"epoch": 0.6780514164316434,
"grad_norm": 9.547100067138672,
"learning_rate": 3.091701814830198e-05,
"loss": 30.8582,
"step": 22900
},
{
"epoch": 0.6810123396475021,
"grad_norm": 7.637078762054443,
"learning_rate": 3.0404545512508415e-05,
"loss": 30.9432,
"step": 23000
},
{
"epoch": 0.6810123396475021,
"eval_loss": 3.791748285293579,
"eval_runtime": 109.2867,
"eval_samples_per_second": 9.891,
"eval_steps_per_second": 2.48,
"step": 23000
},
{
"epoch": 0.6839732628633608,
"grad_norm": 8.485209465026855,
"learning_rate": 2.98949104316207e-05,
"loss": 30.921,
"step": 23100
},
{
"epoch": 0.6869341860792195,
"grad_norm": 7.777042865753174,
"learning_rate": 2.938816176846421e-05,
"loss": 30.8116,
"step": 23200
},
{
"epoch": 0.6898951092950782,
"grad_norm": 7.6587138175964355,
"learning_rate": 2.8884348109120106e-05,
"loss": 30.7965,
"step": 23300
},
{
"epoch": 0.6928560325109369,
"grad_norm": 8.276775360107422,
"learning_rate": 2.8383517758267178e-05,
"loss": 30.6582,
"step": 23400
},
{
"epoch": 0.6958169557267956,
"grad_norm": 7.5494771003723145,
"learning_rate": 2.7885718734550257e-05,
"loss": 30.6483,
"step": 23500
},
{
"epoch": 0.6987778789426543,
"grad_norm": 7.938130855560303,
"learning_rate": 2.739099876597646e-05,
"loss": 30.529,
"step": 23600
},
{
"epoch": 0.701738802158513,
"grad_norm": 8.202885627746582,
"learning_rate": 2.6899405285339026e-05,
"loss": 30.825,
"step": 23700
},
{
"epoch": 0.7046997253743718,
"grad_norm": 8.393240928649902,
"learning_rate": 2.6410985425669622e-05,
"loss": 30.7867,
"step": 23800
},
{
"epoch": 0.7076606485902305,
"grad_norm": 8.32459831237793,
"learning_rate": 2.5925786015719207e-05,
"loss": 30.7898,
"step": 23900
},
{
"epoch": 0.7106215718060891,
"grad_norm": 9.323598861694336,
"learning_rate": 2.544385357546831e-05,
"loss": 30.5684,
"step": 24000
},
{
"epoch": 0.7106215718060891,
"eval_loss": 3.789947271347046,
"eval_runtime": 110.7686,
"eval_samples_per_second": 9.759,
"eval_steps_per_second": 2.447,
"step": 24000
},
{
"epoch": 0.7135824950219478,
"grad_norm": 8.184738159179688,
"learning_rate": 2.4965234311666717e-05,
"loss": 30.7187,
"step": 24100
},
{
"epoch": 0.7165434182378065,
"grad_norm": 7.82784366607666,
"learning_rate": 2.4489974113403275e-05,
"loss": 30.705,
"step": 24200
},
{
"epoch": 0.7195043414536653,
"grad_norm": 7.945186614990234,
"learning_rate": 2.4018118547706078e-05,
"loss": 30.4846,
"step": 24300
},
{
"epoch": 0.722465264669524,
"grad_norm": 9.277371406555176,
"learning_rate": 2.3549712855173688e-05,
"loss": 30.6765,
"step": 24400
},
{
"epoch": 0.7254261878853827,
"grad_norm": 8.619938850402832,
"learning_rate": 2.3084801945637512e-05,
"loss": 30.6503,
"step": 24500
},
{
"epoch": 0.7283871111012413,
"grad_norm": 8.467925071716309,
"learning_rate": 2.262343039385585e-05,
"loss": 30.957,
"step": 24600
},
{
"epoch": 0.7313480343171,
"grad_norm": 8.035057067871094,
"learning_rate": 2.216564243524035e-05,
"loss": 30.6764,
"step": 24700
},
{
"epoch": 0.7343089575329588,
"grad_norm": 7.555221080780029,
"learning_rate": 2.1711481961614565e-05,
"loss": 30.7666,
"step": 24800
},
{
"epoch": 0.7372698807488175,
"grad_norm": 7.959348201751709,
"learning_rate": 2.1260992517005892e-05,
"loss": 30.8212,
"step": 24900
},
{
"epoch": 0.7402308039646762,
"grad_norm": 7.882981300354004,
"learning_rate": 2.0814217293470476e-05,
"loss": 30.8312,
"step": 25000
},
{
"epoch": 0.7402308039646762,
"eval_loss": 3.7874350547790527,
"eval_runtime": 107.8316,
"eval_samples_per_second": 10.025,
"eval_steps_per_second": 2.513,
"step": 25000
},
{
"epoch": 0.743191727180535,
"grad_norm": 7.499105930328369,
"learning_rate": 2.0371199126952268e-05,
"loss": 30.9958,
"step": 25100
},
{
"epoch": 0.7461526503963936,
"grad_norm": 7.973631381988525,
"learning_rate": 1.9931980493175735e-05,
"loss": 30.6469,
"step": 25200
},
{
"epoch": 0.7491135736122523,
"grad_norm": 7.996872425079346,
"learning_rate": 1.949660350357356e-05,
"loss": 30.6363,
"step": 25300
},
{
"epoch": 0.752074496828111,
"grad_norm": 8.139349937438965,
"learning_rate": 1.9065109901249e-05,
"loss": 30.924,
"step": 25400
},
{
"epoch": 0.7550354200439697,
"grad_norm": 8.981887817382812,
"learning_rate": 1.863754105697369e-05,
"loss": 30.9555,
"step": 25500
},
{
"epoch": 0.7579963432598285,
"grad_norm": 7.660996913909912,
"learning_rate": 1.821393796522096e-05,
"loss": 30.8007,
"step": 25600
},
{
"epoch": 0.7609572664756871,
"grad_norm": 7.750844955444336,
"learning_rate": 1.7794341240235615e-05,
"loss": 30.7227,
"step": 25700
},
{
"epoch": 0.7639181896915458,
"grad_norm": 7.581575870513916,
"learning_rate": 1.737879111213961e-05,
"loss": 30.6509,
"step": 25800
},
{
"epoch": 0.7668791129074045,
"grad_norm": 8.771635055541992,
"learning_rate": 1.6967327423075142e-05,
"loss": 30.7893,
"step": 25900
},
{
"epoch": 0.7698400361232632,
"grad_norm": 8.594512939453125,
"learning_rate": 1.6559989623384456e-05,
"loss": 30.6874,
"step": 26000
},
{
"epoch": 0.7698400361232632,
"eval_loss": 3.7861363887786865,
"eval_runtime": 112.2096,
"eval_samples_per_second": 9.634,
"eval_steps_per_second": 2.415,
"step": 26000
},
{
"epoch": 0.772800959339122,
"grad_norm": 7.919267177581787,
"learning_rate": 1.615681676782755e-05,
"loss": 30.7685,
"step": 26100
},
{
"epoch": 0.7757618825549807,
"grad_norm": 7.744143009185791,
"learning_rate": 1.5757847511837648e-05,
"loss": 30.7558,
"step": 26200
},
{
"epoch": 0.7787228057708393,
"grad_norm": 7.894962787628174,
"learning_rate": 1.5363120107814955e-05,
"loss": 30.7543,
"step": 26300
},
{
"epoch": 0.781683728986698,
"grad_norm": 9.573600769042969,
"learning_rate": 1.4972672401459143e-05,
"loss": 30.808,
"step": 26400
},
{
"epoch": 0.7846446522025567,
"grad_norm": 7.708218574523926,
"learning_rate": 1.4586541828140706e-05,
"loss": 30.6115,
"step": 26500
},
{
"epoch": 0.7876055754184155,
"grad_norm": 8.170422554016113,
"learning_rate": 1.4204765409311852e-05,
"loss": 30.8811,
"step": 26600
},
{
"epoch": 0.7905664986342742,
"grad_norm": 8.293937683105469,
"learning_rate": 1.3827379748956783e-05,
"loss": 30.8484,
"step": 26700
},
{
"epoch": 0.7935274218501329,
"grad_norm": 7.64206075668335,
"learning_rate": 1.3454421030082402e-05,
"loss": 30.7768,
"step": 26800
},
{
"epoch": 0.7964883450659915,
"grad_norm": 7.780085563659668,
"learning_rate": 1.3085925011248902e-05,
"loss": 30.6903,
"step": 26900
},
{
"epoch": 0.7994492682818503,
"grad_norm": 7.651244640350342,
"learning_rate": 1.2721927023141509e-05,
"loss": 30.8888,
"step": 27000
},
{
"epoch": 0.7994492682818503,
"eval_loss": 3.7866110801696777,
"eval_runtime": 111.3993,
"eval_samples_per_second": 9.704,
"eval_steps_per_second": 2.433,
"step": 27000
},
{
"epoch": 0.802410191497709,
"grad_norm": 7.893172740936279,
"learning_rate": 1.2362461965182951e-05,
"loss": 30.8551,
"step": 27100
},
{
"epoch": 0.8053711147135677,
"grad_norm": 8.348461151123047,
"learning_rate": 1.2007564302187395e-05,
"loss": 30.9086,
"step": 27200
},
{
"epoch": 0.8083320379294264,
"grad_norm": 8.005925178527832,
"learning_rate": 1.1657268061055954e-05,
"loss": 30.6258,
"step": 27300
},
{
"epoch": 0.8112929611452852,
"grad_norm": 7.919161319732666,
"learning_rate": 1.1311606827514432e-05,
"loss": 30.4614,
"step": 27400
},
{
"epoch": 0.8142538843611438,
"grad_norm": 8.806751251220703,
"learning_rate": 1.0970613742892959e-05,
"loss": 30.9882,
"step": 27500
},
{
"epoch": 0.8172148075770025,
"grad_norm": 8.126434326171875,
"learning_rate": 1.0634321500948665e-05,
"loss": 30.6459,
"step": 27600
},
{
"epoch": 0.8201757307928612,
"grad_norm": 7.643808364868164,
"learning_rate": 1.0302762344730893e-05,
"loss": 30.6614,
"step": 27700
},
{
"epoch": 0.8231366540087199,
"grad_norm": 8.046734809875488,
"learning_rate": 9.97596806349001e-06,
"loss": 30.6958,
"step": 27800
},
{
"epoch": 0.8260975772245787,
"grad_norm": 8.094582557678223,
"learning_rate": 9.653969989629268e-06,
"loss": 30.5807,
"step": 27900
},
{
"epoch": 0.8290585004404373,
"grad_norm": 8.062453269958496,
"learning_rate": 9.336798995700899e-06,
"loss": 30.8323,
"step": 28000
},
{
"epoch": 0.8290585004404373,
"eval_loss": 3.786661386489868,
"eval_runtime": 106.9454,
"eval_samples_per_second": 10.108,
"eval_steps_per_second": 2.534,
"step": 28000
},
{
"epoch": 0.832019423656296,
"grad_norm": 7.68911075592041,
"learning_rate": 9.024485491446045e-06,
"loss": 30.9853,
"step": 28100
},
{
"epoch": 0.8349803468721547,
"grad_norm": 7.82414436340332,
"learning_rate": 8.717059420879143e-06,
"loss": 30.5061,
"step": 28200
},
{
"epoch": 0.8379412700880134,
"grad_norm": 7.392062664031982,
"learning_rate": 8.414550259416917e-06,
"loss": 30.9525,
"step": 28300
},
{
"epoch": 0.8409021933038722,
"grad_norm": 7.675992965698242,
"learning_rate": 8.116987011052387e-06,
"loss": 30.8296,
"step": 28400
},
{
"epoch": 0.8438631165197309,
"grad_norm": 8.038030624389648,
"learning_rate": 7.824398205574006e-06,
"loss": 30.8155,
"step": 28500
},
{
"epoch": 0.8468240397355895,
"grad_norm": 7.427101135253906,
"learning_rate": 7.536811895830222e-06,
"loss": 30.9259,
"step": 28600
},
{
"epoch": 0.8497849629514482,
"grad_norm": 8.095186233520508,
"learning_rate": 7.254255655039919e-06,
"loss": 30.824,
"step": 28700
},
{
"epoch": 0.852745886167307,
"grad_norm": 7.521733283996582,
"learning_rate": 6.9767565741486815e-06,
"loss": 30.7226,
"step": 28800
},
{
"epoch": 0.8557068093831657,
"grad_norm": 7.494954586029053,
"learning_rate": 6.704341259231415e-06,
"loss": 30.7789,
"step": 28900
},
{
"epoch": 0.8586677325990244,
"grad_norm": 7.641082763671875,
"learning_rate": 6.437035828941324e-06,
"loss": 30.8001,
"step": 29000
},
{
"epoch": 0.8586677325990244,
"eval_loss": 3.786005973815918,
"eval_runtime": 111.1521,
"eval_samples_per_second": 9.725,
"eval_steps_per_second": 2.438,
"step": 29000
},
{
"epoch": 0.8616286558148831,
"grad_norm": 7.96475887298584,
"learning_rate": 6.1748659120058386e-06,
"loss": 30.8879,
"step": 29100
},
{
"epoch": 0.8645895790307417,
"grad_norm": 6.990954875946045,
"learning_rate": 5.917856644769242e-06,
"loss": 30.6077,
"step": 29200
},
{
"epoch": 0.8675505022466005,
"grad_norm": 7.170067310333252,
"learning_rate": 5.666032668782735e-06,
"loss": 30.8456,
"step": 29300
},
{
"epoch": 0.8705114254624592,
"grad_norm": 8.4426851272583,
"learning_rate": 5.419418128441846e-06,
"loss": 30.9228,
"step": 29400
},
{
"epoch": 0.8734723486783179,
"grad_norm": 8.034204483032227,
"learning_rate": 5.178036668671475e-06,
"loss": 30.7785,
"step": 29500
},
{
"epoch": 0.8764332718941766,
"grad_norm": 7.411805629730225,
"learning_rate": 4.941911432658868e-06,
"loss": 30.7495,
"step": 29600
},
{
"epoch": 0.8793941951100354,
"grad_norm": 7.887239456176758,
"learning_rate": 4.7110650596347335e-06,
"loss": 30.7797,
"step": 29700
},
{
"epoch": 0.882355118325894,
"grad_norm": 8.600279808044434,
"learning_rate": 4.48551968270261e-06,
"loss": 30.8267,
"step": 29800
},
{
"epoch": 0.8853160415417527,
"grad_norm": 8.055954933166504,
"learning_rate": 4.26529692671679e-06,
"loss": 30.8123,
"step": 29900
},
{
"epoch": 0.8882769647576114,
"grad_norm": 7.540750503540039,
"learning_rate": 4.050417906208945e-06,
"loss": 30.8866,
"step": 30000
},
{
"epoch": 0.8882769647576114,
"eval_loss": 3.7849574089050293,
"eval_runtime": 108.0072,
"eval_samples_per_second": 10.009,
"eval_steps_per_second": 2.509,
"step": 30000
},
{
"epoch": 0.8912378879734701,
"grad_norm": 7.607705593109131,
"learning_rate": 3.840903223363752e-06,
"loss": 30.7932,
"step": 30100
},
{
"epoch": 0.8941988111893289,
"grad_norm": 7.834300518035889,
"learning_rate": 3.636772966043571e-06,
"loss": 30.6935,
"step": 30200
},
{
"epoch": 0.8971597344051876,
"grad_norm": 9.865922927856445,
"learning_rate": 3.4380467058624585e-06,
"loss": 30.5129,
"step": 30300
},
{
"epoch": 0.9001206576210462,
"grad_norm": 7.9707865715026855,
"learning_rate": 3.244743496309701e-06,
"loss": 30.8035,
"step": 30400
},
{
"epoch": 0.9030815808369049,
"grad_norm": 8.035768508911133,
"learning_rate": 3.0568818709229364e-06,
"loss": 30.4973,
"step": 30500
},
{
"epoch": 0.9060425040527637,
"grad_norm": 8.816192626953125,
"learning_rate": 2.8744798415113015e-06,
"loss": 30.5553,
"step": 30600
},
{
"epoch": 0.9090034272686224,
"grad_norm": 7.411801338195801,
"learning_rate": 2.6975548964283823e-06,
"loss": 30.6758,
"step": 30700
},
{
"epoch": 0.9119643504844811,
"grad_norm": 7.46308708190918,
"learning_rate": 2.5261239988955733e-06,
"loss": 30.8337,
"step": 30800
},
{
"epoch": 0.9149252737003397,
"grad_norm": 8.57913875579834,
"learning_rate": 2.360203585375571e-06,
"loss": 31.0671,
"step": 30900
},
{
"epoch": 0.9178861969161984,
"grad_norm": 7.983087062835693,
"learning_rate": 2.1998095639965577e-06,
"loss": 30.913,
"step": 31000
},
{
"epoch": 0.9178861969161984,
"eval_loss": 3.785719394683838,
"eval_runtime": 110.9703,
"eval_samples_per_second": 9.741,
"eval_steps_per_second": 2.442,
"step": 31000
},
{
"epoch": 0.9208471201320572,
"grad_norm": 8.11637020111084,
"learning_rate": 2.044957313026925e-06,
"loss": 30.7294,
"step": 31100
},
{
"epoch": 0.9238080433479159,
"grad_norm": 7.882040977478027,
"learning_rate": 1.895661679400842e-06,
"loss": 30.7816,
"step": 31200
},
{
"epoch": 0.9267689665637746,
"grad_norm": 7.475772857666016,
"learning_rate": 1.7519369772947525e-06,
"loss": 30.5198,
"step": 31300
},
{
"epoch": 0.9297298897796333,
"grad_norm": 8.094454765319824,
"learning_rate": 1.6137969867549674e-06,
"loss": 30.8313,
"step": 31400
},
{
"epoch": 0.932690812995492,
"grad_norm": 8.635899543762207,
"learning_rate": 1.4812549523764674e-06,
"loss": 30.6539,
"step": 31500
},
{
"epoch": 0.9356517362113507,
"grad_norm": 7.975414752960205,
"learning_rate": 1.354323582033039e-06,
"loss": 30.5804,
"step": 31600
},
{
"epoch": 0.9386126594272094,
"grad_norm": 7.660233020782471,
"learning_rate": 1.233015045658823e-06,
"loss": 30.6357,
"step": 31700
},
{
"epoch": 0.9415735826430681,
"grad_norm": 8.09595012664795,
"learning_rate": 1.1173409740815532e-06,
"loss": 30.7201,
"step": 31800
},
{
"epoch": 0.9445345058589268,
"grad_norm": 8.44491958618164,
"learning_rate": 1.0073124579073701e-06,
"loss": 30.7462,
"step": 31900
},
{
"epoch": 0.9474954290747856,
"grad_norm": 8.275026321411133,
"learning_rate": 9.0294004645749e-07,
"loss": 30.7256,
"step": 32000
},
{
"epoch": 0.9474954290747856,
"eval_loss": 3.7850279808044434,
"eval_runtime": 109.0824,
"eval_samples_per_second": 9.91,
"eval_steps_per_second": 2.484,
"step": 32000
},
{
"epoch": 0.9504563522906442,
"grad_norm": 7.571169376373291,
"learning_rate": 8.042337467567484e-07,
"loss": 30.7194,
"step": 32100
},
{
"epoch": 0.9534172755065029,
"grad_norm": 8.020681381225586,
"learning_rate": 7.112030225741472e-07,
"loss": 30.5828,
"step": 32200
},
{
"epoch": 0.9563781987223616,
"grad_norm": 7.482342720031738,
"learning_rate": 6.238567935155004e-07,
"loss": 30.5888,
"step": 32300
},
{
"epoch": 0.9593391219382204,
"grad_norm": 8.336071014404297,
"learning_rate": 5.422034341682314e-07,
"loss": 30.858,
"step": 32400
},
{
"epoch": 0.9623000451540791,
"grad_norm": 7.819650173187256,
"learning_rate": 4.6625077329842224e-07,
"loss": 30.6983,
"step": 32500
},
{
"epoch": 0.9652609683699378,
"grad_norm": 8.101078987121582,
"learning_rate": 3.960060931002141e-07,
"loss": 30.7803,
"step": 32600
},
{
"epoch": 0.9682218915857964,
"grad_norm": 9.275129318237305,
"learning_rate": 3.3147612849762533e-07,
"loss": 30.8961,
"step": 32700
},
{
"epoch": 0.9711828148016551,
"grad_norm": 8.00763988494873,
"learning_rate": 2.7266706649877516e-07,
"loss": 30.9344,
"step": 32800
},
{
"epoch": 0.9741437380175139,
"grad_norm": 8.840792655944824,
"learning_rate": 2.1958454560274455e-07,
"loss": 30.7027,
"step": 32900
},
{
"epoch": 0.9771046612333726,
"grad_norm": 8.015409469604492,
"learning_rate": 1.722336552589021e-07,
"loss": 30.7569,
"step": 33000
},
{
"epoch": 0.9771046612333726,
"eval_loss": 3.785550117492676,
"eval_runtime": 109.3256,
"eval_samples_per_second": 9.888,
"eval_steps_per_second": 2.479,
"step": 33000
},
{
"epoch": 0.9800655844492313,
"grad_norm": 8.226040840148926,
"learning_rate": 1.3061893537898773e-07,
"loss": 30.6858,
"step": 33100
},
{
"epoch": 0.9830265076650899,
"grad_norm": 7.274777889251709,
"learning_rate": 9.474437590182072e-08,
"loss": 30.701,
"step": 33200
},
{
"epoch": 0.9859874308809486,
"grad_norm": 7.866406440734863,
"learning_rate": 6.46134164107326e-08,
"loss": 30.6392,
"step": 33300
},
{
"epoch": 0.9889483540968074,
"grad_norm": 7.722043514251709,
"learning_rate": 4.022894580381742e-08,
"loss": 30.8502,
"step": 33400
},
{
"epoch": 0.9919092773126661,
"grad_norm": 7.612312316894531,
"learning_rate": 2.1593302016933437e-08,
"loss": 30.7914,
"step": 33500
},
{
"epoch": 0.9948702005285248,
"grad_norm": 8.195243835449219,
"learning_rate": 8.708271799542367e-09,
"loss": 30.8885,
"step": 33600
},
{
"epoch": 0.9978311237443835,
"grad_norm": 8.127638816833496,
"learning_rate": 1.5750905434130935e-09,
"loss": 30.9894,
"step": 33700
}
],
"logging_steps": 100,
"max_steps": 33773,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.598282561239384e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
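
The file above follows the `trainer_state.json` format written by the Hugging Face `transformers` Trainer: `log_history` interleaves training logs (entries with `loss`, `grad_norm`, `learning_rate`) and evaluation logs (entries with `eval_loss`, `eval_runtime`). A minimal sketch for inspecting it offline is shown below; it assumes the file has been downloaded locally as `trainer_state.json` and that matplotlib is installed (neither is part of the checkpoint itself). Note that in this run the logged training loss and `eval_loss` are reported on very different scales.

```python
# Minimal sketch (not part of the repository): load trainer_state.json and
# plot the logged training loss and eval_loss against the global step.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
# Both carry the global "step" at which they were logged.
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

fig, (ax_train, ax_eval) = plt.subplots(2, 1, sharex=True)
ax_train.plot(*zip(*train))
ax_train.set_ylabel("train loss")
ax_eval.plot(*zip(*evals))
ax_eval.set_ylabel("eval loss")
ax_eval.set_xlabel("global step")
plt.show()
```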