{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999925976919604, "eval_steps": 1000, "global_step": 33773, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002960923215858705, "grad_norm": 41.32149887084961, "learning_rate": 1.98e-06, "loss": 87.7468, "step": 100 }, { "epoch": 0.00592184643171741, "grad_norm": 39.51910400390625, "learning_rate": 3.98e-06, "loss": 85.766, "step": 200 }, { "epoch": 0.008882769647576115, "grad_norm": 26.557823181152344, "learning_rate": 5.98e-06, "loss": 80.1376, "step": 300 }, { "epoch": 0.01184369286343482, "grad_norm": 21.80652618408203, "learning_rate": 7.98e-06, "loss": 74.3306, "step": 400 }, { "epoch": 0.014804616079293524, "grad_norm": 16.2612247467041, "learning_rate": 9.980000000000001e-06, "loss": 72.3247, "step": 500 }, { "epoch": 0.01776553929515223, "grad_norm": 17.281190872192383, "learning_rate": 1.198e-05, "loss": 71.0703, "step": 600 }, { "epoch": 0.020726462511010933, "grad_norm": 16.140579223632812, "learning_rate": 1.3980000000000002e-05, "loss": 69.5824, "step": 700 }, { "epoch": 0.02368738572686964, "grad_norm": 13.456184387207031, "learning_rate": 1.598e-05, "loss": 67.2135, "step": 800 }, { "epoch": 0.026648308942728342, "grad_norm": 11.778711318969727, "learning_rate": 1.798e-05, "loss": 65.1397, "step": 900 }, { "epoch": 0.02960923215858705, "grad_norm": 11.965922355651855, "learning_rate": 1.9980000000000002e-05, "loss": 63.204, "step": 1000 }, { "epoch": 0.02960923215858705, "eval_loss": 7.691287040710449, "eval_runtime": 37.9485, "eval_samples_per_second": 28.486, "eval_steps_per_second": 7.141, "step": 1000 }, { "epoch": 0.032570155374445756, "grad_norm": 8.482980728149414, "learning_rate": 2.198e-05, "loss": 60.9402, "step": 1100 }, { "epoch": 0.03553107859030446, "grad_norm": 49.948341369628906, "learning_rate": 2.398e-05, "loss": 58.9346, "step": 1200 }, { "epoch": 0.03849200180616316, "grad_norm": 10.039616584777832, "learning_rate": 2.5980000000000002e-05, "loss": 57.309, "step": 1300 }, { "epoch": 0.041452925022021865, "grad_norm": 8.930785179138184, "learning_rate": 2.798e-05, "loss": 56.3741, "step": 1400 }, { "epoch": 0.04441384823788057, "grad_norm": 8.14844036102295, "learning_rate": 2.998e-05, "loss": 55.8969, "step": 1500 }, { "epoch": 0.04737477145373928, "grad_norm": 16.170246124267578, "learning_rate": 3.198e-05, "loss": 55.12, "step": 1600 }, { "epoch": 0.05033569466959798, "grad_norm": 10.525145530700684, "learning_rate": 3.398e-05, "loss": 54.5077, "step": 1700 }, { "epoch": 0.053296617885456685, "grad_norm": 29.427160263061523, "learning_rate": 3.5980000000000004e-05, "loss": 54.2087, "step": 1800 }, { "epoch": 0.05625754110131539, "grad_norm": 47.125083923339844, "learning_rate": 3.798e-05, "loss": 53.4388, "step": 1900 }, { "epoch": 0.0592184643171741, "grad_norm": 10.07633113861084, "learning_rate": 3.998e-05, "loss": 53.5798, "step": 2000 }, { "epoch": 0.0592184643171741, "eval_loss": 6.5744218826293945, "eval_runtime": 38.8407, "eval_samples_per_second": 27.832, "eval_steps_per_second": 6.977, "step": 2000 }, { "epoch": 0.0621793875330328, "grad_norm": 10.918025970458984, "learning_rate": 4.198e-05, "loss": 52.5816, "step": 2100 }, { "epoch": 0.06514031074889151, "grad_norm": 12.68106460571289, "learning_rate": 4.398e-05, "loss": 52.6681, "step": 2200 }, { "epoch": 0.06810123396475021, "grad_norm": 15.503605842590332, "learning_rate": 4.598e-05, "loss": 52.5443, "step": 2300 }, { "epoch": 0.07106215718060892, "grad_norm": 10.995290756225586, "learning_rate": 4.798e-05, "loss": 51.4841, "step": 2400 }, { "epoch": 0.07402308039646761, "grad_norm": 22.71038055419922, "learning_rate": 4.998e-05, "loss": 51.05, "step": 2500 }, { "epoch": 0.07698400361232632, "grad_norm": 11.035310745239258, "learning_rate": 5.198e-05, "loss": 50.8273, "step": 2600 }, { "epoch": 0.07994492682818503, "grad_norm": 14.107246398925781, "learning_rate": 5.398e-05, "loss": 50.9043, "step": 2700 }, { "epoch": 0.08290585004404373, "grad_norm": 11.570377349853516, "learning_rate": 5.598e-05, "loss": 50.4862, "step": 2800 }, { "epoch": 0.08586677325990244, "grad_norm": 16.12681770324707, "learning_rate": 5.7980000000000004e-05, "loss": 50.0248, "step": 2900 }, { "epoch": 0.08882769647576114, "grad_norm": 14.925129890441895, "learning_rate": 5.9980000000000005e-05, "loss": 49.789, "step": 3000 }, { "epoch": 0.08882769647576114, "eval_loss": 6.141844272613525, "eval_runtime": 38.2773, "eval_samples_per_second": 28.241, "eval_steps_per_second": 7.08, "step": 3000 }, { "epoch": 0.09178861969161985, "grad_norm": 16.748519897460938, "learning_rate": 6.198e-05, "loss": 49.4085, "step": 3100 }, { "epoch": 0.09474954290747856, "grad_norm": 12.9891939163208, "learning_rate": 6.398000000000001e-05, "loss": 49.1003, "step": 3200 }, { "epoch": 0.09771046612333725, "grad_norm": 11.456365585327148, "learning_rate": 6.598e-05, "loss": 48.6685, "step": 3300 }, { "epoch": 0.10067138933919596, "grad_norm": 19.153154373168945, "learning_rate": 6.798e-05, "loss": 48.3342, "step": 3400 }, { "epoch": 0.10363231255505466, "grad_norm": 13.501580238342285, "learning_rate": 6.998e-05, "loss": 47.7641, "step": 3500 }, { "epoch": 0.10659323577091337, "grad_norm": 14.034686088562012, "learning_rate": 7.198e-05, "loss": 47.8053, "step": 3600 }, { "epoch": 0.10955415898677208, "grad_norm": 14.023058891296387, "learning_rate": 7.398e-05, "loss": 47.4925, "step": 3700 }, { "epoch": 0.11251508220263078, "grad_norm": 16.410221099853516, "learning_rate": 7.598e-05, "loss": 47.1501, "step": 3800 }, { "epoch": 0.11547600541848949, "grad_norm": 17.547571182250977, "learning_rate": 7.798000000000001e-05, "loss": 47.3132, "step": 3900 }, { "epoch": 0.1184369286343482, "grad_norm": 11.753161430358887, "learning_rate": 7.998e-05, "loss": 47.071, "step": 4000 }, { "epoch": 0.1184369286343482, "eval_loss": 5.719655990600586, "eval_runtime": 38.3575, "eval_samples_per_second": 28.182, "eval_steps_per_second": 7.065, "step": 4000 }, { "epoch": 0.12139785185020689, "grad_norm": 51.37761306762695, "learning_rate": 8.198000000000001e-05, "loss": 46.1728, "step": 4100 }, { "epoch": 0.1243587750660656, "grad_norm": 12.61581802368164, "learning_rate": 8.398e-05, "loss": 45.9941, "step": 4200 }, { "epoch": 0.1273196982819243, "grad_norm": 12.667135238647461, "learning_rate": 8.598e-05, "loss": 46.1649, "step": 4300 }, { "epoch": 0.13028062149778302, "grad_norm": 10.368157386779785, "learning_rate": 8.798e-05, "loss": 45.9172, "step": 4400 }, { "epoch": 0.13324154471364172, "grad_norm": 12.5702543258667, "learning_rate": 8.998e-05, "loss": 45.5541, "step": 4500 }, { "epoch": 0.13620246792950041, "grad_norm": 10.535380363464355, "learning_rate": 9.198e-05, "loss": 44.45, "step": 4600 }, { "epoch": 0.1391633911453591, "grad_norm": 11.035446166992188, "learning_rate": 9.398e-05, "loss": 44.3243, "step": 4700 }, { "epoch": 0.14212431436121784, "grad_norm": 10.718255043029785, "learning_rate": 9.598e-05, "loss": 43.8677, "step": 4800 }, { "epoch": 0.14508523757707653, "grad_norm": 9.816108703613281, "learning_rate": 9.798000000000001e-05, "loss": 43.6948, "step": 4900 }, { "epoch": 0.14804616079293523, "grad_norm": 9.837696075439453, "learning_rate": 9.998000000000002e-05, "loss": 43.6361, "step": 5000 }, { "epoch": 0.14804616079293523, "eval_loss": 5.329010486602783, "eval_runtime": 38.0693, "eval_samples_per_second": 28.396, "eval_steps_per_second": 7.119, "step": 5000 }, { "epoch": 0.15100708400879395, "grad_norm": 10.140490531921387, "learning_rate": 0.00010198, "loss": 42.8106, "step": 5100 }, { "epoch": 0.15396800722465265, "grad_norm": 9.330647468566895, "learning_rate": 0.00010398, "loss": 42.4439, "step": 5200 }, { "epoch": 0.15692893044051134, "grad_norm": 9.038117408752441, "learning_rate": 0.00010598, "loss": 41.8943, "step": 5300 }, { "epoch": 0.15988985365637007, "grad_norm": 10.28738021850586, "learning_rate": 0.00010798, "loss": 41.5117, "step": 5400 }, { "epoch": 0.16285077687222876, "grad_norm": 9.869328498840332, "learning_rate": 0.00010998, "loss": 41.3489, "step": 5500 }, { "epoch": 0.16581170008808746, "grad_norm": 10.985088348388672, "learning_rate": 0.00011198000000000001, "loss": 40.7585, "step": 5600 }, { "epoch": 0.16877262330394618, "grad_norm": 11.46516227722168, "learning_rate": 0.00011398, "loss": 40.2893, "step": 5700 }, { "epoch": 0.17173354651980488, "grad_norm": 9.691688537597656, "learning_rate": 0.00011598000000000001, "loss": 40.0513, "step": 5800 }, { "epoch": 0.17469446973566358, "grad_norm": 10.819178581237793, "learning_rate": 0.00011798, "loss": 39.986, "step": 5900 }, { "epoch": 0.17765539295152227, "grad_norm": 9.694029808044434, "learning_rate": 0.00011998, "loss": 39.3918, "step": 6000 }, { "epoch": 0.17765539295152227, "eval_loss": 4.900777339935303, "eval_runtime": 38.052, "eval_samples_per_second": 28.409, "eval_steps_per_second": 7.122, "step": 6000 }, { "epoch": 0.180616316167381, "grad_norm": 9.988055229187012, "learning_rate": 0.00011999900481764066, "loss": 39.336, "step": 6100 }, { "epoch": 0.1835772393832397, "grad_norm": 13.242379188537598, "learning_rate": 0.00011999597899343296, "loss": 39.0612, "step": 6200 }, { "epoch": 0.1865381625990984, "grad_norm": 13.935639381408691, "learning_rate": 0.00011999092252825071, "loss": 38.585, "step": 6300 }, { "epoch": 0.1894990858149571, "grad_norm": 9.50368881225586, "learning_rate": 0.00011998383559323646, "loss": 38.3112, "step": 6400 }, { "epoch": 0.1924600090308158, "grad_norm": 10.878887176513672, "learning_rate": 0.00011997471842825661, "loss": 38.3219, "step": 6500 }, { "epoch": 0.1954209322466745, "grad_norm": 10.517402648925781, "learning_rate": 0.00011996357134189334, "loss": 37.8246, "step": 6600 }, { "epoch": 0.19838185546253323, "grad_norm": 10.922290802001953, "learning_rate": 0.0001199503947114341, "loss": 37.6387, "step": 6700 }, { "epoch": 0.20134277867839193, "grad_norm": 11.845630645751953, "learning_rate": 0.00011993518898285887, "loss": 37.8343, "step": 6800 }, { "epoch": 0.20430370189425062, "grad_norm": 8.628484725952148, "learning_rate": 0.00011991795467082508, "loss": 37.5011, "step": 6900 }, { "epoch": 0.20726462511010932, "grad_norm": 9.489052772521973, "learning_rate": 0.00011989869235865012, "loss": 37.132, "step": 7000 }, { "epoch": 0.20726462511010932, "eval_loss": 4.595886707305908, "eval_runtime": 38.0814, "eval_samples_per_second": 28.387, "eval_steps_per_second": 7.116, "step": 7000 }, { "epoch": 0.21022554832596804, "grad_norm": 9.687568664550781, "learning_rate": 0.00011987740269829175, "loss": 36.9362, "step": 7100 }, { "epoch": 0.21318647154182674, "grad_norm": 8.676931381225586, "learning_rate": 0.0001198540864103258, "loss": 37.0267, "step": 7200 }, { "epoch": 0.21614739475768543, "grad_norm": 9.232645988464355, "learning_rate": 0.00011982874428392204, "loss": 36.5181, "step": 7300 }, { "epoch": 0.21910831797354416, "grad_norm": 8.917469024658203, "learning_rate": 0.00011980137717681727, "loss": 36.5812, "step": 7400 }, { "epoch": 0.22206924118940286, "grad_norm": 8.593257904052734, "learning_rate": 0.0001197719860152864, "loss": 36.0672, "step": 7500 }, { "epoch": 0.22503016440526155, "grad_norm": 10.630696296691895, "learning_rate": 0.00011974057179411103, "loss": 36.2405, "step": 7600 }, { "epoch": 0.22799108762112028, "grad_norm": 9.975415229797363, "learning_rate": 0.00011970713557654582, "loss": 35.9903, "step": 7700 }, { "epoch": 0.23095201083697897, "grad_norm": 8.622698783874512, "learning_rate": 0.00011967167849428251, "loss": 35.8196, "step": 7800 }, { "epoch": 0.23391293405283767, "grad_norm": 14.828067779541016, "learning_rate": 0.00011963420174741161, "loss": 35.7946, "step": 7900 }, { "epoch": 0.2368738572686964, "grad_norm": 9.303028106689453, "learning_rate": 0.00011959470660438173, "loss": 35.5493, "step": 8000 }, { "epoch": 0.2368738572686964, "eval_loss": 4.408100128173828, "eval_runtime": 37.807, "eval_samples_per_second": 28.593, "eval_steps_per_second": 7.168, "step": 8000 }, { "epoch": 0.2398347804845551, "grad_norm": 11.987268447875977, "learning_rate": 0.00011955319440195674, "loss": 35.6014, "step": 8100 }, { "epoch": 0.24279570370041378, "grad_norm": 10.032620429992676, "learning_rate": 0.00011950966654517043, "loss": 35.5302, "step": 8200 }, { "epoch": 0.24575662691627248, "grad_norm": 9.362653732299805, "learning_rate": 0.00011946412450727906, "loss": 35.2124, "step": 8300 }, { "epoch": 0.2487175501321312, "grad_norm": 9.706056594848633, "learning_rate": 0.00011941656982971138, "loss": 34.9229, "step": 8400 }, { "epoch": 0.25167847334798993, "grad_norm": 10.424148559570312, "learning_rate": 0.00011936700412201653, "loss": 35.1602, "step": 8500 }, { "epoch": 0.2546393965638486, "grad_norm": 10.900792121887207, "learning_rate": 0.00011931542906180957, "loss": 34.9212, "step": 8600 }, { "epoch": 0.2576003197797073, "grad_norm": 10.541563034057617, "learning_rate": 0.00011926184639471465, "loss": 34.8347, "step": 8700 }, { "epoch": 0.26056124299556604, "grad_norm": 8.576896667480469, "learning_rate": 0.00011920625793430596, "loss": 34.9933, "step": 8800 }, { "epoch": 0.2635221662114247, "grad_norm": 10.162493705749512, "learning_rate": 0.00011914866556204637, "loss": 34.3925, "step": 8900 }, { "epoch": 0.26648308942728344, "grad_norm": 11.247607231140137, "learning_rate": 0.0001190890712272237, "loss": 34.4828, "step": 9000 }, { "epoch": 0.26648308942728344, "eval_loss": 4.2549567222595215, "eval_runtime": 37.961, "eval_samples_per_second": 28.477, "eval_steps_per_second": 7.139, "step": 9000 }, { "epoch": 0.2694440126431421, "grad_norm": 9.189545631408691, "learning_rate": 0.00011902747694688472, "loss": 34.3655, "step": 9100 }, { "epoch": 0.27240493585900083, "grad_norm": 11.199912071228027, "learning_rate": 0.000118963884805767, "loss": 34.4358, "step": 9200 }, { "epoch": 0.27536585907485955, "grad_norm": 9.673705101013184, "learning_rate": 0.00011889829695622823, "loss": 34.3689, "step": 9300 }, { "epoch": 0.2783267822907182, "grad_norm": 10.03848934173584, "learning_rate": 0.00011883071561817344, "loss": 33.9158, "step": 9400 }, { "epoch": 0.28128770550657695, "grad_norm": 11.581180572509766, "learning_rate": 0.00011876114307897981, "loss": 33.992, "step": 9500 }, { "epoch": 0.28424862872243567, "grad_norm": 10.81711483001709, "learning_rate": 0.00011868958169341929, "loss": 34.1195, "step": 9600 }, { "epoch": 0.28720955193829434, "grad_norm": 9.648648262023926, "learning_rate": 0.00011861603388357893, "loss": 34.1664, "step": 9700 }, { "epoch": 0.29017047515415306, "grad_norm": 11.37558364868164, "learning_rate": 0.00011854050213877877, "loss": 33.9937, "step": 9800 }, { "epoch": 0.2931313983700118, "grad_norm": 9.346961975097656, "learning_rate": 0.0001184629890154878, "loss": 33.6917, "step": 9900 }, { "epoch": 0.29609232158587045, "grad_norm": 11.31644058227539, "learning_rate": 0.0001183834971372372, "loss": 33.7808, "step": 10000 }, { "epoch": 0.29609232158587045, "eval_loss": 4.157764434814453, "eval_runtime": 39.9196, "eval_samples_per_second": 27.079, "eval_steps_per_second": 6.789, "step": 10000 }, { "epoch": 0.2990532448017292, "grad_norm": 18.920991897583008, "learning_rate": 0.00011180531798567065, "loss": 44.0571, "step": 10100 }, { "epoch": 0.3020141680175879, "grad_norm": 15.393646240234375, "learning_rate": 0.0001116461207502148, "loss": 39.8888, "step": 10200 }, { "epoch": 0.30497509123344657, "grad_norm": 13.2774076461792, "learning_rate": 0.00011148550761026972, "loss": 38.6529, "step": 10300 }, { "epoch": 0.3079360144493053, "grad_norm": 13.391098976135254, "learning_rate": 0.00011132348296912578, "loss": 37.759, "step": 10400 }, { "epoch": 0.310896937665164, "grad_norm": 11.917950630187988, "learning_rate": 0.00011116005126877037, "loss": 37.1968, "step": 10500 }, { "epoch": 0.3138578608810227, "grad_norm": 11.100213050842285, "learning_rate": 0.0001109952169897661, "loss": 37.213, "step": 10600 }, { "epoch": 0.3168187840968814, "grad_norm": 14.579487800598145, "learning_rate": 0.00011082898465112802, "loss": 36.7415, "step": 10700 }, { "epoch": 0.31977970731274014, "grad_norm": 11.359614372253418, "learning_rate": 0.00011066135881019965, "loss": 36.361, "step": 10800 }, { "epoch": 0.3227406305285988, "grad_norm": 12.316486358642578, "learning_rate": 0.00011049234406252809, "loss": 36.0591, "step": 10900 }, { "epoch": 0.32570155374445753, "grad_norm": 13.45693588256836, "learning_rate": 0.00011032194504173804, "loss": 35.6357, "step": 11000 }, { "epoch": 0.32570155374445753, "eval_loss": 4.41249418258667, "eval_runtime": 39.8412, "eval_samples_per_second": 27.133, "eval_steps_per_second": 6.802, "step": 11000 }, { "epoch": 0.32866247696031625, "grad_norm": 10.018808364868164, "learning_rate": 0.00011412090151135696, "loss": 33.7098, "step": 11100 }, { "epoch": 0.3316234001761749, "grad_norm": 10.30320930480957, "learning_rate": 0.00011397731809339621, "loss": 33.7831, "step": 11200 }, { "epoch": 0.33458432339203364, "grad_norm": 9.650611877441406, "learning_rate": 0.0001138320949911399, "loss": 33.5415, "step": 11300 }, { "epoch": 0.33754524660789237, "grad_norm": 8.77065372467041, "learning_rate": 0.0001136852366160714, "loss": 33.2261, "step": 11400 }, { "epoch": 0.34050616982375104, "grad_norm": 12.062385559082031, "learning_rate": 0.00011353674742934919, "loss": 33.0819, "step": 11500 }, { "epoch": 0.34346709303960976, "grad_norm": 10.947739601135254, "learning_rate": 0.00011338663194167138, "loss": 33.3451, "step": 11600 }, { "epoch": 0.34642801625546843, "grad_norm": 9.377535820007324, "learning_rate": 0.00011323489471313875, "loss": 32.8928, "step": 11700 }, { "epoch": 0.34938893947132715, "grad_norm": 8.902270317077637, "learning_rate": 0.00011308154035311608, "loss": 33.1756, "step": 11800 }, { "epoch": 0.3523498626871859, "grad_norm": 10.434513092041016, "learning_rate": 0.00011292657352009224, "loss": 33.1595, "step": 11900 }, { "epoch": 0.35531078590304455, "grad_norm": 11.084539413452148, "learning_rate": 0.00011276999892153867, "loss": 33.359, "step": 12000 }, { "epoch": 0.35531078590304455, "eval_loss": 4.073917865753174, "eval_runtime": 38.304, "eval_samples_per_second": 28.222, "eval_steps_per_second": 7.075, "step": 12000 }, { "epoch": 0.35827170911890327, "grad_norm": 7.943862438201904, "learning_rate": 9.143653002276282e-05, "loss": 32.5648, "step": 12100 }, { "epoch": 0.361232632334762, "grad_norm": 8.098073959350586, "learning_rate": 9.09346201340685e-05, "loss": 32.1551, "step": 12200 }, { "epoch": 0.36419355555062066, "grad_norm": 7.46992826461792, "learning_rate": 9.042974429385753e-05, "loss": 32.3569, "step": 12300 }, { "epoch": 0.3671544787664794, "grad_norm": 7.480947971343994, "learning_rate": 8.992195090864853e-05, "loss": 32.4467, "step": 12400 }, { "epoch": 0.3701154019823381, "grad_norm": 7.488786220550537, "learning_rate": 8.941128866468864e-05, "loss": 32.4447, "step": 12500 }, { "epoch": 0.3730763251981968, "grad_norm": 8.124217987060547, "learning_rate": 8.889780652328559e-05, "loss": 32.3657, "step": 12600 }, { "epoch": 0.3760372484140555, "grad_norm": 8.322397232055664, "learning_rate": 8.83815537161135e-05, "loss": 31.9431, "step": 12700 }, { "epoch": 0.3789981716299142, "grad_norm": 8.59915828704834, "learning_rate": 8.786257974049245e-05, "loss": 31.9211, "step": 12800 }, { "epoch": 0.3819590948457729, "grad_norm": 8.048558235168457, "learning_rate": 8.734093435464301e-05, "loss": 32.437, "step": 12900 }, { "epoch": 0.3849200180616316, "grad_norm": 7.816276550292969, "learning_rate": 8.681666757291531e-05, "loss": 32.0396, "step": 13000 }, { "epoch": 0.3849200180616316, "eval_loss": 3.9447479248046875, "eval_runtime": 112.3499, "eval_samples_per_second": 9.622, "eval_steps_per_second": 2.412, "step": 13000 }, { "epoch": 0.38788094127749034, "grad_norm": 8.613288879394531, "learning_rate": 8.628982966099388e-05, "loss": 31.874, "step": 13100 }, { "epoch": 0.390841864493349, "grad_norm": 7.478573799133301, "learning_rate": 8.576047113107821e-05, "loss": 31.7233, "step": 13200 }, { "epoch": 0.39380278770920774, "grad_norm": 7.845474720001221, "learning_rate": 8.52286427370398e-05, "loss": 31.628, "step": 13300 }, { "epoch": 0.39676371092506646, "grad_norm": 7.7132158279418945, "learning_rate": 8.469439546955592e-05, "loss": 31.8516, "step": 13400 }, { "epoch": 0.39972463414092513, "grad_norm": 9.245190620422363, "learning_rate": 8.415778055122073e-05, "loss": 31.8406, "step": 13500 }, { "epoch": 0.40268555735678385, "grad_norm": 8.426488876342773, "learning_rate": 8.361884943163423e-05, "loss": 31.7148, "step": 13600 }, { "epoch": 0.4056464805726426, "grad_norm": 7.879675388336182, "learning_rate": 8.307765378246925e-05, "loss": 31.9798, "step": 13700 }, { "epoch": 0.40860740378850124, "grad_norm": 8.469719886779785, "learning_rate": 8.253424549251735e-05, "loss": 31.6741, "step": 13800 }, { "epoch": 0.41156832700435997, "grad_norm": 8.198810577392578, "learning_rate": 8.198867666271385e-05, "loss": 31.6722, "step": 13900 }, { "epoch": 0.41452925022021864, "grad_norm": 7.881684303283691, "learning_rate": 8.144099960114239e-05, "loss": 31.8682, "step": 14000 }, { "epoch": 0.41452925022021864, "eval_loss": 3.904888153076172, "eval_runtime": 110.4703, "eval_samples_per_second": 9.785, "eval_steps_per_second": 2.453, "step": 14000 }, { "epoch": 0.41749017343607736, "grad_norm": 7.772391319274902, "learning_rate": 8.089126681801981e-05, "loss": 32.0349, "step": 14100 }, { "epoch": 0.4204510966519361, "grad_norm": 8.459504127502441, "learning_rate": 8.033953102066161e-05, "loss": 31.5844, "step": 14200 }, { "epoch": 0.42341201986779475, "grad_norm": 7.765544414520264, "learning_rate": 7.978584510842833e-05, "loss": 31.6879, "step": 14300 }, { "epoch": 0.4263729430836535, "grad_norm": 8.06749153137207, "learning_rate": 7.923026216765381e-05, "loss": 31.5893, "step": 14400 }, { "epoch": 0.4293338662995122, "grad_norm": 8.966425895690918, "learning_rate": 7.86728354665553e-05, "loss": 31.392, "step": 14500 }, { "epoch": 0.43229478951537087, "grad_norm": 8.47319221496582, "learning_rate": 7.81136184501262e-05, "loss": 31.3068, "step": 14600 }, { "epoch": 0.4352557127312296, "grad_norm": 8.642230033874512, "learning_rate": 7.755266473501193e-05, "loss": 31.5877, "step": 14700 }, { "epoch": 0.4382166359470883, "grad_norm": 8.412428855895996, "learning_rate": 7.699002810436915e-05, "loss": 31.6239, "step": 14800 }, { "epoch": 0.441177559162947, "grad_norm": 6.971558094024658, "learning_rate": 7.642576250270929e-05, "loss": 31.7946, "step": 14900 }, { "epoch": 0.4441384823788057, "grad_norm": 7.922480583190918, "learning_rate": 7.585992203072628e-05, "loss": 31.4474, "step": 15000 }, { "epoch": 0.4441384823788057, "eval_loss": 3.875948667526245, "eval_runtime": 109.3049, "eval_samples_per_second": 9.89, "eval_steps_per_second": 2.479, "step": 15000 }, { "epoch": 0.44709940559466443, "grad_norm": 8.747485160827637, "learning_rate": 7.529256094010965e-05, "loss": 31.6016, "step": 15100 }, { "epoch": 0.4500603288105231, "grad_norm": 8.723346710205078, "learning_rate": 7.472373362834283e-05, "loss": 31.2744, "step": 15200 }, { "epoch": 0.4530212520263818, "grad_norm": 8.310611724853516, "learning_rate": 7.415349463348775e-05, "loss": 31.7448, "step": 15300 }, { "epoch": 0.45598217524224055, "grad_norm": 8.236388206481934, "learning_rate": 7.358189862895577e-05, "loss": 30.9859, "step": 15400 }, { "epoch": 0.4589430984580992, "grad_norm": 8.104386329650879, "learning_rate": 7.300900041826566e-05, "loss": 31.1935, "step": 15500 }, { "epoch": 0.46190402167395794, "grad_norm": 8.219923973083496, "learning_rate": 7.243485492978928e-05, "loss": 30.9099, "step": 15600 }, { "epoch": 0.46486494488981667, "grad_norm": 8.872945785522461, "learning_rate": 7.185951721148502e-05, "loss": 31.3423, "step": 15700 }, { "epoch": 0.46782586810567534, "grad_norm": 8.087647438049316, "learning_rate": 7.128304242561999e-05, "loss": 31.1816, "step": 15800 }, { "epoch": 0.47078679132153406, "grad_norm": 8.805392265319824, "learning_rate": 7.070548584348108e-05, "loss": 31.0977, "step": 15900 }, { "epoch": 0.4737477145373928, "grad_norm": 8.469452857971191, "learning_rate": 7.012690284007577e-05, "loss": 31.5828, "step": 16000 }, { "epoch": 0.4737477145373928, "eval_loss": 3.8530030250549316, "eval_runtime": 109.275, "eval_samples_per_second": 9.892, "eval_steps_per_second": 2.48, "step": 16000 }, { "epoch": 0.47670863775325145, "grad_norm": 8.871159553527832, "learning_rate": 6.954734888882281e-05, "loss": 30.9753, "step": 16100 }, { "epoch": 0.4796695609691102, "grad_norm": 8.81116008758545, "learning_rate": 6.896687955623357e-05, "loss": 31.2067, "step": 16200 }, { "epoch": 0.4826304841849689, "grad_norm": 7.77982759475708, "learning_rate": 6.838555049658432e-05, "loss": 31.089, "step": 16300 }, { "epoch": 0.48559140740082757, "grad_norm": 8.370245933532715, "learning_rate": 6.780341744658044e-05, "loss": 30.9776, "step": 16400 }, { "epoch": 0.4885523306166863, "grad_norm": 8.41613483428955, "learning_rate": 6.722053622001221e-05, "loss": 31.1095, "step": 16500 }, { "epoch": 0.49151325383254496, "grad_norm": 7.951696395874023, "learning_rate": 6.663696270240373e-05, "loss": 31.1532, "step": 16600 }, { "epoch": 0.4944741770484037, "grad_norm": 9.02717113494873, "learning_rate": 6.60527528456546e-05, "loss": 31.0777, "step": 16700 }, { "epoch": 0.4974351002642624, "grad_norm": 8.57259750366211, "learning_rate": 6.546796266267535e-05, "loss": 31.3509, "step": 16800 }, { "epoch": 0.5003960234801211, "grad_norm": 9.129491806030273, "learning_rate": 6.488264822201711e-05, "loss": 30.7844, "step": 16900 }, { "epoch": 0.5033569466959799, "grad_norm": 8.600064277648926, "learning_rate": 6.429686564249579e-05, "loss": 31.1164, "step": 17000 }, { "epoch": 0.5033569466959799, "eval_loss": 3.836409091949463, "eval_runtime": 109.0903, "eval_samples_per_second": 9.909, "eval_steps_per_second": 2.484, "step": 17000 }, { "epoch": 0.5063178699118385, "grad_norm": 8.62096881866455, "learning_rate": 6.371067108781158e-05, "loss": 31.1944, "step": 17100 }, { "epoch": 0.5092787931276972, "grad_norm": 8.052851676940918, "learning_rate": 6.312412076116401e-05, "loss": 31.0126, "step": 17200 }, { "epoch": 0.5122397163435559, "grad_norm": 8.32268238067627, "learning_rate": 6.253727089986337e-05, "loss": 31.0692, "step": 17300 }, { "epoch": 0.5152006395594146, "grad_norm": 8.130902290344238, "learning_rate": 6.195017776993876e-05, "loss": 30.9143, "step": 17400 }, { "epoch": 0.5181615627752734, "grad_norm": 9.245232582092285, "learning_rate": 6.136289766074334e-05, "loss": 31.0029, "step": 17500 }, { "epoch": 0.5211224859911321, "grad_norm": 8.296626091003418, "learning_rate": 6.077548687955759e-05, "loss": 31.0624, "step": 17600 }, { "epoch": 0.5240834092069907, "grad_norm": 8.933104515075684, "learning_rate": 6.018800174619048e-05, "loss": 31.0619, "step": 17700 }, { "epoch": 0.5270443324228494, "grad_norm": 7.37945032119751, "learning_rate": 5.960049858757974e-05, "loss": 31.3181, "step": 17800 }, { "epoch": 0.5300052556387082, "grad_norm": 8.817550659179688, "learning_rate": 5.901303373239133e-05, "loss": 30.8424, "step": 17900 }, { "epoch": 0.5329661788545669, "grad_norm": 7.71854305267334, "learning_rate": 5.842566350561879e-05, "loss": 31.0376, "step": 18000 }, { "epoch": 0.5329661788545669, "eval_loss": 3.822613477706909, "eval_runtime": 112.0979, "eval_samples_per_second": 9.643, "eval_steps_per_second": 2.418, "step": 18000 }, { "epoch": 0.5359271020704256, "grad_norm": 8.84870719909668, "learning_rate": 5.7838444223182826e-05, "loss": 30.8901, "step": 18100 }, { "epoch": 0.5388880252862842, "grad_norm": 7.48129415512085, "learning_rate": 5.725143218653187e-05, "loss": 31.0275, "step": 18200 }, { "epoch": 0.5418489485021429, "grad_norm": 8.218484878540039, "learning_rate": 5.666468367724412e-05, "loss": 31.1443, "step": 18300 }, { "epoch": 0.5448098717180017, "grad_norm": 9.589841842651367, "learning_rate": 5.607825495163119e-05, "loss": 30.9756, "step": 18400 }, { "epoch": 0.5477707949338604, "grad_norm": 8.583683013916016, "learning_rate": 5.549220223534451e-05, "loss": 31.0641, "step": 18500 }, { "epoch": 0.5507317181497191, "grad_norm": 7.978188991546631, "learning_rate": 5.490658171798439e-05, "loss": 30.8899, "step": 18600 }, { "epoch": 0.5536926413655778, "grad_norm": 8.130802154541016, "learning_rate": 5.432144954771287e-05, "loss": 31.0812, "step": 18700 }, { "epoch": 0.5566535645814364, "grad_norm": 8.981709480285645, "learning_rate": 5.37368618258701e-05, "loss": 31.0612, "step": 18800 }, { "epoch": 0.5596144877972952, "grad_norm": 7.87661075592041, "learning_rate": 5.315287460159561e-05, "loss": 30.8581, "step": 18900 }, { "epoch": 0.5625754110131539, "grad_norm": 8.329483032226562, "learning_rate": 5.256954386645438e-05, "loss": 31.1805, "step": 19000 }, { "epoch": 0.5625754110131539, "eval_loss": 3.8131661415100098, "eval_runtime": 111.5683, "eval_samples_per_second": 9.689, "eval_steps_per_second": 2.429, "step": 19000 }, { "epoch": 0.5655363342290126, "grad_norm": 8.833015441894531, "learning_rate": 5.198692554906851e-05, "loss": 30.9231, "step": 19100 }, { "epoch": 0.5684972574448713, "grad_norm": 7.966989994049072, "learning_rate": 5.1405075509754834e-05, "loss": 31.0225, "step": 19200 }, { "epoch": 0.5714581806607301, "grad_norm": 8.791169166564941, "learning_rate": 5.0824049535169166e-05, "loss": 31.1551, "step": 19300 }, { "epoch": 0.5744191038765887, "grad_norm": 7.9680023193359375, "learning_rate": 5.024390333295761e-05, "loss": 31.0498, "step": 19400 }, { "epoch": 0.5773800270924474, "grad_norm": 8.603718757629395, "learning_rate": 4.966469252641538e-05, "loss": 30.9017, "step": 19500 }, { "epoch": 0.5803409503083061, "grad_norm": 12.401627540588379, "learning_rate": 4.908647264915378e-05, "loss": 30.9988, "step": 19600 }, { "epoch": 0.5833018735241648, "grad_norm": 8.433266639709473, "learning_rate": 4.8509299139775734e-05, "loss": 30.9905, "step": 19700 }, { "epoch": 0.5862627967400236, "grad_norm": 7.99282693862915, "learning_rate": 4.7933227336560414e-05, "loss": 31.0604, "step": 19800 }, { "epoch": 0.5892237199558823, "grad_norm": 8.011063575744629, "learning_rate": 4.735831247215753e-05, "loss": 30.7471, "step": 19900 }, { "epoch": 0.5921846431717409, "grad_norm": 9.603862762451172, "learning_rate": 4.67846096682918e-05, "loss": 30.8428, "step": 20000 }, { "epoch": 0.5921846431717409, "eval_loss": 3.8060901165008545, "eval_runtime": 112.6154, "eval_samples_per_second": 9.599, "eval_steps_per_second": 2.406, "step": 20000 }, { "epoch": 0.5951455663875996, "grad_norm": 8.427188873291016, "learning_rate": 4.6212173930477874e-05, "loss": 30.8438, "step": 20100 }, { "epoch": 0.5981064896034584, "grad_norm": 7.692320346832275, "learning_rate": 4.5641060142746556e-05, "loss": 30.7664, "step": 20200 }, { "epoch": 0.6010674128193171, "grad_norm": 8.596179962158203, "learning_rate": 4.507132306238262e-05, "loss": 30.9387, "step": 20300 }, { "epoch": 0.6040283360351758, "grad_norm": 8.076534271240234, "learning_rate": 4.450301731467488e-05, "loss": 30.851, "step": 20400 }, { "epoch": 0.6069892592510344, "grad_norm": 9.05728816986084, "learning_rate": 4.3936197387678665e-05, "loss": 30.7486, "step": 20500 }, { "epoch": 0.6099501824668931, "grad_norm": 8.477595329284668, "learning_rate": 4.3370917626991706e-05, "loss": 30.6843, "step": 20600 }, { "epoch": 0.6129111056827519, "grad_norm": 8.171915054321289, "learning_rate": 4.2807232230543625e-05, "loss": 30.9551, "step": 20700 }, { "epoch": 0.6158720288986106, "grad_norm": 8.333806991577148, "learning_rate": 4.22451952433994e-05, "loss": 30.8566, "step": 20800 }, { "epoch": 0.6188329521144693, "grad_norm": 7.9477715492248535, "learning_rate": 4.168486055257777e-05, "loss": 30.8577, "step": 20900 }, { "epoch": 0.621793875330328, "grad_norm": 8.560218811035156, "learning_rate": 4.112628188188457e-05, "loss": 30.7203, "step": 21000 }, { "epoch": 0.621793875330328, "eval_loss": 3.7986109256744385, "eval_runtime": 109.4771, "eval_samples_per_second": 9.874, "eval_steps_per_second": 2.475, "step": 21000 }, { "epoch": 0.6247547985461867, "grad_norm": 8.963776588439941, "learning_rate": 4.056951278676187e-05, "loss": 30.9418, "step": 21100 }, { "epoch": 0.6277157217620454, "grad_norm": 8.338837623596191, "learning_rate": 4.001460664915308e-05, "loss": 30.756, "step": 21200 }, { "epoch": 0.6306766449779041, "grad_norm": 8.323155403137207, "learning_rate": 3.946161667238485e-05, "loss": 30.6959, "step": 21300 }, { "epoch": 0.6336375681937628, "grad_norm": 9.881996154785156, "learning_rate": 3.8910595876066085e-05, "loss": 30.9333, "step": 21400 }, { "epoch": 0.6365984914096215, "grad_norm": 8.089996337890625, "learning_rate": 3.836159709100446e-05, "loss": 30.6899, "step": 21500 }, { "epoch": 0.6395594146254803, "grad_norm": 7.9427289962768555, "learning_rate": 3.7814672954141055e-05, "loss": 30.8046, "step": 21600 }, { "epoch": 0.6425203378413389, "grad_norm": 8.468146324157715, "learning_rate": 3.7269875903503826e-05, "loss": 31.2292, "step": 21700 }, { "epoch": 0.6454812610571976, "grad_norm": 8.63842487335205, "learning_rate": 3.672725817317973e-05, "loss": 30.7721, "step": 21800 }, { "epoch": 0.6484421842730563, "grad_norm": 8.145241737365723, "learning_rate": 3.6186871788306674e-05, "loss": 30.5881, "step": 21900 }, { "epoch": 0.6514031074889151, "grad_norm": 8.194993019104004, "learning_rate": 3.5648768560085604e-05, "loss": 30.9425, "step": 22000 }, { "epoch": 0.6514031074889151, "eval_loss": 3.7950870990753174, "eval_runtime": 109.4264, "eval_samples_per_second": 9.879, "eval_steps_per_second": 2.477, "step": 22000 }, { "epoch": 0.6543640307047738, "grad_norm": 9.304323196411133, "learning_rate": 3.511300008081273e-05, "loss": 30.722, "step": 22100 }, { "epoch": 0.6573249539206325, "grad_norm": 7.82930850982666, "learning_rate": 3.4579617718933054e-05, "loss": 30.7943, "step": 22200 }, { "epoch": 0.6602858771364911, "grad_norm": 7.912548542022705, "learning_rate": 3.4048672614115294e-05, "loss": 30.8451, "step": 22300 }, { "epoch": 0.6632468003523498, "grad_norm": 8.46181583404541, "learning_rate": 3.352021567234869e-05, "loss": 30.9009, "step": 22400 }, { "epoch": 0.6662077235682086, "grad_norm": 7.727646827697754, "learning_rate": 3.299429756106215e-05, "loss": 30.8281, "step": 22500 }, { "epoch": 0.6691686467840673, "grad_norm": 8.119136810302734, "learning_rate": 3.247096870426649e-05, "loss": 30.7757, "step": 22600 }, { "epoch": 0.672129569999926, "grad_norm": 8.091607093811035, "learning_rate": 3.195027927771982e-05, "loss": 30.8661, "step": 22700 }, { "epoch": 0.6750904932157847, "grad_norm": 7.598474979400635, "learning_rate": 3.1432279204116776e-05, "loss": 30.6257, "step": 22800 }, { "epoch": 0.6780514164316434, "grad_norm": 9.547100067138672, "learning_rate": 3.091701814830198e-05, "loss": 30.8582, "step": 22900 }, { "epoch": 0.6810123396475021, "grad_norm": 7.637078762054443, "learning_rate": 3.0404545512508415e-05, "loss": 30.9432, "step": 23000 }, { "epoch": 0.6810123396475021, "eval_loss": 3.791748285293579, "eval_runtime": 109.2867, "eval_samples_per_second": 9.891, "eval_steps_per_second": 2.48, "step": 23000 }, { "epoch": 0.6839732628633608, "grad_norm": 8.485209465026855, "learning_rate": 2.98949104316207e-05, "loss": 30.921, "step": 23100 }, { "epoch": 0.6869341860792195, "grad_norm": 7.777042865753174, "learning_rate": 2.938816176846421e-05, "loss": 30.8116, "step": 23200 }, { "epoch": 0.6898951092950782, "grad_norm": 7.6587138175964355, "learning_rate": 2.8884348109120106e-05, "loss": 30.7965, "step": 23300 }, { "epoch": 0.6928560325109369, "grad_norm": 8.276775360107422, "learning_rate": 2.8383517758267178e-05, "loss": 30.6582, "step": 23400 }, { "epoch": 0.6958169557267956, "grad_norm": 7.5494771003723145, "learning_rate": 2.7885718734550257e-05, "loss": 30.6483, "step": 23500 }, { "epoch": 0.6987778789426543, "grad_norm": 7.938130855560303, "learning_rate": 2.739099876597646e-05, "loss": 30.529, "step": 23600 }, { "epoch": 0.701738802158513, "grad_norm": 8.202885627746582, "learning_rate": 2.6899405285339026e-05, "loss": 30.825, "step": 23700 }, { "epoch": 0.7046997253743718, "grad_norm": 8.393240928649902, "learning_rate": 2.6410985425669622e-05, "loss": 30.7867, "step": 23800 }, { "epoch": 0.7076606485902305, "grad_norm": 8.32459831237793, "learning_rate": 2.5925786015719207e-05, "loss": 30.7898, "step": 23900 }, { "epoch": 0.7106215718060891, "grad_norm": 9.323598861694336, "learning_rate": 2.544385357546831e-05, "loss": 30.5684, "step": 24000 }, { "epoch": 0.7106215718060891, "eval_loss": 3.789947271347046, "eval_runtime": 110.7686, "eval_samples_per_second": 9.759, "eval_steps_per_second": 2.447, "step": 24000 }, { "epoch": 0.7135824950219478, "grad_norm": 8.184738159179688, "learning_rate": 2.4965234311666717e-05, "loss": 30.7187, "step": 24100 }, { "epoch": 0.7165434182378065, "grad_norm": 7.82784366607666, "learning_rate": 2.4489974113403275e-05, "loss": 30.705, "step": 24200 }, { "epoch": 0.7195043414536653, "grad_norm": 7.945186614990234, "learning_rate": 2.4018118547706078e-05, "loss": 30.4846, "step": 24300 }, { "epoch": 0.722465264669524, "grad_norm": 9.277371406555176, "learning_rate": 2.3549712855173688e-05, "loss": 30.6765, "step": 24400 }, { "epoch": 0.7254261878853827, "grad_norm": 8.619938850402832, "learning_rate": 2.3084801945637512e-05, "loss": 30.6503, "step": 24500 }, { "epoch": 0.7283871111012413, "grad_norm": 8.467925071716309, "learning_rate": 2.262343039385585e-05, "loss": 30.957, "step": 24600 }, { "epoch": 0.7313480343171, "grad_norm": 8.035057067871094, "learning_rate": 2.216564243524035e-05, "loss": 30.6764, "step": 24700 }, { "epoch": 0.7343089575329588, "grad_norm": 7.555221080780029, "learning_rate": 2.1711481961614565e-05, "loss": 30.7666, "step": 24800 }, { "epoch": 0.7372698807488175, "grad_norm": 7.959348201751709, "learning_rate": 2.1260992517005892e-05, "loss": 30.8212, "step": 24900 }, { "epoch": 0.7402308039646762, "grad_norm": 7.882981300354004, "learning_rate": 2.0814217293470476e-05, "loss": 30.8312, "step": 25000 }, { "epoch": 0.7402308039646762, "eval_loss": 3.7874350547790527, "eval_runtime": 107.8316, "eval_samples_per_second": 10.025, "eval_steps_per_second": 2.513, "step": 25000 }, { "epoch": 0.743191727180535, "grad_norm": 7.499105930328369, "learning_rate": 2.0371199126952268e-05, "loss": 30.9958, "step": 25100 }, { "epoch": 0.7461526503963936, "grad_norm": 7.973631381988525, "learning_rate": 1.9931980493175735e-05, "loss": 30.6469, "step": 25200 }, { "epoch": 0.7491135736122523, "grad_norm": 7.996872425079346, "learning_rate": 1.949660350357356e-05, "loss": 30.6363, "step": 25300 }, { "epoch": 0.752074496828111, "grad_norm": 8.139349937438965, "learning_rate": 1.9065109901249e-05, "loss": 30.924, "step": 25400 }, { "epoch": 0.7550354200439697, "grad_norm": 8.981887817382812, "learning_rate": 1.863754105697369e-05, "loss": 30.9555, "step": 25500 }, { "epoch": 0.7579963432598285, "grad_norm": 7.660996913909912, "learning_rate": 1.821393796522096e-05, "loss": 30.8007, "step": 25600 }, { "epoch": 0.7609572664756871, "grad_norm": 7.750844955444336, "learning_rate": 1.7794341240235615e-05, "loss": 30.7227, "step": 25700 }, { "epoch": 0.7639181896915458, "grad_norm": 7.581575870513916, "learning_rate": 1.737879111213961e-05, "loss": 30.6509, "step": 25800 }, { "epoch": 0.7668791129074045, "grad_norm": 8.771635055541992, "learning_rate": 1.6967327423075142e-05, "loss": 30.7893, "step": 25900 }, { "epoch": 0.7698400361232632, "grad_norm": 8.594512939453125, "learning_rate": 1.6559989623384456e-05, "loss": 30.6874, "step": 26000 }, { "epoch": 0.7698400361232632, "eval_loss": 3.7861363887786865, "eval_runtime": 112.2096, "eval_samples_per_second": 9.634, "eval_steps_per_second": 2.415, "step": 26000 }, { "epoch": 0.772800959339122, "grad_norm": 7.919267177581787, "learning_rate": 1.615681676782755e-05, "loss": 30.7685, "step": 26100 }, { "epoch": 0.7757618825549807, "grad_norm": 7.744143009185791, "learning_rate": 1.5757847511837648e-05, "loss": 30.7558, "step": 26200 }, { "epoch": 0.7787228057708393, "grad_norm": 7.894962787628174, "learning_rate": 1.5363120107814955e-05, "loss": 30.7543, "step": 26300 }, { "epoch": 0.781683728986698, "grad_norm": 9.573600769042969, "learning_rate": 1.4972672401459143e-05, "loss": 30.808, "step": 26400 }, { "epoch": 0.7846446522025567, "grad_norm": 7.708218574523926, "learning_rate": 1.4586541828140706e-05, "loss": 30.6115, "step": 26500 }, { "epoch": 0.7876055754184155, "grad_norm": 8.170422554016113, "learning_rate": 1.4204765409311852e-05, "loss": 30.8811, "step": 26600 }, { "epoch": 0.7905664986342742, "grad_norm": 8.293937683105469, "learning_rate": 1.3827379748956783e-05, "loss": 30.8484, "step": 26700 }, { "epoch": 0.7935274218501329, "grad_norm": 7.64206075668335, "learning_rate": 1.3454421030082402e-05, "loss": 30.7768, "step": 26800 }, { "epoch": 0.7964883450659915, "grad_norm": 7.780085563659668, "learning_rate": 1.3085925011248902e-05, "loss": 30.6903, "step": 26900 }, { "epoch": 0.7994492682818503, "grad_norm": 7.651244640350342, "learning_rate": 1.2721927023141509e-05, "loss": 30.8888, "step": 27000 }, { "epoch": 0.7994492682818503, "eval_loss": 3.7866110801696777, "eval_runtime": 111.3993, "eval_samples_per_second": 9.704, "eval_steps_per_second": 2.433, "step": 27000 }, { "epoch": 0.802410191497709, "grad_norm": 7.893172740936279, "learning_rate": 1.2362461965182951e-05, "loss": 30.8551, "step": 27100 }, { "epoch": 0.8053711147135677, "grad_norm": 8.348461151123047, "learning_rate": 1.2007564302187395e-05, "loss": 30.9086, "step": 27200 }, { "epoch": 0.8083320379294264, "grad_norm": 8.005925178527832, "learning_rate": 1.1657268061055954e-05, "loss": 30.6258, "step": 27300 }, { "epoch": 0.8112929611452852, "grad_norm": 7.919161319732666, "learning_rate": 1.1311606827514432e-05, "loss": 30.4614, "step": 27400 }, { "epoch": 0.8142538843611438, "grad_norm": 8.806751251220703, "learning_rate": 1.0970613742892959e-05, "loss": 30.9882, "step": 27500 }, { "epoch": 0.8172148075770025, "grad_norm": 8.126434326171875, "learning_rate": 1.0634321500948665e-05, "loss": 30.6459, "step": 27600 }, { "epoch": 0.8201757307928612, "grad_norm": 7.643808364868164, "learning_rate": 1.0302762344730893e-05, "loss": 30.6614, "step": 27700 }, { "epoch": 0.8231366540087199, "grad_norm": 8.046734809875488, "learning_rate": 9.97596806349001e-06, "loss": 30.6958, "step": 27800 }, { "epoch": 0.8260975772245787, "grad_norm": 8.094582557678223, "learning_rate": 9.653969989629268e-06, "loss": 30.5807, "step": 27900 }, { "epoch": 0.8290585004404373, "grad_norm": 8.062453269958496, "learning_rate": 9.336798995700899e-06, "loss": 30.8323, "step": 28000 }, { "epoch": 0.8290585004404373, "eval_loss": 3.786661386489868, "eval_runtime": 106.9454, "eval_samples_per_second": 10.108, "eval_steps_per_second": 2.534, "step": 28000 }, { "epoch": 0.832019423656296, "grad_norm": 7.68911075592041, "learning_rate": 9.024485491446045e-06, "loss": 30.9853, "step": 28100 }, { "epoch": 0.8349803468721547, "grad_norm": 7.82414436340332, "learning_rate": 8.717059420879143e-06, "loss": 30.5061, "step": 28200 }, { "epoch": 0.8379412700880134, "grad_norm": 7.392062664031982, "learning_rate": 8.414550259416917e-06, "loss": 30.9525, "step": 28300 }, { "epoch": 0.8409021933038722, "grad_norm": 7.675992965698242, "learning_rate": 8.116987011052387e-06, "loss": 30.8296, "step": 28400 }, { "epoch": 0.8438631165197309, "grad_norm": 8.038030624389648, "learning_rate": 7.824398205574006e-06, "loss": 30.8155, "step": 28500 }, { "epoch": 0.8468240397355895, "grad_norm": 7.427101135253906, "learning_rate": 7.536811895830222e-06, "loss": 30.9259, "step": 28600 }, { "epoch": 0.8497849629514482, "grad_norm": 8.095186233520508, "learning_rate": 7.254255655039919e-06, "loss": 30.824, "step": 28700 }, { "epoch": 0.852745886167307, "grad_norm": 7.521733283996582, "learning_rate": 6.9767565741486815e-06, "loss": 30.7226, "step": 28800 }, { "epoch": 0.8557068093831657, "grad_norm": 7.494954586029053, "learning_rate": 6.704341259231415e-06, "loss": 30.7789, "step": 28900 }, { "epoch": 0.8586677325990244, "grad_norm": 7.641082763671875, "learning_rate": 6.437035828941324e-06, "loss": 30.8001, "step": 29000 }, { "epoch": 0.8586677325990244, "eval_loss": 3.786005973815918, "eval_runtime": 111.1521, "eval_samples_per_second": 9.725, "eval_steps_per_second": 2.438, "step": 29000 }, { "epoch": 0.8616286558148831, "grad_norm": 7.96475887298584, "learning_rate": 6.1748659120058386e-06, "loss": 30.8879, "step": 29100 }, { "epoch": 0.8645895790307417, "grad_norm": 6.990954875946045, "learning_rate": 5.917856644769242e-06, "loss": 30.6077, "step": 29200 }, { "epoch": 0.8675505022466005, "grad_norm": 7.170067310333252, "learning_rate": 5.666032668782735e-06, "loss": 30.8456, "step": 29300 }, { "epoch": 0.8705114254624592, "grad_norm": 8.4426851272583, "learning_rate": 5.419418128441846e-06, "loss": 30.9228, "step": 29400 }, { "epoch": 0.8734723486783179, "grad_norm": 8.034204483032227, "learning_rate": 5.178036668671475e-06, "loss": 30.7785, "step": 29500 }, { "epoch": 0.8764332718941766, "grad_norm": 7.411805629730225, "learning_rate": 4.941911432658868e-06, "loss": 30.7495, "step": 29600 }, { "epoch": 0.8793941951100354, "grad_norm": 7.887239456176758, "learning_rate": 4.7110650596347335e-06, "loss": 30.7797, "step": 29700 }, { "epoch": 0.882355118325894, "grad_norm": 8.600279808044434, "learning_rate": 4.48551968270261e-06, "loss": 30.8267, "step": 29800 }, { "epoch": 0.8853160415417527, "grad_norm": 8.055954933166504, "learning_rate": 4.26529692671679e-06, "loss": 30.8123, "step": 29900 }, { "epoch": 0.8882769647576114, "grad_norm": 7.540750503540039, "learning_rate": 4.050417906208945e-06, "loss": 30.8866, "step": 30000 }, { "epoch": 0.8882769647576114, "eval_loss": 3.7849574089050293, "eval_runtime": 108.0072, "eval_samples_per_second": 10.009, "eval_steps_per_second": 2.509, "step": 30000 }, { "epoch": 0.8912378879734701, "grad_norm": 7.607705593109131, "learning_rate": 3.840903223363752e-06, "loss": 30.7932, "step": 30100 }, { "epoch": 0.8941988111893289, "grad_norm": 7.834300518035889, "learning_rate": 3.636772966043571e-06, "loss": 30.6935, "step": 30200 }, { "epoch": 0.8971597344051876, "grad_norm": 9.865922927856445, "learning_rate": 3.4380467058624585e-06, "loss": 30.5129, "step": 30300 }, { "epoch": 0.9001206576210462, "grad_norm": 7.9707865715026855, "learning_rate": 3.244743496309701e-06, "loss": 30.8035, "step": 30400 }, { "epoch": 0.9030815808369049, "grad_norm": 8.035768508911133, "learning_rate": 3.0568818709229364e-06, "loss": 30.4973, "step": 30500 }, { "epoch": 0.9060425040527637, "grad_norm": 8.816192626953125, "learning_rate": 2.8744798415113015e-06, "loss": 30.5553, "step": 30600 }, { "epoch": 0.9090034272686224, "grad_norm": 7.411801338195801, "learning_rate": 2.6975548964283823e-06, "loss": 30.6758, "step": 30700 }, { "epoch": 0.9119643504844811, "grad_norm": 7.46308708190918, "learning_rate": 2.5261239988955733e-06, "loss": 30.8337, "step": 30800 }, { "epoch": 0.9149252737003397, "grad_norm": 8.57913875579834, "learning_rate": 2.360203585375571e-06, "loss": 31.0671, "step": 30900 }, { "epoch": 0.9178861969161984, "grad_norm": 7.983087062835693, "learning_rate": 2.1998095639965577e-06, "loss": 30.913, "step": 31000 }, { "epoch": 0.9178861969161984, "eval_loss": 3.785719394683838, "eval_runtime": 110.9703, "eval_samples_per_second": 9.741, "eval_steps_per_second": 2.442, "step": 31000 }, { "epoch": 0.9208471201320572, "grad_norm": 8.11637020111084, "learning_rate": 2.044957313026925e-06, "loss": 30.7294, "step": 31100 }, { "epoch": 0.9238080433479159, "grad_norm": 7.882040977478027, "learning_rate": 1.895661679400842e-06, "loss": 30.7816, "step": 31200 }, { "epoch": 0.9267689665637746, "grad_norm": 7.475772857666016, "learning_rate": 1.7519369772947525e-06, "loss": 30.5198, "step": 31300 }, { "epoch": 0.9297298897796333, "grad_norm": 8.094454765319824, "learning_rate": 1.6137969867549674e-06, "loss": 30.8313, "step": 31400 }, { "epoch": 0.932690812995492, "grad_norm": 8.635899543762207, "learning_rate": 1.4812549523764674e-06, "loss": 30.6539, "step": 31500 }, { "epoch": 0.9356517362113507, "grad_norm": 7.975414752960205, "learning_rate": 1.354323582033039e-06, "loss": 30.5804, "step": 31600 }, { "epoch": 0.9386126594272094, "grad_norm": 7.660233020782471, "learning_rate": 1.233015045658823e-06, "loss": 30.6357, "step": 31700 }, { "epoch": 0.9415735826430681, "grad_norm": 8.09595012664795, "learning_rate": 1.1173409740815532e-06, "loss": 30.7201, "step": 31800 }, { "epoch": 0.9445345058589268, "grad_norm": 8.44491958618164, "learning_rate": 1.0073124579073701e-06, "loss": 30.7462, "step": 31900 }, { "epoch": 0.9474954290747856, "grad_norm": 8.275026321411133, "learning_rate": 9.0294004645749e-07, "loss": 30.7256, "step": 32000 }, { "epoch": 0.9474954290747856, "eval_loss": 3.7850279808044434, "eval_runtime": 109.0824, "eval_samples_per_second": 9.91, "eval_steps_per_second": 2.484, "step": 32000 }, { "epoch": 0.9504563522906442, "grad_norm": 7.571169376373291, "learning_rate": 8.042337467567484e-07, "loss": 30.7194, "step": 32100 }, { "epoch": 0.9534172755065029, "grad_norm": 8.020681381225586, "learning_rate": 7.112030225741472e-07, "loss": 30.5828, "step": 32200 }, { "epoch": 0.9563781987223616, "grad_norm": 7.482342720031738, "learning_rate": 6.238567935155004e-07, "loss": 30.5888, "step": 32300 }, { "epoch": 0.9593391219382204, "grad_norm": 8.336071014404297, "learning_rate": 5.422034341682314e-07, "loss": 30.858, "step": 32400 }, { "epoch": 0.9623000451540791, "grad_norm": 7.819650173187256, "learning_rate": 4.6625077329842224e-07, "loss": 30.6983, "step": 32500 }, { "epoch": 0.9652609683699378, "grad_norm": 8.101078987121582, "learning_rate": 3.960060931002141e-07, "loss": 30.7803, "step": 32600 }, { "epoch": 0.9682218915857964, "grad_norm": 9.275129318237305, "learning_rate": 3.3147612849762533e-07, "loss": 30.8961, "step": 32700 }, { "epoch": 0.9711828148016551, "grad_norm": 8.00763988494873, "learning_rate": 2.7266706649877516e-07, "loss": 30.9344, "step": 32800 }, { "epoch": 0.9741437380175139, "grad_norm": 8.840792655944824, "learning_rate": 2.1958454560274455e-07, "loss": 30.7027, "step": 32900 }, { "epoch": 0.9771046612333726, "grad_norm": 8.015409469604492, "learning_rate": 1.722336552589021e-07, "loss": 30.7569, "step": 33000 }, { "epoch": 0.9771046612333726, "eval_loss": 3.785550117492676, "eval_runtime": 109.3256, "eval_samples_per_second": 9.888, "eval_steps_per_second": 2.479, "step": 33000 }, { "epoch": 0.9800655844492313, "grad_norm": 8.226040840148926, "learning_rate": 1.3061893537898773e-07, "loss": 30.6858, "step": 33100 }, { "epoch": 0.9830265076650899, "grad_norm": 7.274777889251709, "learning_rate": 9.474437590182072e-08, "loss": 30.701, "step": 33200 }, { "epoch": 0.9859874308809486, "grad_norm": 7.866406440734863, "learning_rate": 6.46134164107326e-08, "loss": 30.6392, "step": 33300 }, { "epoch": 0.9889483540968074, "grad_norm": 7.722043514251709, "learning_rate": 4.022894580381742e-08, "loss": 30.8502, "step": 33400 }, { "epoch": 0.9919092773126661, "grad_norm": 7.612312316894531, "learning_rate": 2.1593302016933437e-08, "loss": 30.7914, "step": 33500 }, { "epoch": 0.9948702005285248, "grad_norm": 8.195243835449219, "learning_rate": 8.708271799542367e-09, "loss": 30.8885, "step": 33600 }, { "epoch": 0.9978311237443835, "grad_norm": 8.127638816833496, "learning_rate": 1.5750905434130935e-09, "loss": 30.9894, "step": 33700 } ], "logging_steps": 100, "max_steps": 33773, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.598282561239384e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }