{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9999925976919604,
  "eval_steps": 1000,
  "global_step": 33773,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002960923215858705,
      "grad_norm": 41.32149887084961,
      "learning_rate": 1.98e-06,
      "loss": 87.7468,
      "step": 100
    },
    {
      "epoch": 0.00592184643171741,
      "grad_norm": 39.51910400390625,
      "learning_rate": 3.98e-06,
      "loss": 85.766,
      "step": 200
    },
    {
      "epoch": 0.008882769647576115,
      "grad_norm": 26.557823181152344,
      "learning_rate": 5.98e-06,
      "loss": 80.1376,
      "step": 300
    },
    {
      "epoch": 0.01184369286343482,
      "grad_norm": 21.80652618408203,
      "learning_rate": 7.98e-06,
      "loss": 74.3306,
      "step": 400
    },
    {
      "epoch": 0.014804616079293524,
      "grad_norm": 16.2612247467041,
      "learning_rate": 9.980000000000001e-06,
      "loss": 72.3247,
      "step": 500
    },
    {
      "epoch": 0.01776553929515223,
      "grad_norm": 17.281190872192383,
      "learning_rate": 1.198e-05,
      "loss": 71.0703,
      "step": 600
    },
    {
      "epoch": 0.020726462511010933,
      "grad_norm": 16.140579223632812,
      "learning_rate": 1.3980000000000002e-05,
      "loss": 69.5824,
      "step": 700
    },
    {
      "epoch": 0.02368738572686964,
      "grad_norm": 13.456184387207031,
      "learning_rate": 1.598e-05,
      "loss": 67.2135,
      "step": 800
    },
    {
      "epoch": 0.026648308942728342,
      "grad_norm": 11.778711318969727,
      "learning_rate": 1.798e-05,
      "loss": 65.1397,
      "step": 900
    },
    {
      "epoch": 0.02960923215858705,
      "grad_norm": 11.965922355651855,
      "learning_rate": 1.9980000000000002e-05,
      "loss": 63.204,
      "step": 1000
    },
    {
      "epoch": 0.02960923215858705,
      "eval_loss": 7.691287040710449,
      "eval_runtime": 37.9485,
      "eval_samples_per_second": 28.486,
      "eval_steps_per_second": 7.141,
      "step": 1000
    },
    {
      "epoch": 0.032570155374445756,
      "grad_norm": 8.482980728149414,
      "learning_rate": 2.198e-05,
      "loss": 60.9402,
      "step": 1100
    },
    {
      "epoch": 0.03553107859030446,
      "grad_norm": 49.948341369628906,
      "learning_rate": 2.398e-05,
      "loss": 58.9346,
      "step": 1200
    },
    {
      "epoch": 0.03849200180616316,
      "grad_norm": 10.039616584777832,
      "learning_rate": 2.5980000000000002e-05,
      "loss": 57.309,
      "step": 1300
    },
    {
      "epoch": 0.041452925022021865,
      "grad_norm": 8.930785179138184,
      "learning_rate": 2.798e-05,
      "loss": 56.3741,
      "step": 1400
    },
    {
      "epoch": 0.04441384823788057,
      "grad_norm": 8.14844036102295,
      "learning_rate": 2.998e-05,
      "loss": 55.8969,
      "step": 1500
    },
    {
      "epoch": 0.04737477145373928,
      "grad_norm": 16.170246124267578,
      "learning_rate": 3.198e-05,
      "loss": 55.12,
      "step": 1600
    },
    {
      "epoch": 0.05033569466959798,
      "grad_norm": 10.525145530700684,
      "learning_rate": 3.398e-05,
      "loss": 54.5077,
      "step": 1700
    },
    {
      "epoch": 0.053296617885456685,
      "grad_norm": 29.427160263061523,
      "learning_rate": 3.5980000000000004e-05,
      "loss": 54.2087,
      "step": 1800
    },
    {
      "epoch": 0.05625754110131539,
      "grad_norm": 47.125083923339844,
      "learning_rate": 3.798e-05,
      "loss": 53.4388,
      "step": 1900
    },
    {
      "epoch": 0.0592184643171741,
      "grad_norm": 10.07633113861084,
      "learning_rate": 3.998e-05,
      "loss": 53.5798,
      "step": 2000
    },
    {
      "epoch": 0.0592184643171741,
      "eval_loss": 6.5744218826293945,
      "eval_runtime": 38.8407,
      "eval_samples_per_second": 27.832,
      "eval_steps_per_second": 6.977,
      "step": 2000
    },
    {
      "epoch": 0.0621793875330328,
      "grad_norm": 10.918025970458984,
      "learning_rate": 4.198e-05,
      "loss": 52.5816,
      "step": 2100
    },
    {
      "epoch": 0.06514031074889151,
      "grad_norm": 12.68106460571289,
      "learning_rate": 4.398e-05,
      "loss": 52.6681,
      "step": 2200
    },
    {
      "epoch": 0.06810123396475021,
      "grad_norm": 15.503605842590332,
      "learning_rate": 4.598e-05,
      "loss": 52.5443,
      "step": 2300
    },
    {
      "epoch": 0.07106215718060892,
      "grad_norm": 10.995290756225586,
      "learning_rate": 4.798e-05,
      "loss": 51.4841,
      "step": 2400
    },
    {
      "epoch": 0.07402308039646761,
      "grad_norm": 22.71038055419922,
      "learning_rate": 4.998e-05,
      "loss": 51.05,
      "step": 2500
    },
    {
      "epoch": 0.07698400361232632,
      "grad_norm": 11.035310745239258,
      "learning_rate": 5.198e-05,
      "loss": 50.8273,
      "step": 2600
    },
    {
      "epoch": 0.07994492682818503,
      "grad_norm": 14.107246398925781,
      "learning_rate": 5.398e-05,
      "loss": 50.9043,
      "step": 2700
    },
    {
      "epoch": 0.08290585004404373,
      "grad_norm": 11.570377349853516,
      "learning_rate": 5.598e-05,
      "loss": 50.4862,
      "step": 2800
    },
    {
      "epoch": 0.08586677325990244,
      "grad_norm": 16.12681770324707,
      "learning_rate": 5.7980000000000004e-05,
      "loss": 50.0248,
      "step": 2900
    },
    {
      "epoch": 0.08882769647576114,
      "grad_norm": 14.925129890441895,
      "learning_rate": 5.9980000000000005e-05,
      "loss": 49.789,
      "step": 3000
    },
    {
      "epoch": 0.08882769647576114,
      "eval_loss": 6.141844272613525,
      "eval_runtime": 38.2773,
      "eval_samples_per_second": 28.241,
      "eval_steps_per_second": 7.08,
      "step": 3000
    },
    {
      "epoch": 0.09178861969161985,
      "grad_norm": 16.748519897460938,
      "learning_rate": 6.198e-05,
      "loss": 49.4085,
      "step": 3100
    },
    {
      "epoch": 0.09474954290747856,
      "grad_norm": 12.9891939163208,
      "learning_rate": 6.398000000000001e-05,
      "loss": 49.1003,
      "step": 3200
    },
    {
      "epoch": 0.09771046612333725,
      "grad_norm": 11.456365585327148,
      "learning_rate": 6.598e-05,
      "loss": 48.6685,
      "step": 3300
    },
    {
      "epoch": 0.10067138933919596,
      "grad_norm": 19.153154373168945,
      "learning_rate": 6.798e-05,
      "loss": 48.3342,
      "step": 3400
    },
    {
      "epoch": 0.10363231255505466,
      "grad_norm": 13.501580238342285,
      "learning_rate": 6.998e-05,
      "loss": 47.7641,
      "step": 3500
    },
    {
      "epoch": 0.10659323577091337,
      "grad_norm": 14.034686088562012,
      "learning_rate": 7.198e-05,
      "loss": 47.8053,
      "step": 3600
    },
    {
      "epoch": 0.10955415898677208,
      "grad_norm": 14.023058891296387,
      "learning_rate": 7.398e-05,
      "loss": 47.4925,
      "step": 3700
    },
    {
      "epoch": 0.11251508220263078,
      "grad_norm": 16.410221099853516,
      "learning_rate": 7.598e-05,
      "loss": 47.1501,
      "step": 3800
    },
    {
      "epoch": 0.11547600541848949,
      "grad_norm": 17.547571182250977,
      "learning_rate": 7.798000000000001e-05,
      "loss": 47.3132,
      "step": 3900
    },
    {
      "epoch": 0.1184369286343482,
      "grad_norm": 11.753161430358887,
      "learning_rate": 7.998e-05,
      "loss": 47.071,
      "step": 4000
    },
    {
      "epoch": 0.1184369286343482,
      "eval_loss": 5.719655990600586,
      "eval_runtime": 38.3575,
      "eval_samples_per_second": 28.182,
      "eval_steps_per_second": 7.065,
      "step": 4000
    },
    {
      "epoch": 0.12139785185020689,
      "grad_norm": 51.37761306762695,
      "learning_rate": 8.198000000000001e-05,
      "loss": 46.1728,
      "step": 4100
    },
    {
      "epoch": 0.1243587750660656,
      "grad_norm": 12.61581802368164,
      "learning_rate": 8.398e-05,
      "loss": 45.9941,
      "step": 4200
    },
    {
      "epoch": 0.1273196982819243,
      "grad_norm": 12.667135238647461,
      "learning_rate": 8.598e-05,
      "loss": 46.1649,
      "step": 4300
    },
    {
      "epoch": 0.13028062149778302,
      "grad_norm": 10.368157386779785,
      "learning_rate": 8.798e-05,
      "loss": 45.9172,
      "step": 4400
    },
    {
      "epoch": 0.13324154471364172,
      "grad_norm": 12.5702543258667,
      "learning_rate": 8.998e-05,
      "loss": 45.5541,
      "step": 4500
    },
    {
      "epoch": 0.13620246792950041,
      "grad_norm": 10.535380363464355,
      "learning_rate": 9.198e-05,
      "loss": 44.45,
      "step": 4600
    },
    {
      "epoch": 0.1391633911453591,
      "grad_norm": 11.035446166992188,
      "learning_rate": 9.398e-05,
      "loss": 44.3243,
      "step": 4700
    },
    {
      "epoch": 0.14212431436121784,
      "grad_norm": 10.718255043029785,
      "learning_rate": 9.598e-05,
      "loss": 43.8677,
      "step": 4800
    },
    {
      "epoch": 0.14508523757707653,
      "grad_norm": 9.816108703613281,
      "learning_rate": 9.798000000000001e-05,
      "loss": 43.6948,
      "step": 4900
    },
    {
      "epoch": 0.14804616079293523,
      "grad_norm": 9.837696075439453,
      "learning_rate": 9.998000000000002e-05,
      "loss": 43.6361,
      "step": 5000
    },
    {
      "epoch": 0.14804616079293523,
      "eval_loss": 5.329010486602783,
      "eval_runtime": 38.0693,
      "eval_samples_per_second": 28.396,
      "eval_steps_per_second": 7.119,
      "step": 5000
    },
    {
      "epoch": 0.15100708400879395,
      "grad_norm": 10.140490531921387,
      "learning_rate": 0.00010198,
      "loss": 42.8106,
      "step": 5100
    },
    {
      "epoch": 0.15396800722465265,
      "grad_norm": 9.330647468566895,
      "learning_rate": 0.00010398,
      "loss": 42.4439,
      "step": 5200
    },
    {
      "epoch": 0.15692893044051134,
      "grad_norm": 9.038117408752441,
      "learning_rate": 0.00010598,
      "loss": 41.8943,
      "step": 5300
    },
    {
      "epoch": 0.15988985365637007,
      "grad_norm": 10.28738021850586,
      "learning_rate": 0.00010798,
      "loss": 41.5117,
      "step": 5400
    },
    {
      "epoch": 0.16285077687222876,
      "grad_norm": 9.869328498840332,
      "learning_rate": 0.00010998,
      "loss": 41.3489,
      "step": 5500
    },
    {
      "epoch": 0.16581170008808746,
      "grad_norm": 10.985088348388672,
      "learning_rate": 0.00011198000000000001,
      "loss": 40.7585,
      "step": 5600
    },
    {
      "epoch": 0.16877262330394618,
      "grad_norm": 11.46516227722168,
      "learning_rate": 0.00011398,
      "loss": 40.2893,
      "step": 5700
    },
    {
      "epoch": 0.17173354651980488,
      "grad_norm": 9.691688537597656,
      "learning_rate": 0.00011598000000000001,
      "loss": 40.0513,
      "step": 5800
    },
    {
      "epoch": 0.17469446973566358,
      "grad_norm": 10.819178581237793,
      "learning_rate": 0.00011798,
      "loss": 39.986,
      "step": 5900
    },
    {
      "epoch": 0.17765539295152227,
      "grad_norm": 9.694029808044434,
      "learning_rate": 0.00011998,
      "loss": 39.3918,
      "step": 6000
    },
    {
      "epoch": 0.17765539295152227,
      "eval_loss": 4.900777339935303,
      "eval_runtime": 38.052,
      "eval_samples_per_second": 28.409,
      "eval_steps_per_second": 7.122,
      "step": 6000
    },
    {
      "epoch": 0.180616316167381,
      "grad_norm": 9.988055229187012,
      "learning_rate": 0.00011999900481764066,
      "loss": 39.336,
      "step": 6100
    },
    {
      "epoch": 0.1835772393832397,
      "grad_norm": 13.242379188537598,
      "learning_rate": 0.00011999597899343296,
      "loss": 39.0612,
      "step": 6200
    },
    {
      "epoch": 0.1865381625990984,
      "grad_norm": 13.935639381408691,
      "learning_rate": 0.00011999092252825071,
      "loss": 38.585,
      "step": 6300
    },
    {
      "epoch": 0.1894990858149571,
      "grad_norm": 9.50368881225586,
      "learning_rate": 0.00011998383559323646,
      "loss": 38.3112,
      "step": 6400
    },
    {
      "epoch": 0.1924600090308158,
      "grad_norm": 10.878887176513672,
      "learning_rate": 0.00011997471842825661,
      "loss": 38.3219,
      "step": 6500
    },
    {
      "epoch": 0.1954209322466745,
      "grad_norm": 10.517402648925781,
      "learning_rate": 0.00011996357134189334,
      "loss": 37.8246,
      "step": 6600
    },
    {
      "epoch": 0.19838185546253323,
      "grad_norm": 10.922290802001953,
      "learning_rate": 0.0001199503947114341,
      "loss": 37.6387,
      "step": 6700
    },
    {
      "epoch": 0.20134277867839193,
      "grad_norm": 11.845630645751953,
      "learning_rate": 0.00011993518898285887,
      "loss": 37.8343,
      "step": 6800
    },
    {
      "epoch": 0.20430370189425062,
      "grad_norm": 8.628484725952148,
      "learning_rate": 0.00011991795467082508,
      "loss": 37.5011,
      "step": 6900
    },
    {
      "epoch": 0.20726462511010932,
      "grad_norm": 9.489052772521973,
      "learning_rate": 0.00011989869235865012,
      "loss": 37.132,
      "step": 7000
    },
    {
      "epoch": 0.20726462511010932,
      "eval_loss": 4.595886707305908,
      "eval_runtime": 38.0814,
      "eval_samples_per_second": 28.387,
      "eval_steps_per_second": 7.116,
      "step": 7000
    },
    {
      "epoch": 0.21022554832596804,
      "grad_norm": 9.687568664550781,
      "learning_rate": 0.00011987740269829175,
      "loss": 36.9362,
      "step": 7100
    },
    {
      "epoch": 0.21318647154182674,
      "grad_norm": 8.676931381225586,
      "learning_rate": 0.0001198540864103258,
      "loss": 37.0267,
      "step": 7200
    },
    {
      "epoch": 0.21614739475768543,
      "grad_norm": 9.232645988464355,
      "learning_rate": 0.00011982874428392204,
      "loss": 36.5181,
      "step": 7300
    },
    {
      "epoch": 0.21910831797354416,
      "grad_norm": 8.917469024658203,
      "learning_rate": 0.00011980137717681727,
      "loss": 36.5812,
      "step": 7400
    },
    {
      "epoch": 0.22206924118940286,
      "grad_norm": 8.593257904052734,
      "learning_rate": 0.0001197719860152864,
      "loss": 36.0672,
      "step": 7500
    },
    {
      "epoch": 0.22503016440526155,
      "grad_norm": 10.630696296691895,
      "learning_rate": 0.00011974057179411103,
      "loss": 36.2405,
      "step": 7600
    },
    {
      "epoch": 0.22799108762112028,
      "grad_norm": 9.975415229797363,
      "learning_rate": 0.00011970713557654582,
      "loss": 35.9903,
      "step": 7700
    },
    {
      "epoch": 0.23095201083697897,
      "grad_norm": 8.622698783874512,
      "learning_rate": 0.00011967167849428251,
      "loss": 35.8196,
      "step": 7800
    },
    {
      "epoch": 0.23391293405283767,
      "grad_norm": 14.828067779541016,
      "learning_rate": 0.00011963420174741161,
      "loss": 35.7946,
      "step": 7900
    },
    {
      "epoch": 0.2368738572686964,
      "grad_norm": 9.303028106689453,
      "learning_rate": 0.00011959470660438173,
      "loss": 35.5493,
      "step": 8000
    },
    {
      "epoch": 0.2368738572686964,
      "eval_loss": 4.408100128173828,
      "eval_runtime": 37.807,
      "eval_samples_per_second": 28.593,
      "eval_steps_per_second": 7.168,
      "step": 8000
    },
    {
      "epoch": 0.2398347804845551,
      "grad_norm": 11.987268447875977,
      "learning_rate": 0.00011955319440195674,
      "loss": 35.6014,
      "step": 8100
    },
    {
      "epoch": 0.24279570370041378,
      "grad_norm": 10.032620429992676,
      "learning_rate": 0.00011950966654517043,
      "loss": 35.5302,
      "step": 8200
    },
    {
      "epoch": 0.24575662691627248,
      "grad_norm": 9.362653732299805,
      "learning_rate": 0.00011946412450727906,
      "loss": 35.2124,
      "step": 8300
    },
    {
      "epoch": 0.2487175501321312,
      "grad_norm": 9.706056594848633,
      "learning_rate": 0.00011941656982971138,
      "loss": 34.9229,
      "step": 8400
    },
    {
      "epoch": 0.25167847334798993,
      "grad_norm": 10.424148559570312,
      "learning_rate": 0.00011936700412201653,
      "loss": 35.1602,
      "step": 8500
    },
    {
      "epoch": 0.2546393965638486,
      "grad_norm": 10.900792121887207,
      "learning_rate": 0.00011931542906180957,
      "loss": 34.9212,
      "step": 8600
    },
    {
      "epoch": 0.2576003197797073,
      "grad_norm": 10.541563034057617,
      "learning_rate": 0.00011926184639471465,
      "loss": 34.8347,
      "step": 8700
    },
    {
      "epoch": 0.26056124299556604,
      "grad_norm": 8.576896667480469,
      "learning_rate": 0.00011920625793430596,
      "loss": 34.9933,
      "step": 8800
    },
    {
      "epoch": 0.2635221662114247,
      "grad_norm": 10.162493705749512,
      "learning_rate": 0.00011914866556204637,
      "loss": 34.3925,
      "step": 8900
    },
    {
      "epoch": 0.26648308942728344,
      "grad_norm": 11.247607231140137,
      "learning_rate": 0.0001190890712272237,
      "loss": 34.4828,
      "step": 9000
    },
    {
      "epoch": 0.26648308942728344,
      "eval_loss": 4.2549567222595215,
      "eval_runtime": 37.961,
      "eval_samples_per_second": 28.477,
      "eval_steps_per_second": 7.139,
      "step": 9000
    },
    {
      "epoch": 0.2694440126431421,
      "grad_norm": 9.189545631408691,
      "learning_rate": 0.00011902747694688472,
      "loss": 34.3655,
      "step": 9100
    },
    {
      "epoch": 0.27240493585900083,
      "grad_norm": 11.199912071228027,
      "learning_rate": 0.000118963884805767,
      "loss": 34.4358,
      "step": 9200
    },
    {
      "epoch": 0.27536585907485955,
      "grad_norm": 9.673705101013184,
      "learning_rate": 0.00011889829695622823,
      "loss": 34.3689,
      "step": 9300
    },
    {
      "epoch": 0.2783267822907182,
      "grad_norm": 10.03848934173584,
      "learning_rate": 0.00011883071561817344,
      "loss": 33.9158,
      "step": 9400
    },
    {
      "epoch": 0.28128770550657695,
      "grad_norm": 11.581180572509766,
      "learning_rate": 0.00011876114307897981,
      "loss": 33.992,
      "step": 9500
    },
    {
      "epoch": 0.28424862872243567,
      "grad_norm": 10.81711483001709,
      "learning_rate": 0.00011868958169341929,
      "loss": 34.1195,
      "step": 9600
    },
    {
      "epoch": 0.28720955193829434,
      "grad_norm": 9.648648262023926,
      "learning_rate": 0.00011861603388357893,
      "loss": 34.1664,
      "step": 9700
    },
    {
      "epoch": 0.29017047515415306,
      "grad_norm": 11.37558364868164,
      "learning_rate": 0.00011854050213877877,
      "loss": 33.9937,
      "step": 9800
    },
    {
      "epoch": 0.2931313983700118,
      "grad_norm": 9.346961975097656,
      "learning_rate": 0.0001184629890154878,
      "loss": 33.6917,
      "step": 9900
    },
    {
      "epoch": 0.29609232158587045,
      "grad_norm": 11.31644058227539,
      "learning_rate": 0.0001183834971372372,
      "loss": 33.7808,
      "step": 10000
    },
    {
      "epoch": 0.29609232158587045,
      "eval_loss": 4.157764434814453,
      "eval_runtime": 39.9196,
      "eval_samples_per_second": 27.079,
      "eval_steps_per_second": 6.789,
      "step": 10000
    },
    {
      "epoch": 0.2990532448017292,
      "grad_norm": 18.920991897583008,
      "learning_rate": 0.00011180531798567065,
      "loss": 44.0571,
      "step": 10100
    },
    {
      "epoch": 0.3020141680175879,
      "grad_norm": 15.393646240234375,
      "learning_rate": 0.0001116461207502148,
      "loss": 39.8888,
      "step": 10200
    },
    {
      "epoch": 0.30497509123344657,
      "grad_norm": 13.2774076461792,
      "learning_rate": 0.00011148550761026972,
      "loss": 38.6529,
      "step": 10300
    },
    {
      "epoch": 0.3079360144493053,
      "grad_norm": 13.391098976135254,
      "learning_rate": 0.00011132348296912578,
      "loss": 37.759,
      "step": 10400
    },
    {
      "epoch": 0.310896937665164,
      "grad_norm": 11.917950630187988,
      "learning_rate": 0.00011116005126877037,
      "loss": 37.1968,
      "step": 10500
    },
    {
      "epoch": 0.3138578608810227,
      "grad_norm": 11.100213050842285,
      "learning_rate": 0.0001109952169897661,
      "loss": 37.213,
      "step": 10600
    },
    {
      "epoch": 0.3168187840968814,
      "grad_norm": 14.579487800598145,
      "learning_rate": 0.00011082898465112802,
      "loss": 36.7415,
      "step": 10700
    },
    {
      "epoch": 0.31977970731274014,
      "grad_norm": 11.359614372253418,
      "learning_rate": 0.00011066135881019965,
      "loss": 36.361,
      "step": 10800
    },
    {
      "epoch": 0.3227406305285988,
      "grad_norm": 12.316486358642578,
      "learning_rate": 0.00011049234406252809,
      "loss": 36.0591,
      "step": 10900
    },
    {
      "epoch": 0.32570155374445753,
      "grad_norm": 13.45693588256836,
      "learning_rate": 0.00011032194504173804,
      "loss": 35.6357,
      "step": 11000
    },
    {
      "epoch": 0.32570155374445753,
      "eval_loss": 4.41249418258667,
      "eval_runtime": 39.8412,
      "eval_samples_per_second": 27.133,
      "eval_steps_per_second": 6.802,
      "step": 11000
    },
    {
      "epoch": 0.32866247696031625,
      "grad_norm": 10.018808364868164,
      "learning_rate": 0.00011412090151135696,
      "loss": 33.7098,
      "step": 11100
    },
    {
      "epoch": 0.3316234001761749,
      "grad_norm": 10.30320930480957,
      "learning_rate": 0.00011397731809339621,
      "loss": 33.7831,
      "step": 11200
    },
    {
      "epoch": 0.33458432339203364,
      "grad_norm": 9.650611877441406,
      "learning_rate": 0.0001138320949911399,
      "loss": 33.5415,
      "step": 11300
    },
    {
      "epoch": 0.33754524660789237,
      "grad_norm": 8.77065372467041,
      "learning_rate": 0.0001136852366160714,
      "loss": 33.2261,
      "step": 11400
    },
    {
      "epoch": 0.34050616982375104,
      "grad_norm": 12.062385559082031,
      "learning_rate": 0.00011353674742934919,
      "loss": 33.0819,
      "step": 11500
    },
    {
      "epoch": 0.34346709303960976,
      "grad_norm": 10.947739601135254,
      "learning_rate": 0.00011338663194167138,
      "loss": 33.3451,
      "step": 11600
    },
    {
      "epoch": 0.34642801625546843,
      "grad_norm": 9.377535820007324,
      "learning_rate": 0.00011323489471313875,
      "loss": 32.8928,
      "step": 11700
    },
    {
      "epoch": 0.34938893947132715,
      "grad_norm": 8.902270317077637,
      "learning_rate": 0.00011308154035311608,
      "loss": 33.1756,
      "step": 11800
    },
    {
      "epoch": 0.3523498626871859,
      "grad_norm": 10.434513092041016,
      "learning_rate": 0.00011292657352009224,
      "loss": 33.1595,
      "step": 11900
    },
    {
      "epoch": 0.35531078590304455,
      "grad_norm": 11.084539413452148,
      "learning_rate": 0.00011276999892153867,
      "loss": 33.359,
      "step": 12000
    },
    {
      "epoch": 0.35531078590304455,
      "eval_loss": 4.073917865753174,
      "eval_runtime": 38.304,
      "eval_samples_per_second": 28.222,
      "eval_steps_per_second": 7.075,
      "step": 12000
    },
    {
      "epoch": 0.35827170911890327,
      "grad_norm": 7.943862438201904,
      "learning_rate": 9.143653002276282e-05,
      "loss": 32.5648,
      "step": 12100
    },
    {
      "epoch": 0.361232632334762,
      "grad_norm": 8.098073959350586,
      "learning_rate": 9.09346201340685e-05,
      "loss": 32.1551,
      "step": 12200
    },
    {
      "epoch": 0.36419355555062066,
      "grad_norm": 7.46992826461792,
      "learning_rate": 9.042974429385753e-05,
      "loss": 32.3569,
      "step": 12300
    },
    {
      "epoch": 0.3671544787664794,
      "grad_norm": 7.480947971343994,
      "learning_rate": 8.992195090864853e-05,
      "loss": 32.4467,
      "step": 12400
    },
    {
      "epoch": 0.3701154019823381,
      "grad_norm": 7.488786220550537,
      "learning_rate": 8.941128866468864e-05,
      "loss": 32.4447,
      "step": 12500
    },
    {
      "epoch": 0.3730763251981968,
      "grad_norm": 8.124217987060547,
      "learning_rate": 8.889780652328559e-05,
      "loss": 32.3657,
      "step": 12600
    },
    {
      "epoch": 0.3760372484140555,
      "grad_norm": 8.322397232055664,
      "learning_rate": 8.83815537161135e-05,
      "loss": 31.9431,
      "step": 12700
    },
    {
      "epoch": 0.3789981716299142,
      "grad_norm": 8.59915828704834,
      "learning_rate": 8.786257974049245e-05,
      "loss": 31.9211,
      "step": 12800
    },
    {
      "epoch": 0.3819590948457729,
      "grad_norm": 8.048558235168457,
      "learning_rate": 8.734093435464301e-05,
      "loss": 32.437,
      "step": 12900
    },
    {
      "epoch": 0.3849200180616316,
      "grad_norm": 7.816276550292969,
      "learning_rate": 8.681666757291531e-05,
      "loss": 32.0396,
      "step": 13000
    },
    {
      "epoch": 0.3849200180616316,
      "eval_loss": 3.9447479248046875,
      "eval_runtime": 112.3499,
      "eval_samples_per_second": 9.622,
      "eval_steps_per_second": 2.412,
      "step": 13000
    },
    {
      "epoch": 0.38788094127749034,
      "grad_norm": 8.613288879394531,
      "learning_rate": 8.628982966099388e-05,
      "loss": 31.874,
      "step": 13100
    },
    {
      "epoch": 0.390841864493349,
      "grad_norm": 7.478573799133301,
      "learning_rate": 8.576047113107821e-05,
      "loss": 31.7233,
      "step": 13200
    },
    {
      "epoch": 0.39380278770920774,
      "grad_norm": 7.845474720001221,
      "learning_rate": 8.52286427370398e-05,
      "loss": 31.628,
      "step": 13300
    },
    {
      "epoch": 0.39676371092506646,
      "grad_norm": 7.7132158279418945,
      "learning_rate": 8.469439546955592e-05,
      "loss": 31.8516,
      "step": 13400
    },
    {
      "epoch": 0.39972463414092513,
      "grad_norm": 9.245190620422363,
      "learning_rate": 8.415778055122073e-05,
      "loss": 31.8406,
      "step": 13500
    },
    {
      "epoch": 0.40268555735678385,
      "grad_norm": 8.426488876342773,
      "learning_rate": 8.361884943163423e-05,
      "loss": 31.7148,
      "step": 13600
    },
    {
      "epoch": 0.4056464805726426,
      "grad_norm": 7.879675388336182,
      "learning_rate": 8.307765378246925e-05,
      "loss": 31.9798,
      "step": 13700
    },
    {
      "epoch": 0.40860740378850124,
      "grad_norm": 8.469719886779785,
      "learning_rate": 8.253424549251735e-05,
      "loss": 31.6741,
      "step": 13800
    },
    {
      "epoch": 0.41156832700435997,
      "grad_norm": 8.198810577392578,
      "learning_rate": 8.198867666271385e-05,
      "loss": 31.6722,
      "step": 13900
    },
    {
      "epoch": 0.41452925022021864,
      "grad_norm": 7.881684303283691,
      "learning_rate": 8.144099960114239e-05,
      "loss": 31.8682,
      "step": 14000
    },
    {
      "epoch": 0.41452925022021864,
      "eval_loss": 3.904888153076172,
      "eval_runtime": 110.4703,
      "eval_samples_per_second": 9.785,
      "eval_steps_per_second": 2.453,
      "step": 14000
    },
    {
      "epoch": 0.41749017343607736,
      "grad_norm": 7.772391319274902,
      "learning_rate": 8.089126681801981e-05,
      "loss": 32.0349,
      "step": 14100
    },
    {
      "epoch": 0.4204510966519361,
      "grad_norm": 8.459504127502441,
      "learning_rate": 8.033953102066161e-05,
      "loss": 31.5844,
      "step": 14200
    },
    {
      "epoch": 0.42341201986779475,
      "grad_norm": 7.765544414520264,
      "learning_rate": 7.978584510842833e-05,
      "loss": 31.6879,
      "step": 14300
    },
    {
      "epoch": 0.4263729430836535,
      "grad_norm": 8.06749153137207,
      "learning_rate": 7.923026216765381e-05,
      "loss": 31.5893,
      "step": 14400
    },
    {
      "epoch": 0.4293338662995122,
      "grad_norm": 8.966425895690918,
      "learning_rate": 7.86728354665553e-05,
      "loss": 31.392,
      "step": 14500
    },
    {
      "epoch": 0.43229478951537087,
      "grad_norm": 8.47319221496582,
      "learning_rate": 7.81136184501262e-05,
      "loss": 31.3068,
      "step": 14600
    },
    {
      "epoch": 0.4352557127312296,
      "grad_norm": 8.642230033874512,
      "learning_rate": 7.755266473501193e-05,
      "loss": 31.5877,
      "step": 14700
    },
    {
      "epoch": 0.4382166359470883,
      "grad_norm": 8.412428855895996,
      "learning_rate": 7.699002810436915e-05,
      "loss": 31.6239,
      "step": 14800
    },
    {
      "epoch": 0.441177559162947,
      "grad_norm": 6.971558094024658,
      "learning_rate": 7.642576250270929e-05,
      "loss": 31.7946,
      "step": 14900
    },
    {
      "epoch": 0.4441384823788057,
      "grad_norm": 7.922480583190918,
      "learning_rate": 7.585992203072628e-05,
      "loss": 31.4474,
      "step": 15000
    },
    {
      "epoch": 0.4441384823788057,
      "eval_loss": 3.875948667526245,
      "eval_runtime": 109.3049,
      "eval_samples_per_second": 9.89,
      "eval_steps_per_second": 2.479,
      "step": 15000
    },
    {
      "epoch": 0.44709940559466443,
      "grad_norm": 8.747485160827637,
      "learning_rate": 7.529256094010965e-05,
      "loss": 31.6016,
      "step": 15100
    },
    {
      "epoch": 0.4500603288105231,
      "grad_norm": 8.723346710205078,
      "learning_rate": 7.472373362834283e-05,
      "loss": 31.2744,
      "step": 15200
    },
    {
      "epoch": 0.4530212520263818,
      "grad_norm": 8.310611724853516,
      "learning_rate": 7.415349463348775e-05,
      "loss": 31.7448,
      "step": 15300
    },
    {
      "epoch": 0.45598217524224055,
      "grad_norm": 8.236388206481934,
      "learning_rate": 7.358189862895577e-05,
      "loss": 30.9859,
      "step": 15400
    },
    {
      "epoch": 0.4589430984580992,
      "grad_norm": 8.104386329650879,
      "learning_rate": 7.300900041826566e-05,
      "loss": 31.1935,
      "step": 15500
    },
    {
      "epoch": 0.46190402167395794,
      "grad_norm": 8.219923973083496,
      "learning_rate": 7.243485492978928e-05,
      "loss": 30.9099,
      "step": 15600
    },
    {
      "epoch": 0.46486494488981667,
      "grad_norm": 8.872945785522461,
      "learning_rate": 7.185951721148502e-05,
      "loss": 31.3423,
      "step": 15700
    },
    {
      "epoch": 0.46782586810567534,
      "grad_norm": 8.087647438049316,
      "learning_rate": 7.128304242561999e-05,
      "loss": 31.1816,
      "step": 15800
    },
    {
      "epoch": 0.47078679132153406,
      "grad_norm": 8.805392265319824,
      "learning_rate": 7.070548584348108e-05,
      "loss": 31.0977,
      "step": 15900
    },
    {
      "epoch": 0.4737477145373928,
      "grad_norm": 8.469452857971191,
      "learning_rate": 7.012690284007577e-05,
      "loss": 31.5828,
      "step": 16000
    },
    {
      "epoch": 0.4737477145373928,
      "eval_loss": 3.8530030250549316,
      "eval_runtime": 109.275,
      "eval_samples_per_second": 9.892,
      "eval_steps_per_second": 2.48,
      "step": 16000
    },
    {
      "epoch": 0.47670863775325145,
      "grad_norm": 8.871159553527832,
      "learning_rate": 6.954734888882281e-05,
      "loss": 30.9753,
      "step": 16100
    },
    {
      "epoch": 0.4796695609691102,
      "grad_norm": 8.81116008758545,
      "learning_rate": 6.896687955623357e-05,
      "loss": 31.2067,
      "step": 16200
    },
    {
      "epoch": 0.4826304841849689,
      "grad_norm": 7.77982759475708,
      "learning_rate": 6.838555049658432e-05,
      "loss": 31.089,
      "step": 16300
    },
    {
      "epoch": 0.48559140740082757,
      "grad_norm": 8.370245933532715,
      "learning_rate": 6.780341744658044e-05,
      "loss": 30.9776,
      "step": 16400
    },
    {
      "epoch": 0.4885523306166863,
      "grad_norm": 8.41613483428955,
      "learning_rate": 6.722053622001221e-05,
      "loss": 31.1095,
      "step": 16500
    },
    {
      "epoch": 0.49151325383254496,
      "grad_norm": 7.951696395874023,
      "learning_rate": 6.663696270240373e-05,
      "loss": 31.1532,
      "step": 16600
    },
    {
      "epoch": 0.4944741770484037,
      "grad_norm": 9.02717113494873,
      "learning_rate": 6.60527528456546e-05,
      "loss": 31.0777,
      "step": 16700
    },
    {
      "epoch": 0.4974351002642624,
      "grad_norm": 8.57259750366211,
      "learning_rate": 6.546796266267535e-05,
      "loss": 31.3509,
      "step": 16800
    },
    {
      "epoch": 0.5003960234801211,
      "grad_norm": 9.129491806030273,
      "learning_rate": 6.488264822201711e-05,
      "loss": 30.7844,
      "step": 16900
    },
    {
      "epoch": 0.5033569466959799,
      "grad_norm": 8.600064277648926,
      "learning_rate": 6.429686564249579e-05,
      "loss": 31.1164,
      "step": 17000
    },
    {
      "epoch": 0.5033569466959799,
      "eval_loss": 3.836409091949463,
      "eval_runtime": 109.0903,
      "eval_samples_per_second": 9.909,
      "eval_steps_per_second": 2.484,
      "step": 17000
    },
    {
      "epoch": 0.5063178699118385,
      "grad_norm": 8.62096881866455,
      "learning_rate": 6.371067108781158e-05,
      "loss": 31.1944,
      "step": 17100
    },
    {
      "epoch": 0.5092787931276972,
      "grad_norm": 8.052851676940918,
      "learning_rate": 6.312412076116401e-05,
      "loss": 31.0126,
      "step": 17200
    },
    {
      "epoch": 0.5122397163435559,
      "grad_norm": 8.32268238067627,
      "learning_rate": 6.253727089986337e-05,
      "loss": 31.0692,
      "step": 17300
    },
    {
      "epoch": 0.5152006395594146,
      "grad_norm": 8.130902290344238,
      "learning_rate": 6.195017776993876e-05,
      "loss": 30.9143,
      "step": 17400
    },
    {
      "epoch": 0.5181615627752734,
      "grad_norm": 9.245232582092285,
      "learning_rate": 6.136289766074334e-05,
      "loss": 31.0029,
      "step": 17500
    },
    {
      "epoch": 0.5211224859911321,
      "grad_norm": 8.296626091003418,
      "learning_rate": 6.077548687955759e-05,
      "loss": 31.0624,
      "step": 17600
    },
    {
      "epoch": 0.5240834092069907,
      "grad_norm": 8.933104515075684,
      "learning_rate": 6.018800174619048e-05,
      "loss": 31.0619,
      "step": 17700
    },
    {
      "epoch": 0.5270443324228494,
      "grad_norm": 7.37945032119751,
      "learning_rate": 5.960049858757974e-05,
      "loss": 31.3181,
      "step": 17800
    },
    {
      "epoch": 0.5300052556387082,
      "grad_norm": 8.817550659179688,
      "learning_rate": 5.901303373239133e-05,
      "loss": 30.8424,
      "step": 17900
    },
    {
      "epoch": 0.5329661788545669,
      "grad_norm": 7.71854305267334,
      "learning_rate": 5.842566350561879e-05,
      "loss": 31.0376,
      "step": 18000
    },
    {
      "epoch": 0.5329661788545669,
      "eval_loss": 3.822613477706909,
      "eval_runtime": 112.0979,
      "eval_samples_per_second": 9.643,
      "eval_steps_per_second": 2.418,
      "step": 18000
    },
    {
      "epoch": 0.5359271020704256,
      "grad_norm": 8.84870719909668,
      "learning_rate": 5.7838444223182826e-05,
      "loss": 30.8901,
      "step": 18100
    },
    {
      "epoch": 0.5388880252862842,
      "grad_norm": 7.48129415512085,
      "learning_rate": 5.725143218653187e-05,
      "loss": 31.0275,
      "step": 18200
    },
    {
      "epoch": 0.5418489485021429,
      "grad_norm": 8.218484878540039,
      "learning_rate": 5.666468367724412e-05,
      "loss": 31.1443,
      "step": 18300
    },
    {
      "epoch": 0.5448098717180017,
      "grad_norm": 9.589841842651367,
      "learning_rate": 5.607825495163119e-05,
      "loss": 30.9756,
      "step": 18400
    },
    {
      "epoch": 0.5477707949338604,
      "grad_norm": 8.583683013916016,
      "learning_rate": 5.549220223534451e-05,
      "loss": 31.0641,
      "step": 18500
    },
    {
      "epoch": 0.5507317181497191,
      "grad_norm": 7.978188991546631,
      "learning_rate": 5.490658171798439e-05,
      "loss": 30.8899,
      "step": 18600
    },
    {
      "epoch": 0.5536926413655778,
      "grad_norm": 8.130802154541016,
      "learning_rate": 5.432144954771287e-05,
      "loss": 31.0812,
      "step": 18700
    },
    {
      "epoch": 0.5566535645814364,
      "grad_norm": 8.981709480285645,
      "learning_rate": 5.37368618258701e-05,
      "loss": 31.0612,
      "step": 18800
    },
    {
      "epoch": 0.5596144877972952,
      "grad_norm": 7.87661075592041,
      "learning_rate": 5.315287460159561e-05,
      "loss": 30.8581,
      "step": 18900
    },
    {
      "epoch": 0.5625754110131539,
      "grad_norm": 8.329483032226562,
      "learning_rate": 5.256954386645438e-05,
      "loss": 31.1805,
      "step": 19000
    },
    {
      "epoch": 0.5625754110131539,
      "eval_loss": 3.8131661415100098,
      "eval_runtime": 111.5683,
      "eval_samples_per_second": 9.689,
      "eval_steps_per_second": 2.429,
      "step": 19000
    },
    {
      "epoch": 0.5655363342290126,
      "grad_norm": 8.833015441894531,
      "learning_rate": 5.198692554906851e-05,
      "loss": 30.9231,
      "step": 19100
    },
    {
      "epoch": 0.5684972574448713,
      "grad_norm": 7.966989994049072,
      "learning_rate": 5.1405075509754834e-05,
      "loss": 31.0225,
      "step": 19200
    },
    {
      "epoch": 0.5714581806607301,
      "grad_norm": 8.791169166564941,
      "learning_rate": 5.0824049535169166e-05,
      "loss": 31.1551,
      "step": 19300
    },
    {
      "epoch": 0.5744191038765887,
      "grad_norm": 7.9680023193359375,
      "learning_rate": 5.024390333295761e-05,
      "loss": 31.0498,
      "step": 19400
    },
    {
      "epoch": 0.5773800270924474,
      "grad_norm": 8.603718757629395,
      "learning_rate": 4.966469252641538e-05,
      "loss": 30.9017,
      "step": 19500
    },
    {
      "epoch": 0.5803409503083061,
      "grad_norm": 12.401627540588379,
      "learning_rate": 4.908647264915378e-05,
      "loss": 30.9988,
      "step": 19600
    },
    {
      "epoch": 0.5833018735241648,
      "grad_norm": 8.433266639709473,
      "learning_rate": 4.8509299139775734e-05,
      "loss": 30.9905,
      "step": 19700
    },
    {
      "epoch": 0.5862627967400236,
      "grad_norm": 7.99282693862915,
      "learning_rate": 4.7933227336560414e-05,
      "loss": 31.0604,
      "step": 19800
    },
    {
      "epoch": 0.5892237199558823,
      "grad_norm": 8.011063575744629,
      "learning_rate": 4.735831247215753e-05,
      "loss": 30.7471,
      "step": 19900
    },
    {
      "epoch": 0.5921846431717409,
      "grad_norm": 9.603862762451172,
      "learning_rate": 4.67846096682918e-05,
      "loss": 30.8428,
      "step": 20000
    },
    {
      "epoch": 0.5921846431717409,
      "eval_loss": 3.8060901165008545,
      "eval_runtime": 112.6154,
      "eval_samples_per_second": 9.599,
      "eval_steps_per_second": 2.406,
      "step": 20000
    },
    {
      "epoch": 0.5951455663875996,
      "grad_norm": 8.427188873291016,
      "learning_rate": 4.6212173930477874e-05,
      "loss": 30.8438,
      "step": 20100
    },
    {
      "epoch": 0.5981064896034584,
      "grad_norm": 7.692320346832275,
      "learning_rate": 4.5641060142746556e-05,
      "loss": 30.7664,
      "step": 20200
    },
    {
      "epoch": 0.6010674128193171,
      "grad_norm": 8.596179962158203,
      "learning_rate": 4.507132306238262e-05,
      "loss": 30.9387,
      "step": 20300
    },
    {
      "epoch": 0.6040283360351758,
      "grad_norm": 8.076534271240234,
      "learning_rate": 4.450301731467488e-05,
      "loss": 30.851,
      "step": 20400
    },
    {
      "epoch": 0.6069892592510344,
      "grad_norm": 9.05728816986084,
      "learning_rate": 4.3936197387678665e-05,
      "loss": 30.7486,
      "step": 20500
    },
    {
      "epoch": 0.6099501824668931,
      "grad_norm": 8.477595329284668,
      "learning_rate": 4.3370917626991706e-05,
      "loss": 30.6843,
      "step": 20600
    },
    {
      "epoch": 0.6129111056827519,
      "grad_norm": 8.171915054321289,
      "learning_rate": 4.2807232230543625e-05,
      "loss": 30.9551,
      "step": 20700
    },
    {
      "epoch": 0.6158720288986106,
      "grad_norm": 8.333806991577148,
      "learning_rate": 4.22451952433994e-05,
      "loss": 30.8566,
      "step": 20800
    },
    {
      "epoch": 0.6188329521144693,
      "grad_norm": 7.9477715492248535,
      "learning_rate": 4.168486055257777e-05,
      "loss": 30.8577,
      "step": 20900
    },
    {
      "epoch": 0.621793875330328,
      "grad_norm": 8.560218811035156,
      "learning_rate": 4.112628188188457e-05,
      "loss": 30.7203,
      "step": 21000
    },
    {
      "epoch": 0.621793875330328,
      "eval_loss": 3.7986109256744385,
      "eval_runtime": 109.4771,
      "eval_samples_per_second": 9.874,
      "eval_steps_per_second": 2.475,
      "step": 21000
    },
    {
      "epoch": 0.6247547985461867,
      "grad_norm": 8.963776588439941,
      "learning_rate": 4.056951278676187e-05,
      "loss": 30.9418,
      "step": 21100
    },
    {
      "epoch": 0.6277157217620454,
      "grad_norm": 8.338837623596191,
      "learning_rate": 4.001460664915308e-05,
      "loss": 30.756,
      "step": 21200
    },
    {
      "epoch": 0.6306766449779041,
      "grad_norm": 8.323155403137207,
      "learning_rate": 3.946161667238485e-05,
      "loss": 30.6959,
      "step": 21300
    },
    {
      "epoch": 0.6336375681937628,
      "grad_norm": 9.881996154785156,
      "learning_rate": 3.8910595876066085e-05,
      "loss": 30.9333,
      "step": 21400
    },
    {
      "epoch": 0.6365984914096215,
      "grad_norm": 8.089996337890625,
      "learning_rate": 3.836159709100446e-05,
      "loss": 30.6899,
      "step": 21500
    },
    {
      "epoch": 0.6395594146254803,
      "grad_norm": 7.9427289962768555,
      "learning_rate": 3.7814672954141055e-05,
      "loss": 30.8046,
      "step": 21600
    },
    {
      "epoch": 0.6425203378413389,
      "grad_norm": 8.468146324157715,
      "learning_rate": 3.7269875903503826e-05,
      "loss": 31.2292,
      "step": 21700
    },
    {
      "epoch": 0.6454812610571976,
      "grad_norm": 8.63842487335205,
      "learning_rate": 3.672725817317973e-05,
      "loss": 30.7721,
      "step": 21800
    },
    {
      "epoch": 0.6484421842730563,
      "grad_norm": 8.145241737365723,
      "learning_rate": 3.6186871788306674e-05,
      "loss": 30.5881,
      "step": 21900
    },
    {
      "epoch": 0.6514031074889151,
      "grad_norm": 8.194993019104004,
      "learning_rate": 3.5648768560085604e-05,
      "loss": 30.9425,
      "step": 22000
    },
    {
      "epoch": 0.6514031074889151,
      "eval_loss": 3.7950870990753174,
      "eval_runtime": 109.4264,
      "eval_samples_per_second": 9.879,
      "eval_steps_per_second": 2.477,
      "step": 22000
    },
    {
      "epoch": 0.6543640307047738,
      "grad_norm": 9.304323196411133,
      "learning_rate": 3.511300008081273e-05,
      "loss": 30.722,
      "step": 22100
    },
    {
      "epoch": 0.6573249539206325,
      "grad_norm": 7.82930850982666,
      "learning_rate": 3.4579617718933054e-05,
      "loss": 30.7943,
      "step": 22200
    },
    {
      "epoch": 0.6602858771364911,
      "grad_norm": 7.912548542022705,
      "learning_rate": 3.4048672614115294e-05,
      "loss": 30.8451,
      "step": 22300
    },
    {
      "epoch": 0.6632468003523498,
      "grad_norm": 8.46181583404541,
      "learning_rate": 3.352021567234869e-05,
      "loss": 30.9009,
      "step": 22400
    },
    {
      "epoch": 0.6662077235682086,
      "grad_norm": 7.727646827697754,
      "learning_rate": 3.299429756106215e-05,
      "loss": 30.8281,
      "step": 22500
    },
    {
      "epoch": 0.6691686467840673,
      "grad_norm": 8.119136810302734,
      "learning_rate": 3.247096870426649e-05,
      "loss": 30.7757,
      "step": 22600
    },
    {
      "epoch": 0.672129569999926,
      "grad_norm": 8.091607093811035,
      "learning_rate": 3.195027927771982e-05,
      "loss": 30.8661,
      "step": 22700
    },
    {
      "epoch": 0.6750904932157847,
      "grad_norm": 7.598474979400635,
      "learning_rate": 3.1432279204116776e-05,
      "loss": 30.6257,
      "step": 22800
    },
    {
      "epoch": 0.6780514164316434,
      "grad_norm": 9.547100067138672,
      "learning_rate": 3.091701814830198e-05,
      "loss": 30.8582,
      "step": 22900
    },
    {
      "epoch": 0.6810123396475021,
      "grad_norm": 7.637078762054443,
      "learning_rate": 3.0404545512508415e-05,
      "loss": 30.9432,
      "step": 23000
    },
    {
      "epoch": 0.6810123396475021,
      "eval_loss": 3.791748285293579,
      "eval_runtime": 109.2867,
      "eval_samples_per_second": 9.891,
      "eval_steps_per_second": 2.48,
      "step": 23000
    },
    {
      "epoch": 0.6839732628633608,
      "grad_norm": 8.485209465026855,
      "learning_rate": 2.98949104316207e-05,
      "loss": 30.921,
      "step": 23100
    },
    {
      "epoch": 0.6869341860792195,
      "grad_norm": 7.777042865753174,
      "learning_rate": 2.938816176846421e-05,
      "loss": 30.8116,
      "step": 23200
    },
    {
      "epoch": 0.6898951092950782,
      "grad_norm": 7.6587138175964355,
      "learning_rate": 2.8884348109120106e-05,
      "loss": 30.7965,
      "step": 23300
    },
    {
      "epoch": 0.6928560325109369,
      "grad_norm": 8.276775360107422,
      "learning_rate": 2.8383517758267178e-05,
      "loss": 30.6582,
      "step": 23400
    },
    {
      "epoch": 0.6958169557267956,
      "grad_norm": 7.5494771003723145,
      "learning_rate": 2.7885718734550257e-05,
      "loss": 30.6483,
      "step": 23500
    },
    {
      "epoch": 0.6987778789426543,
      "grad_norm": 7.938130855560303,
      "learning_rate": 2.739099876597646e-05,
      "loss": 30.529,
      "step": 23600
    },
    {
      "epoch": 0.701738802158513,
      "grad_norm": 8.202885627746582,
      "learning_rate": 2.6899405285339026e-05,
      "loss": 30.825,
      "step": 23700
    },
    {
      "epoch": 0.7046997253743718,
      "grad_norm": 8.393240928649902,
      "learning_rate": 2.6410985425669622e-05,
      "loss": 30.7867,
      "step": 23800
    },
    {
      "epoch": 0.7076606485902305,
      "grad_norm": 8.32459831237793,
      "learning_rate": 2.5925786015719207e-05,
      "loss": 30.7898,
      "step": 23900
    },
    {
      "epoch": 0.7106215718060891,
      "grad_norm": 9.323598861694336,
      "learning_rate": 2.544385357546831e-05,
      "loss": 30.5684,
      "step": 24000
    },
    {
      "epoch": 0.7106215718060891,
      "eval_loss": 3.789947271347046,
      "eval_runtime": 110.7686,
      "eval_samples_per_second": 9.759,
      "eval_steps_per_second": 2.447,
      "step": 24000
    },
    {
      "epoch": 0.7135824950219478,
      "grad_norm": 8.184738159179688,
      "learning_rate": 2.4965234311666717e-05,
      "loss": 30.7187,
      "step": 24100
    },
    {
      "epoch": 0.7165434182378065,
      "grad_norm": 7.82784366607666,
      "learning_rate": 2.4489974113403275e-05,
      "loss": 30.705,
      "step": 24200
    },
    {
      "epoch": 0.7195043414536653,
      "grad_norm": 7.945186614990234,
      "learning_rate": 2.4018118547706078e-05,
      "loss": 30.4846,
      "step": 24300
    },
    {
      "epoch": 0.722465264669524,
      "grad_norm": 9.277371406555176,
      "learning_rate": 2.3549712855173688e-05,
      "loss": 30.6765,
      "step": 24400
    },
    {
      "epoch": 0.7254261878853827,
      "grad_norm": 8.619938850402832,
      "learning_rate": 2.3084801945637512e-05,
      "loss": 30.6503,
      "step": 24500
    },
    {
      "epoch": 0.7283871111012413,
      "grad_norm": 8.467925071716309,
      "learning_rate": 2.262343039385585e-05,
      "loss": 30.957,
      "step": 24600
    },
    {
      "epoch": 0.7313480343171,
      "grad_norm": 8.035057067871094,
      "learning_rate": 2.216564243524035e-05,
      "loss": 30.6764,
      "step": 24700
    },
    {
      "epoch": 0.7343089575329588,
      "grad_norm": 7.555221080780029,
      "learning_rate": 2.1711481961614565e-05,
      "loss": 30.7666,
      "step": 24800
    },
    {
      "epoch": 0.7372698807488175,
      "grad_norm": 7.959348201751709,
      "learning_rate": 2.1260992517005892e-05,
      "loss": 30.8212,
      "step": 24900
    },
    {
      "epoch": 0.7402308039646762,
      "grad_norm": 7.882981300354004,
      "learning_rate": 2.0814217293470476e-05,
      "loss": 30.8312,
      "step": 25000
    },
    {
      "epoch": 0.7402308039646762,
      "eval_loss": 3.7874350547790527,
      "eval_runtime": 107.8316,
      "eval_samples_per_second": 10.025,
      "eval_steps_per_second": 2.513,
      "step": 25000
    },
    {
      "epoch": 0.743191727180535,
      "grad_norm": 7.499105930328369,
      "learning_rate": 2.0371199126952268e-05,
      "loss": 30.9958,
      "step": 25100
    },
    {
      "epoch": 0.7461526503963936,
      "grad_norm": 7.973631381988525,
      "learning_rate": 1.9931980493175735e-05,
      "loss": 30.6469,
      "step": 25200
    },
    {
      "epoch": 0.7491135736122523,
      "grad_norm": 7.996872425079346,
      "learning_rate": 1.949660350357356e-05,
      "loss": 30.6363,
      "step": 25300
    },
    {
      "epoch": 0.752074496828111,
      "grad_norm": 8.139349937438965,
      "learning_rate": 1.9065109901249e-05,
      "loss": 30.924,
      "step": 25400
    },
    {
      "epoch": 0.7550354200439697,
      "grad_norm": 8.981887817382812,
      "learning_rate": 1.863754105697369e-05,
      "loss": 30.9555,
      "step": 25500
    },
    {
      "epoch": 0.7579963432598285,
      "grad_norm": 7.660996913909912,
      "learning_rate": 1.821393796522096e-05,
      "loss": 30.8007,
      "step": 25600
    },
    {
      "epoch": 0.7609572664756871,
      "grad_norm": 7.750844955444336,
      "learning_rate": 1.7794341240235615e-05,
      "loss": 30.7227,
      "step": 25700
    },
    {
      "epoch": 0.7639181896915458,
      "grad_norm": 7.581575870513916,
      "learning_rate": 1.737879111213961e-05,
      "loss": 30.6509,
      "step": 25800
    },
    {
      "epoch": 0.7668791129074045,
      "grad_norm": 8.771635055541992,
      "learning_rate": 1.6967327423075142e-05,
      "loss": 30.7893,
      "step": 25900
    },
    {
      "epoch": 0.7698400361232632,
      "grad_norm": 8.594512939453125,
      "learning_rate": 1.6559989623384456e-05,
      "loss": 30.6874,
      "step": 26000
    },
    {
      "epoch": 0.7698400361232632,
      "eval_loss": 3.7861363887786865,
      "eval_runtime": 112.2096,
      "eval_samples_per_second": 9.634,
      "eval_steps_per_second": 2.415,
      "step": 26000
    },
    {
      "epoch": 0.772800959339122,
      "grad_norm": 7.919267177581787,
      "learning_rate": 1.615681676782755e-05,
      "loss": 30.7685,
      "step": 26100
    },
    {
      "epoch": 0.7757618825549807,
      "grad_norm": 7.744143009185791,
      "learning_rate": 1.5757847511837648e-05,
      "loss": 30.7558,
      "step": 26200
    },
    {
      "epoch": 0.7787228057708393,
      "grad_norm": 7.894962787628174,
      "learning_rate": 1.5363120107814955e-05,
      "loss": 30.7543,
      "step": 26300
    },
    {
      "epoch": 0.781683728986698,
      "grad_norm": 9.573600769042969,
      "learning_rate": 1.4972672401459143e-05,
      "loss": 30.808,
      "step": 26400
    },
    {
      "epoch": 0.7846446522025567,
      "grad_norm": 7.708218574523926,
      "learning_rate": 1.4586541828140706e-05,
      "loss": 30.6115,
      "step": 26500
    },
    {
      "epoch": 0.7876055754184155,
      "grad_norm": 8.170422554016113,
      "learning_rate": 1.4204765409311852e-05,
      "loss": 30.8811,
      "step": 26600
    },
    {
      "epoch": 0.7905664986342742,
      "grad_norm": 8.293937683105469,
      "learning_rate": 1.3827379748956783e-05,
      "loss": 30.8484,
      "step": 26700
    },
    {
      "epoch": 0.7935274218501329,
      "grad_norm": 7.64206075668335,
      "learning_rate": 1.3454421030082402e-05,
      "loss": 30.7768,
      "step": 26800
    },
    {
      "epoch": 0.7964883450659915,
      "grad_norm": 7.780085563659668,
      "learning_rate": 1.3085925011248902e-05,
      "loss": 30.6903,
      "step": 26900
    },
    {
      "epoch": 0.7994492682818503,
      "grad_norm": 7.651244640350342,
      "learning_rate": 1.2721927023141509e-05,
      "loss": 30.8888,
      "step": 27000
    },
    {
      "epoch": 0.7994492682818503,
      "eval_loss": 3.7866110801696777,
      "eval_runtime": 111.3993,
      "eval_samples_per_second": 9.704,
      "eval_steps_per_second": 2.433,
      "step": 27000
    },
    {
      "epoch": 0.802410191497709,
      "grad_norm": 7.893172740936279,
      "learning_rate": 1.2362461965182951e-05,
      "loss": 30.8551,
      "step": 27100
    },
    {
      "epoch": 0.8053711147135677,
      "grad_norm": 8.348461151123047,
      "learning_rate": 1.2007564302187395e-05,
      "loss": 30.9086,
      "step": 27200
    },
    {
      "epoch": 0.8083320379294264,
      "grad_norm": 8.005925178527832,
      "learning_rate": 1.1657268061055954e-05,
      "loss": 30.6258,
      "step": 27300
    },
    {
      "epoch": 0.8112929611452852,
      "grad_norm": 7.919161319732666,
      "learning_rate": 1.1311606827514432e-05,
      "loss": 30.4614,
      "step": 27400
    },
    {
      "epoch": 0.8142538843611438,
      "grad_norm": 8.806751251220703,
      "learning_rate": 1.0970613742892959e-05,
      "loss": 30.9882,
      "step": 27500
    },
    {
      "epoch": 0.8172148075770025,
      "grad_norm": 8.126434326171875,
      "learning_rate": 1.0634321500948665e-05,
      "loss": 30.6459,
      "step": 27600
    },
    {
      "epoch": 0.8201757307928612,
      "grad_norm": 7.643808364868164,
      "learning_rate": 1.0302762344730893e-05,
      "loss": 30.6614,
      "step": 27700
    },
    {
      "epoch": 0.8231366540087199,
      "grad_norm": 8.046734809875488,
      "learning_rate": 9.97596806349001e-06,
      "loss": 30.6958,
      "step": 27800
    },
    {
      "epoch": 0.8260975772245787,
      "grad_norm": 8.094582557678223,
      "learning_rate": 9.653969989629268e-06,
      "loss": 30.5807,
      "step": 27900
    },
    {
      "epoch": 0.8290585004404373,
      "grad_norm": 8.062453269958496,
      "learning_rate": 9.336798995700899e-06,
      "loss": 30.8323,
      "step": 28000
    },
    {
      "epoch": 0.8290585004404373,
      "eval_loss": 3.786661386489868,
      "eval_runtime": 106.9454,
      "eval_samples_per_second": 10.108,
      "eval_steps_per_second": 2.534,
      "step": 28000
    },
    {
      "epoch": 0.832019423656296,
      "grad_norm": 7.68911075592041,
      "learning_rate": 9.024485491446045e-06,
      "loss": 30.9853,
      "step": 28100
    },
    {
      "epoch": 0.8349803468721547,
      "grad_norm": 7.82414436340332,
      "learning_rate": 8.717059420879143e-06,
      "loss": 30.5061,
      "step": 28200
    },
    {
      "epoch": 0.8379412700880134,
      "grad_norm": 7.392062664031982,
      "learning_rate": 8.414550259416917e-06,
      "loss": 30.9525,
      "step": 28300
    },
    {
      "epoch": 0.8409021933038722,
      "grad_norm": 7.675992965698242,
      "learning_rate": 8.116987011052387e-06,
      "loss": 30.8296,
      "step": 28400
    },
    {
      "epoch": 0.8438631165197309,
      "grad_norm": 8.038030624389648,
      "learning_rate": 7.824398205574006e-06,
      "loss": 30.8155,
      "step": 28500
    },
    {
      "epoch": 0.8468240397355895,
      "grad_norm": 7.427101135253906,
      "learning_rate": 7.536811895830222e-06,
      "loss": 30.9259,
      "step": 28600
    },
    {
      "epoch": 0.8497849629514482,
      "grad_norm": 8.095186233520508,
      "learning_rate": 7.254255655039919e-06,
      "loss": 30.824,
      "step": 28700
    },
    {
      "epoch": 0.852745886167307,
      "grad_norm": 7.521733283996582,
      "learning_rate": 6.9767565741486815e-06,
      "loss": 30.7226,
      "step": 28800
    },
    {
      "epoch": 0.8557068093831657,
      "grad_norm": 7.494954586029053,
      "learning_rate": 6.704341259231415e-06,
      "loss": 30.7789,
      "step": 28900
    },
    {
      "epoch": 0.8586677325990244,
      "grad_norm": 7.641082763671875,
      "learning_rate": 6.437035828941324e-06,
      "loss": 30.8001,
      "step": 29000
    },
    {
      "epoch": 0.8586677325990244,
      "eval_loss": 3.786005973815918,
      "eval_runtime": 111.1521,
      "eval_samples_per_second": 9.725,
      "eval_steps_per_second": 2.438,
      "step": 29000
    },
    {
      "epoch": 0.8616286558148831,
      "grad_norm": 7.96475887298584,
      "learning_rate": 6.1748659120058386e-06,
      "loss": 30.8879,
      "step": 29100
    },
    {
      "epoch": 0.8645895790307417,
      "grad_norm": 6.990954875946045,
      "learning_rate": 5.917856644769242e-06,
      "loss": 30.6077,
      "step": 29200
    },
    {
      "epoch": 0.8675505022466005,
      "grad_norm": 7.170067310333252,
      "learning_rate": 5.666032668782735e-06,
      "loss": 30.8456,
      "step": 29300
    },
    {
      "epoch": 0.8705114254624592,
      "grad_norm": 8.4426851272583,
      "learning_rate": 5.419418128441846e-06,
      "loss": 30.9228,
      "step": 29400
    },
    {
      "epoch": 0.8734723486783179,
      "grad_norm": 8.034204483032227,
      "learning_rate": 5.178036668671475e-06,
      "loss": 30.7785,
      "step": 29500
    },
    {
      "epoch": 0.8764332718941766,
      "grad_norm": 7.411805629730225,
      "learning_rate": 4.941911432658868e-06,
      "loss": 30.7495,
      "step": 29600
    },
    {
      "epoch": 0.8793941951100354,
      "grad_norm": 7.887239456176758,
      "learning_rate": 4.7110650596347335e-06,
      "loss": 30.7797,
      "step": 29700
    },
    {
      "epoch": 0.882355118325894,
      "grad_norm": 8.600279808044434,
      "learning_rate": 4.48551968270261e-06,
      "loss": 30.8267,
      "step": 29800
    },
    {
      "epoch": 0.8853160415417527,
      "grad_norm": 8.055954933166504,
      "learning_rate": 4.26529692671679e-06,
      "loss": 30.8123,
      "step": 29900
    },
    {
      "epoch": 0.8882769647576114,
      "grad_norm": 7.540750503540039,
      "learning_rate": 4.050417906208945e-06,
      "loss": 30.8866,
      "step": 30000
    },
    {
      "epoch": 0.8882769647576114,
      "eval_loss": 3.7849574089050293,
      "eval_runtime": 108.0072,
      "eval_samples_per_second": 10.009,
      "eval_steps_per_second": 2.509,
      "step": 30000
    },
    {
      "epoch": 0.8912378879734701,
      "grad_norm": 7.607705593109131,
      "learning_rate": 3.840903223363752e-06,
      "loss": 30.7932,
      "step": 30100
    },
    {
      "epoch": 0.8941988111893289,
      "grad_norm": 7.834300518035889,
      "learning_rate": 3.636772966043571e-06,
| "loss": 30.6935, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 0.8971597344051876, | |
| "grad_norm": 9.865922927856445, | |
| "learning_rate": 3.4380467058624585e-06, | |
| "loss": 30.5129, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 0.9001206576210462, | |
| "grad_norm": 7.9707865715026855, | |
| "learning_rate": 3.244743496309701e-06, | |
| "loss": 30.8035, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 0.9030815808369049, | |
| "grad_norm": 8.035768508911133, | |
| "learning_rate": 3.0568818709229364e-06, | |
| "loss": 30.4973, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.9060425040527637, | |
| "grad_norm": 8.816192626953125, | |
| "learning_rate": 2.8744798415113015e-06, | |
| "loss": 30.5553, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 0.9090034272686224, | |
| "grad_norm": 7.411801338195801, | |
| "learning_rate": 2.6975548964283823e-06, | |
| "loss": 30.6758, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 0.9119643504844811, | |
| "grad_norm": 7.46308708190918, | |
| "learning_rate": 2.5261239988955733e-06, | |
| "loss": 30.8337, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 0.9149252737003397, | |
| "grad_norm": 8.57913875579834, | |
| "learning_rate": 2.360203585375571e-06, | |
| "loss": 31.0671, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 0.9178861969161984, | |
| "grad_norm": 7.983087062835693, | |
| "learning_rate": 2.1998095639965577e-06, | |
| "loss": 30.913, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.9178861969161984, | |
| "eval_loss": 3.785719394683838, | |
| "eval_runtime": 110.9703, | |
| "eval_samples_per_second": 9.741, | |
| "eval_steps_per_second": 2.442, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.9208471201320572, | |
| "grad_norm": 8.11637020111084, | |
| "learning_rate": 2.044957313026925e-06, | |
| "loss": 30.7294, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 0.9238080433479159, | |
| "grad_norm": 7.882040977478027, | |
| "learning_rate": 1.895661679400842e-06, | |
| "loss": 30.7816, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 0.9267689665637746, | |
| "grad_norm": 7.475772857666016, | |
| "learning_rate": 1.7519369772947525e-06, | |
| "loss": 30.5198, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 0.9297298897796333, | |
| "grad_norm": 8.094454765319824, | |
| "learning_rate": 1.6137969867549674e-06, | |
| "loss": 30.8313, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 0.932690812995492, | |
| "grad_norm": 8.635899543762207, | |
| "learning_rate": 1.4812549523764674e-06, | |
| "loss": 30.6539, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.9356517362113507, | |
| "grad_norm": 7.975414752960205, | |
| "learning_rate": 1.354323582033039e-06, | |
| "loss": 30.5804, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 0.9386126594272094, | |
| "grad_norm": 7.660233020782471, | |
| "learning_rate": 1.233015045658823e-06, | |
| "loss": 30.6357, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 0.9415735826430681, | |
| "grad_norm": 8.09595012664795, | |
| "learning_rate": 1.1173409740815532e-06, | |
| "loss": 30.7201, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 0.9445345058589268, | |
| "grad_norm": 8.44491958618164, | |
| "learning_rate": 1.0073124579073701e-06, | |
| "loss": 30.7462, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 0.9474954290747856, | |
| "grad_norm": 8.275026321411133, | |
| "learning_rate": 9.0294004645749e-07, | |
| "loss": 30.7256, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.9474954290747856, | |
| "eval_loss": 3.7850279808044434, | |
| "eval_runtime": 109.0824, | |
| "eval_samples_per_second": 9.91, | |
| "eval_steps_per_second": 2.484, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.9504563522906442, | |
| "grad_norm": 7.571169376373291, | |
| "learning_rate": 8.042337467567484e-07, | |
| "loss": 30.7194, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 0.9534172755065029, | |
| "grad_norm": 8.020681381225586, | |
| "learning_rate": 7.112030225741472e-07, | |
| "loss": 30.5828, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 0.9563781987223616, | |
| "grad_norm": 7.482342720031738, | |
| "learning_rate": 6.238567935155004e-07, | |
| "loss": 30.5888, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 0.9593391219382204, | |
| "grad_norm": 8.336071014404297, | |
| "learning_rate": 5.422034341682314e-07, | |
| "loss": 30.858, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 0.9623000451540791, | |
| "grad_norm": 7.819650173187256, | |
| "learning_rate": 4.6625077329842224e-07, | |
| "loss": 30.6983, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.9652609683699378, | |
| "grad_norm": 8.101078987121582, | |
| "learning_rate": 3.960060931002141e-07, | |
| "loss": 30.7803, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 0.9682218915857964, | |
| "grad_norm": 9.275129318237305, | |
| "learning_rate": 3.3147612849762533e-07, | |
| "loss": 30.8961, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 0.9711828148016551, | |
| "grad_norm": 8.00763988494873, | |
| "learning_rate": 2.7266706649877516e-07, | |
| "loss": 30.9344, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 0.9741437380175139, | |
| "grad_norm": 8.840792655944824, | |
| "learning_rate": 2.1958454560274455e-07, | |
| "loss": 30.7027, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 0.9771046612333726, | |
| "grad_norm": 8.015409469604492, | |
| "learning_rate": 1.722336552589021e-07, | |
| "loss": 30.7569, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.9771046612333726, | |
| "eval_loss": 3.785550117492676, | |
| "eval_runtime": 109.3256, | |
| "eval_samples_per_second": 9.888, | |
| "eval_steps_per_second": 2.479, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.9800655844492313, | |
| "grad_norm": 8.226040840148926, | |
| "learning_rate": 1.3061893537898773e-07, | |
| "loss": 30.6858, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 0.9830265076650899, | |
| "grad_norm": 7.274777889251709, | |
| "learning_rate": 9.474437590182072e-08, | |
| "loss": 30.701, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 0.9859874308809486, | |
| "grad_norm": 7.866406440734863, | |
| "learning_rate": 6.46134164107326e-08, | |
| "loss": 30.6392, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 0.9889483540968074, | |
| "grad_norm": 7.722043514251709, | |
| "learning_rate": 4.022894580381742e-08, | |
| "loss": 30.8502, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 0.9919092773126661, | |
| "grad_norm": 7.612312316894531, | |
| "learning_rate": 2.1593302016933437e-08, | |
| "loss": 30.7914, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.9948702005285248, | |
| "grad_norm": 8.195243835449219, | |
| "learning_rate": 8.708271799542367e-09, | |
| "loss": 30.8885, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 0.9978311237443835, | |
| "grad_norm": 8.127638816833496, | |
| "learning_rate": 1.5750905434130935e-09, | |
| "loss": 30.9894, | |
| "step": 33700 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 33773, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.598282561239384e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
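
The JSON above is a `trainer_state.json` file as written by the Hugging Face `transformers` `Trainer`: `log_history` interleaves training records (keyed by `loss`, every `logging_steps` = 100 steps) with evaluation records (keyed by `eval_loss`, written here every 1000 steps). Below is a minimal sketch of how such a file can be inspected; the path `checkpoint-33773/trainer_state.json` is a hypothetical example, and the plotting step is optional.

```python
import json

# Hypothetical path: point this at the actual checkpoint directory.
with open("checkpoint-33773/trainer_state.json") as f:
    state = json.load(f)

# Training records carry "loss"; evaluation records carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"max_steps: {state['max_steps']}, logged train points: {len(train_logs)}")
print(f"final train loss: {train_logs[-1]['loss']:.4f} at step {train_logs[-1]['step']}")
print(f"final eval loss:  {eval_logs[-1]['eval_loss']:.4f} at step {eval_logs[-1]['step']}")

# Optional: plot both curves if matplotlib is installed.
try:
    import matplotlib.pyplot as plt
    plt.plot([e["step"] for e in train_logs],
             [e["loss"] for e in train_logs], label="train loss")
    plt.plot([e["step"] for e in eval_logs],
             [e["eval_loss"] for e in eval_logs], label="eval loss")
    plt.xlabel("step")
    plt.ylabel("loss")
    plt.legend()
    plt.show()
except ImportError:
    pass
```

On this particular log, the tail of both curves is essentially flat: training loss oscillates around 30.5-31.0 from step 26700 onward, and `eval_loss` holds at roughly 3.785-3.787 across the evaluations from step 27000 through step 33000.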