| { | |
| "best_metric": 2.2994935512542725, | |
| "best_model_checkpoint": "/exports/eddie/scratch/s1970716/models/longt5_xl_sfd_20/checkpoint-28", | |
| "epoch": 19.47826086956522, | |
| "eval_steps": 500, | |
| "global_step": 280, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 8.068708419799805, | |
| "learning_rate": 0.001, | |
| "loss": 3.274, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.4994572401046753, | |
| "learning_rate": 0.001, | |
| "loss": 3.2963, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 1.0570803880691528, | |
| "learning_rate": 0.001, | |
| "loss": 3.3164, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.2446849346160889, | |
| "learning_rate": 0.001, | |
| "loss": 3.0866, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.721084713935852, | |
| "learning_rate": 0.001, | |
| "loss": 2.8976, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 1.2132383584976196, | |
| "learning_rate": 0.001, | |
| "loss": 2.8298, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.4689762592315674, | |
| "learning_rate": 0.001, | |
| "loss": 2.9377, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "eval_loss": 2.7965147495269775, | |
| "eval_runtime": 81.4763, | |
| "eval_samples_per_second": 4.148, | |
| "eval_steps_per_second": 0.528, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 0.42892181873321533, | |
| "learning_rate": 0.001, | |
| "loss": 2.741, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 0.4487678110599518, | |
| "learning_rate": 0.001, | |
| "loss": 2.4441, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 0.4653552770614624, | |
| "learning_rate": 0.001, | |
| "loss": 2.432, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 0.35275548696517944, | |
| "learning_rate": 0.001, | |
| "loss": 2.4016, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 0.43277695775032043, | |
| "learning_rate": 0.001, | |
| "loss": 2.391, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 0.3408297300338745, | |
| "learning_rate": 0.001, | |
| "loss": 2.3911, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 0.3205319344997406, | |
| "learning_rate": 0.001, | |
| "loss": 2.3247, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "eval_loss": 2.2994935512542725, | |
| "eval_runtime": 81.4693, | |
| "eval_samples_per_second": 4.149, | |
| "eval_steps_per_second": 0.528, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 0.4033512771129608, | |
| "learning_rate": 0.001, | |
| "loss": 2.0701, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 0.36825311183929443, | |
| "learning_rate": 0.001, | |
| "loss": 2.0968, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 0.5080482363700867, | |
| "learning_rate": 0.001, | |
| "loss": 2.0681, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.4196927845478058, | |
| "learning_rate": 0.001, | |
| "loss": 2.0914, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 0.3230506479740143, | |
| "learning_rate": 0.001, | |
| "loss": 2.0317, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 0.2733004689216614, | |
| "learning_rate": 0.001, | |
| "loss": 1.9723, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 0.2709517776966095, | |
| "learning_rate": 0.001, | |
| "loss": 1.9943, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "eval_loss": 2.3308048248291016, | |
| "eval_runtime": 81.5083, | |
| "eval_samples_per_second": 4.147, | |
| "eval_steps_per_second": 0.528, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 0.3230663537979126, | |
| "learning_rate": 0.001, | |
| "loss": 1.9093, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 0.3976946175098419, | |
| "learning_rate": 0.001, | |
| "loss": 1.7682, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 0.42008209228515625, | |
| "learning_rate": 0.001, | |
| "loss": 1.7119, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 0.31828513741493225, | |
| "learning_rate": 0.001, | |
| "loss": 1.7283, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 0.2448839396238327, | |
| "learning_rate": 0.001, | |
| "loss": 1.6905, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 0.25552132725715637, | |
| "learning_rate": 0.001, | |
| "loss": 1.6645, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 15.679224014282227, | |
| "learning_rate": 0.001, | |
| "loss": 1.7056, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 3.97, | |
| "eval_loss": 2.3368992805480957, | |
| "eval_runtime": 81.4742, | |
| "eval_samples_per_second": 4.149, | |
| "eval_steps_per_second": 0.528, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 4.03, | |
| "grad_norm": 0.29547178745269775, | |
| "learning_rate": 0.001, | |
| "loss": 1.564, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 0.31610924005508423, | |
| "learning_rate": 0.001, | |
| "loss": 1.3607, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 4.31, | |
| "grad_norm": 0.32351407408714294, | |
| "learning_rate": 0.001, | |
| "loss": 1.4158, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "grad_norm": 0.5101042985916138, | |
| "learning_rate": 0.001, | |
| "loss": 1.4694, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 4.59, | |
| "grad_norm": 0.41575145721435547, | |
| "learning_rate": 0.001, | |
| "loss": 1.4755, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 4.73, | |
| "grad_norm": 0.3269899785518646, | |
| "learning_rate": 0.001, | |
| "loss": 1.4268, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 4.87, | |
| "grad_norm": 0.4077276587486267, | |
| "learning_rate": 0.001, | |
| "loss": 1.4471, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 4.94, | |
| "eval_loss": 2.553175926208496, | |
| "eval_runtime": 81.5149, | |
| "eval_samples_per_second": 4.146, | |
| "eval_steps_per_second": 0.528, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 5.01, | |
| "grad_norm": 0.37493908405303955, | |
| "learning_rate": 0.001, | |
| "loss": 1.4436, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 5.15, | |
| "grad_norm": 0.8398223519325256, | |
| "learning_rate": 0.001, | |
| "loss": 1.1776, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 5.29, | |
| "grad_norm": 0.621316134929657, | |
| "learning_rate": 0.001, | |
| "loss": 1.192, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 5.43, | |
| "grad_norm": 0.5988876819610596, | |
| "learning_rate": 0.001, | |
| "loss": 1.1561, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 5.57, | |
| "grad_norm": 0.561390221118927, | |
| "learning_rate": 0.001, | |
| "loss": 1.2129, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 5.7, | |
| "grad_norm": 0.32573097944259644, | |
| "learning_rate": 0.001, | |
| "loss": 1.19, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 5.84, | |
| "grad_norm": 0.3272527754306793, | |
| "learning_rate": 0.001, | |
| "loss": 1.1933, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 5.98, | |
| "grad_norm": 0.36107558012008667, | |
| "learning_rate": 0.001, | |
| "loss": 1.1932, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 5.98, | |
| "eval_loss": 2.696089744567871, | |
| "eval_runtime": 81.5294, | |
| "eval_samples_per_second": 4.146, | |
| "eval_steps_per_second": 0.527, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 6.12, | |
| "grad_norm": 0.4167131781578064, | |
| "learning_rate": 0.001, | |
| "loss": 0.9285, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 6.26, | |
| "grad_norm": 0.38736867904663086, | |
| "learning_rate": 0.001, | |
| "loss": 0.9568, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "grad_norm": 0.3212537169456482, | |
| "learning_rate": 0.001, | |
| "loss": 0.9538, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 6.54, | |
| "grad_norm": 0.2966512143611908, | |
| "learning_rate": 0.001, | |
| "loss": 0.9133, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 6.68, | |
| "grad_norm": 0.3149372935295105, | |
| "learning_rate": 0.001, | |
| "loss": 0.9374, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 6.82, | |
| "grad_norm": 0.3140605092048645, | |
| "learning_rate": 0.001, | |
| "loss": 0.9585, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 6.96, | |
| "grad_norm": 0.33559679985046387, | |
| "learning_rate": 0.001, | |
| "loss": 0.9199, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 6.96, | |
| "eval_loss": 2.645321846008301, | |
| "eval_runtime": 81.5044, | |
| "eval_samples_per_second": 4.147, | |
| "eval_steps_per_second": 0.528, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 7.1, | |
| "grad_norm": 0.3616858720779419, | |
| "learning_rate": 0.001, | |
| "loss": 0.7517, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 7.23, | |
| "grad_norm": 0.4970415234565735, | |
| "learning_rate": 0.001, | |
| "loss": 0.7378, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 7.37, | |
| "grad_norm": 0.6654688119888306, | |
| "learning_rate": 0.001, | |
| "loss": 0.7864, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 7.51, | |
| "grad_norm": 0.51229327917099, | |
| "learning_rate": 0.001, | |
| "loss": 0.762, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 7.65, | |
| "grad_norm": 0.4524416923522949, | |
| "learning_rate": 0.001, | |
| "loss": 0.7342, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 7.79, | |
| "grad_norm": 0.48206427693367004, | |
| "learning_rate": 0.001, | |
| "loss": 0.7706, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 7.93, | |
| "grad_norm": 0.4534417688846588, | |
| "learning_rate": 0.001, | |
| "loss": 0.7571, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 3.0977730751037598, | |
| "eval_runtime": 81.5778, | |
| "eval_samples_per_second": 4.143, | |
| "eval_steps_per_second": 0.527, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 8.07, | |
| "grad_norm": 0.306815505027771, | |
| "learning_rate": 0.001, | |
| "loss": 0.6809, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 8.21, | |
| "grad_norm": 0.34183812141418457, | |
| "learning_rate": 0.001, | |
| "loss": 0.5853, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 8.35, | |
| "grad_norm": 0.3781261444091797, | |
| "learning_rate": 0.001, | |
| "loss": 0.5819, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 8.49, | |
| "grad_norm": 0.36344149708747864, | |
| "learning_rate": 0.001, | |
| "loss": 0.6059, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 8.63, | |
| "grad_norm": 0.38990476727485657, | |
| "learning_rate": 0.001, | |
| "loss": 0.5929, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 8.77, | |
| "grad_norm": 0.34000781178474426, | |
| "learning_rate": 0.001, | |
| "loss": 0.5887, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 8.9, | |
| "grad_norm": 0.32895970344543457, | |
| "learning_rate": 0.001, | |
| "loss": 0.6287, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 8.97, | |
| "eval_loss": 3.145782709121704, | |
| "eval_runtime": 81.5735, | |
| "eval_samples_per_second": 4.144, | |
| "eval_steps_per_second": 0.527, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 9.04, | |
| "grad_norm": 0.36275872588157654, | |
| "learning_rate": 0.001, | |
| "loss": 0.5983, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 9.18, | |
| "grad_norm": 0.3596336245536804, | |
| "learning_rate": 0.001, | |
| "loss": 0.4615, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 9.32, | |
| "grad_norm": 0.37557095289230347, | |
| "learning_rate": 0.001, | |
| "loss": 0.4756, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 9.46, | |
| "grad_norm": 0.39249515533447266, | |
| "learning_rate": 0.001, | |
| "loss": 0.4546, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 9.6, | |
| "grad_norm": 0.3760348856449127, | |
| "learning_rate": 0.001, | |
| "loss": 0.4792, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 9.74, | |
| "grad_norm": 0.3137217164039612, | |
| "learning_rate": 0.001, | |
| "loss": 0.4674, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 9.88, | |
| "grad_norm": 0.40549594163894653, | |
| "learning_rate": 0.001, | |
| "loss": 0.4939, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 9.95, | |
| "eval_loss": 3.5685999393463135, | |
| "eval_runtime": 81.5958, | |
| "eval_samples_per_second": 4.142, | |
| "eval_steps_per_second": 0.527, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 10.02, | |
| "grad_norm": 0.4173819422721863, | |
| "learning_rate": 0.001, | |
| "loss": 0.5055, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 10.16, | |
| "grad_norm": 0.280066579580307, | |
| "learning_rate": 0.001, | |
| "loss": 0.3353, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 10.3, | |
| "grad_norm": 0.30166783928871155, | |
| "learning_rate": 0.001, | |
| "loss": 0.351, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 10.43, | |
| "grad_norm": 0.28606531023979187, | |
| "learning_rate": 0.001, | |
| "loss": 0.3834, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 10.57, | |
| "grad_norm": 0.2835221588611603, | |
| "learning_rate": 0.001, | |
| "loss": 0.3718, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 10.71, | |
| "grad_norm": 0.3148328959941864, | |
| "learning_rate": 0.001, | |
| "loss": 0.3692, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 10.85, | |
| "grad_norm": 0.3502219021320343, | |
| "learning_rate": 0.001, | |
| "loss": 0.38, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 10.99, | |
| "grad_norm": 0.3344653844833374, | |
| "learning_rate": 0.001, | |
| "loss": 0.376, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 10.99, | |
| "eval_loss": 3.425977945327759, | |
| "eval_runtime": 81.532, | |
| "eval_samples_per_second": 4.146, | |
| "eval_steps_per_second": 0.527, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 11.13, | |
| "grad_norm": 0.32332998514175415, | |
| "learning_rate": 0.001, | |
| "loss": 0.2827, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 11.27, | |
| "grad_norm": 0.35432103276252747, | |
| "learning_rate": 0.001, | |
| "loss": 0.2966, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 11.41, | |
| "grad_norm": 0.29032111167907715, | |
| "learning_rate": 0.001, | |
| "loss": 0.2954, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 11.55, | |
| "grad_norm": 0.3170696198940277, | |
| "learning_rate": 0.001, | |
| "loss": 0.2738, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 11.69, | |
| "grad_norm": 0.3339516520500183, | |
| "learning_rate": 0.001, | |
| "loss": 0.2786, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 11.83, | |
| "grad_norm": 0.3187398910522461, | |
| "learning_rate": 0.001, | |
| "loss": 0.315, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 11.97, | |
| "grad_norm": 0.2842791974544525, | |
| "learning_rate": 0.001, | |
| "loss": 0.313, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 11.97, | |
| "eval_loss": 3.9301607608795166, | |
| "eval_runtime": 81.5908, | |
| "eval_samples_per_second": 4.143, | |
| "eval_steps_per_second": 0.527, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 12.1, | |
| "grad_norm": 0.2522130012512207, | |
| "learning_rate": 0.001, | |
| "loss": 0.2504, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 12.24, | |
| "grad_norm": 0.23560765385627747, | |
| "learning_rate": 0.001, | |
| "loss": 0.212, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 12.38, | |
| "grad_norm": 0.24140460789203644, | |
| "learning_rate": 0.001, | |
| "loss": 0.2156, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 12.52, | |
| "grad_norm": 0.2790488302707672, | |
| "learning_rate": 0.001, | |
| "loss": 0.2474, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 12.66, | |
| "grad_norm": 0.2879179120063782, | |
| "learning_rate": 0.001, | |
| "loss": 0.2486, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 12.8, | |
| "grad_norm": 0.3126004934310913, | |
| "learning_rate": 0.001, | |
| "loss": 0.2499, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 12.94, | |
| "grad_norm": 0.3011338412761688, | |
| "learning_rate": 0.001, | |
| "loss": 0.2562, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 12.94, | |
| "eval_loss": 3.743312120437622, | |
| "eval_runtime": 81.5885, | |
| "eval_samples_per_second": 4.143, | |
| "eval_steps_per_second": 0.527, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 13.08, | |
| "grad_norm": 0.24417123198509216, | |
| "learning_rate": 0.001, | |
| "loss": 0.2166, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 13.22, | |
| "grad_norm": 0.21955759823322296, | |
| "learning_rate": 0.001, | |
| "loss": 0.1767, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 13.36, | |
| "grad_norm": 0.20537225902080536, | |
| "learning_rate": 0.001, | |
| "loss": 0.1715, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 13.5, | |
| "grad_norm": 0.21406413614749908, | |
| "learning_rate": 0.001, | |
| "loss": 0.1857, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 13.63, | |
| "grad_norm": 0.21677067875862122, | |
| "learning_rate": 0.001, | |
| "loss": 0.1881, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 13.77, | |
| "grad_norm": 0.2592070996761322, | |
| "learning_rate": 0.001, | |
| "loss": 0.2022, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 13.91, | |
| "grad_norm": 0.23913638293743134, | |
| "learning_rate": 0.001, | |
| "loss": 0.2051, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 13.98, | |
| "eval_loss": 3.911346197128296, | |
| "eval_runtime": 81.5425, | |
| "eval_samples_per_second": 4.145, | |
| "eval_steps_per_second": 0.527, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 14.05, | |
| "grad_norm": 0.19888806343078613, | |
| "learning_rate": 0.001, | |
| "loss": 0.1774, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 14.19, | |
| "grad_norm": 0.17841410636901855, | |
| "learning_rate": 0.001, | |
| "loss": 0.1409, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 14.33, | |
| "grad_norm": 0.22502601146697998, | |
| "learning_rate": 0.001, | |
| "loss": 0.1432, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 14.47, | |
| "grad_norm": 0.21947847306728363, | |
| "learning_rate": 0.001, | |
| "loss": 0.1487, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 14.61, | |
| "grad_norm": 0.20319664478302002, | |
| "learning_rate": 0.001, | |
| "loss": 0.1753, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 14.75, | |
| "grad_norm": 0.20484566688537598, | |
| "learning_rate": 0.001, | |
| "loss": 0.1627, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 14.89, | |
| "grad_norm": 0.24411869049072266, | |
| "learning_rate": 0.001, | |
| "loss": 0.1802, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 14.96, | |
| "eval_loss": 4.0449538230896, | |
| "eval_runtime": 81.5583, | |
| "eval_samples_per_second": 4.144, | |
| "eval_steps_per_second": 0.527, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 15.03, | |
| "grad_norm": 0.23610645532608032, | |
| "learning_rate": 0.001, | |
| "loss": 0.1881, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 15.17, | |
| "grad_norm": 0.17829175293445587, | |
| "learning_rate": 0.001, | |
| "loss": 0.123, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 15.3, | |
| "grad_norm": 0.178519606590271, | |
| "learning_rate": 0.001, | |
| "loss": 0.1166, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 15.44, | |
| "grad_norm": 0.19595706462860107, | |
| "learning_rate": 0.001, | |
| "loss": 0.135, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 15.58, | |
| "grad_norm": 0.20790521800518036, | |
| "learning_rate": 0.001, | |
| "loss": 0.1494, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 15.72, | |
| "grad_norm": 0.1832074671983719, | |
| "learning_rate": 0.001, | |
| "loss": 0.1488, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 15.86, | |
| "grad_norm": 0.17795896530151367, | |
| "learning_rate": 0.001, | |
| "loss": 0.1448, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 0.20039702951908112, | |
| "learning_rate": 0.001, | |
| "loss": 0.1378, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 3.939739227294922, | |
| "eval_runtime": 81.6032, | |
| "eval_samples_per_second": 4.142, | |
| "eval_steps_per_second": 0.527, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 16.14, | |
| "grad_norm": 0.19622142612934113, | |
| "learning_rate": 0.001, | |
| "loss": 0.3001, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 16.28, | |
| "grad_norm": 19.05455207824707, | |
| "learning_rate": 0.001, | |
| "loss": 0.2708, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 16.42, | |
| "grad_norm": 29.798582077026367, | |
| "learning_rate": 0.001, | |
| "loss": 0.2154, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 16.56, | |
| "grad_norm": 8.835821151733398, | |
| "learning_rate": 0.001, | |
| "loss": 0.1348, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 16.7, | |
| "grad_norm": 0.3760863244533539, | |
| "learning_rate": 0.001, | |
| "loss": 0.6235, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 16.83, | |
| "grad_norm": 0.3473583459854126, | |
| "learning_rate": 0.001, | |
| "loss": 0.1445, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 16.97, | |
| "grad_norm": 0.4041793942451477, | |
| "learning_rate": 0.001, | |
| "loss": 0.1546, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 16.97, | |
| "eval_loss": 4.307888984680176, | |
| "eval_runtime": 81.6566, | |
| "eval_samples_per_second": 4.139, | |
| "eval_steps_per_second": 0.527, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 17.11, | |
| "grad_norm": 0.2586219906806946, | |
| "learning_rate": 0.001, | |
| "loss": 0.1188, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 17.25, | |
| "grad_norm": 0.4334220886230469, | |
| "learning_rate": 0.001, | |
| "loss": 0.1041, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 17.39, | |
| "grad_norm": 17.520734786987305, | |
| "learning_rate": 0.001, | |
| "loss": 0.1108, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 17.53, | |
| "grad_norm": 0.5943770408630371, | |
| "learning_rate": 0.001, | |
| "loss": 0.1146, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 17.67, | |
| "grad_norm": 0.4325353503227234, | |
| "learning_rate": 0.001, | |
| "loss": 0.1325, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 17.81, | |
| "grad_norm": 0.41412413120269775, | |
| "learning_rate": 0.001, | |
| "loss": 0.1491, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 17.95, | |
| "grad_norm": 0.19986829161643982, | |
| "learning_rate": 0.001, | |
| "loss": 0.1375, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 17.95, | |
| "eval_loss": 4.552526950836182, | |
| "eval_runtime": 81.6054, | |
| "eval_samples_per_second": 4.142, | |
| "eval_steps_per_second": 0.527, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 18.09, | |
| "grad_norm": 0.7999384999275208, | |
| "learning_rate": 0.001, | |
| "loss": 0.1155, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 18.23, | |
| "grad_norm": 0.17563021183013916, | |
| "learning_rate": 0.001, | |
| "loss": 0.1006, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 18.37, | |
| "grad_norm": 0.17661228775978088, | |
| "learning_rate": 0.001, | |
| "loss": 0.1062, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 18.5, | |
| "grad_norm": 0.17768113315105438, | |
| "learning_rate": 0.001, | |
| "loss": 0.1059, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 18.64, | |
| "grad_norm": 0.15412819385528564, | |
| "learning_rate": 0.001, | |
| "loss": 0.0981, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 18.78, | |
| "grad_norm": 0.1754271388053894, | |
| "learning_rate": 0.001, | |
| "loss": 0.0988, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 18.92, | |
| "grad_norm": 0.15736614167690277, | |
| "learning_rate": 0.001, | |
| "loss": 0.1005, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 18.99, | |
| "eval_loss": 4.900540828704834, | |
| "eval_runtime": 81.5789, | |
| "eval_samples_per_second": 4.143, | |
| "eval_steps_per_second": 0.527, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 19.06, | |
| "grad_norm": 0.1531495302915573, | |
| "learning_rate": 0.001, | |
| "loss": 0.0844, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 19.2, | |
| "grad_norm": 0.15237411856651306, | |
| "learning_rate": 0.001, | |
| "loss": 0.0752, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 19.34, | |
| "grad_norm": 0.1433786153793335, | |
| "learning_rate": 0.001, | |
| "loss": 0.0782, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 19.48, | |
| "grad_norm": 0.1296713650226593, | |
| "learning_rate": 0.001, | |
| "loss": 0.0808, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 19.48, | |
| "eval_loss": 4.81671667098999, | |
| "eval_runtime": 81.4692, | |
| "eval_samples_per_second": 4.149, | |
| "eval_steps_per_second": 0.528, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 19.48, | |
| "step": 280, | |
| "total_flos": 4.895208054457934e+18, | |
| "train_loss": 0.8494854368801628, | |
| "train_runtime": 68771.7044, | |
| "train_samples_per_second": 1.068, | |
| "train_steps_per_second": 0.004 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 280, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 20, | |
| "save_steps": 500, | |
| "total_flos": 4.895208054457934e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |