{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 31135, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08029548739360848, "grad_norm": 1.0575518608093262, "learning_rate": 4.919704512606392e-05, "loss": 1.6978, "step": 500 }, { "epoch": 0.16059097478721696, "grad_norm": 1.1808357238769531, "learning_rate": 4.839409025212783e-05, "loss": 1.5655, "step": 1000 }, { "epoch": 0.24088646218082543, "grad_norm": 1.2457187175750732, "learning_rate": 4.759113537819175e-05, "loss": 1.5116, "step": 1500 }, { "epoch": 0.3211819495744339, "grad_norm": 1.1846860647201538, "learning_rate": 4.678818050425566e-05, "loss": 1.4617, "step": 2000 }, { "epoch": 0.4014774369680424, "grad_norm": 1.1566858291625977, "learning_rate": 4.598522563031958e-05, "loss": 1.4351, "step": 2500 }, { "epoch": 0.48177292436165087, "grad_norm": 1.2010133266448975, "learning_rate": 4.5182270756383495e-05, "loss": 1.4215, "step": 3000 }, { "epoch": 0.5620684117552593, "grad_norm": 1.0966484546661377, "learning_rate": 4.4379315882447406e-05, "loss": 1.3943, "step": 3500 }, { "epoch": 0.6423638991488678, "grad_norm": 1.1054482460021973, "learning_rate": 4.3576361008511324e-05, "loss": 1.392, "step": 4000 }, { "epoch": 0.7226593865424763, "grad_norm": 1.166495680809021, "learning_rate": 4.277340613457524e-05, "loss": 1.3607, "step": 4500 }, { "epoch": 0.8029548739360848, "grad_norm": 1.6440229415893555, "learning_rate": 4.197045126063915e-05, "loss": 1.3453, "step": 5000 }, { "epoch": 0.8832503613296933, "grad_norm": 1.1146718263626099, "learning_rate": 4.116749638670307e-05, "loss": 1.328, "step": 5500 }, { "epoch": 0.9635458487233017, "grad_norm": 1.2365636825561523, "learning_rate": 4.036454151276698e-05, "loss": 1.3213, "step": 6000 }, { "epoch": 1.0438413361169103, "grad_norm": 1.0937212705612183, "learning_rate": 3.95615866388309e-05, "loss": 1.2654, "step": 6500 }, { "epoch": 1.1241368235105187, "grad_norm": 0.8828343749046326, "learning_rate": 3.875863176489482e-05, "loss": 1.2134, "step": 7000 }, { "epoch": 1.2044323109041273, "grad_norm": 1.099165916442871, "learning_rate": 3.795567689095873e-05, "loss": 1.2104, "step": 7500 }, { "epoch": 1.2847277982977356, "grad_norm": 1.2219111919403076, "learning_rate": 3.7152722017022646e-05, "loss": 1.1973, "step": 8000 }, { "epoch": 1.3650232856913442, "grad_norm": 1.0750117301940918, "learning_rate": 3.6349767143086564e-05, "loss": 1.1923, "step": 8500 }, { "epoch": 1.4453187730849526, "grad_norm": 1.098244547843933, "learning_rate": 3.5546812269150475e-05, "loss": 1.1925, "step": 9000 }, { "epoch": 1.525614260478561, "grad_norm": 1.1637680530548096, "learning_rate": 3.474385739521439e-05, "loss": 1.182, "step": 9500 }, { "epoch": 1.6059097478721696, "grad_norm": 1.1562321186065674, "learning_rate": 3.3940902521278304e-05, "loss": 1.1634, "step": 10000 }, { "epoch": 1.6862052352657781, "grad_norm": 1.4565141201019287, "learning_rate": 3.313794764734222e-05, "loss": 1.1542, "step": 10500 }, { "epoch": 1.7665007226593865, "grad_norm": 1.434606671333313, "learning_rate": 3.233499277340614e-05, "loss": 1.1533, "step": 11000 }, { "epoch": 1.8467962100529949, "grad_norm": 1.1290115118026733, "learning_rate": 3.153203789947005e-05, "loss": 1.1496, "step": 11500 }, { "epoch": 1.9270916974466035, "grad_norm": 1.1467580795288086, "learning_rate": 3.072908302553397e-05, "loss": 1.1444, "step": 12000 }, { "epoch": 2.007387184840212, "grad_norm": 1.1580528020858765, 
"learning_rate": 2.9926128151597882e-05, "loss": 1.1478, "step": 12500 }, { "epoch": 2.0876826722338206, "grad_norm": 1.040642261505127, "learning_rate": 2.9123173277661797e-05, "loss": 1.0662, "step": 13000 }, { "epoch": 2.167978159627429, "grad_norm": 1.1460875272750854, "learning_rate": 2.832021840372571e-05, "loss": 1.0788, "step": 13500 }, { "epoch": 2.2482736470210374, "grad_norm": 1.0731582641601562, "learning_rate": 2.751726352978963e-05, "loss": 1.0635, "step": 14000 }, { "epoch": 2.328569134414646, "grad_norm": 1.1237194538116455, "learning_rate": 2.6714308655853543e-05, "loss": 1.065, "step": 14500 }, { "epoch": 2.4088646218082546, "grad_norm": 1.0012214183807373, "learning_rate": 2.5911353781917458e-05, "loss": 1.0509, "step": 15000 }, { "epoch": 2.4891601092018627, "grad_norm": 1.1109308004379272, "learning_rate": 2.5108398907981372e-05, "loss": 1.0574, "step": 15500 }, { "epoch": 2.5694555965954713, "grad_norm": 1.1631648540496826, "learning_rate": 2.430544403404529e-05, "loss": 1.0345, "step": 16000 }, { "epoch": 2.64975108398908, "grad_norm": 1.0513032674789429, "learning_rate": 2.3502489160109204e-05, "loss": 1.0616, "step": 16500 }, { "epoch": 2.7300465713826885, "grad_norm": 1.189889669418335, "learning_rate": 2.269953428617312e-05, "loss": 1.0533, "step": 17000 }, { "epoch": 2.8103420587762966, "grad_norm": 1.0951628684997559, "learning_rate": 2.1896579412237033e-05, "loss": 1.0388, "step": 17500 }, { "epoch": 2.890637546169905, "grad_norm": 1.0122724771499634, "learning_rate": 2.109362453830095e-05, "loss": 1.0374, "step": 18000 }, { "epoch": 2.970933033563514, "grad_norm": 1.1020405292510986, "learning_rate": 2.0290669664364865e-05, "loss": 1.0325, "step": 18500 }, { "epoch": 3.0512285209571224, "grad_norm": 1.0594305992126465, "learning_rate": 1.948771479042878e-05, "loss": 1.0047, "step": 19000 }, { "epoch": 3.1315240083507305, "grad_norm": 1.070056438446045, "learning_rate": 1.8684759916492694e-05, "loss": 0.9794, "step": 19500 }, { "epoch": 3.211819495744339, "grad_norm": 1.106451392173767, "learning_rate": 1.7881805042556608e-05, "loss": 0.971, "step": 20000 }, { "epoch": 3.2921149831379477, "grad_norm": 1.0232676267623901, "learning_rate": 1.7078850168620526e-05, "loss": 0.9819, "step": 20500 }, { "epoch": 3.3724104705315563, "grad_norm": 1.1868596076965332, "learning_rate": 1.627589529468444e-05, "loss": 0.9763, "step": 21000 }, { "epoch": 3.4527059579251644, "grad_norm": 1.0707334280014038, "learning_rate": 1.5472940420748355e-05, "loss": 0.9741, "step": 21500 }, { "epoch": 3.533001445318773, "grad_norm": 1.0286450386047363, "learning_rate": 1.466998554681227e-05, "loss": 0.9821, "step": 22000 }, { "epoch": 3.6132969327123816, "grad_norm": 1.1337109804153442, "learning_rate": 1.3867030672876185e-05, "loss": 0.9754, "step": 22500 }, { "epoch": 3.69359242010599, "grad_norm": 1.1301957368850708, "learning_rate": 1.3064075798940101e-05, "loss": 0.9757, "step": 23000 }, { "epoch": 3.7738879074995983, "grad_norm": 0.8995300531387329, "learning_rate": 1.2261120925004016e-05, "loss": 0.9728, "step": 23500 }, { "epoch": 3.854183394893207, "grad_norm": 1.099932074546814, "learning_rate": 1.1458166051067932e-05, "loss": 0.9549, "step": 24000 }, { "epoch": 3.9344788822868155, "grad_norm": 1.0159733295440674, "learning_rate": 1.0655211177131846e-05, "loss": 0.976, "step": 24500 }, { "epoch": 4.014774369680424, "grad_norm": 1.0208700895309448, "learning_rate": 9.852256303195762e-06, "loss": 0.9571, "step": 25000 }, { "epoch": 4.095069857074033, "grad_norm": 
1.040358304977417, "learning_rate": 9.049301429259676e-06, "loss": 0.9293, "step": 25500 }, { "epoch": 4.175365344467641, "grad_norm": 1.1360992193222046, "learning_rate": 8.246346555323591e-06, "loss": 0.9351, "step": 26000 }, { "epoch": 4.255660831861249, "grad_norm": 1.0629996061325073, "learning_rate": 7.443391681387506e-06, "loss": 0.9308, "step": 26500 }, { "epoch": 4.335956319254858, "grad_norm": 1.1828113794326782, "learning_rate": 6.6404368074514205e-06, "loss": 0.9356, "step": 27000 }, { "epoch": 4.416251806648466, "grad_norm": 1.156646966934204, "learning_rate": 5.8374819335153366e-06, "loss": 0.9396, "step": 27500 }, { "epoch": 4.496547294042075, "grad_norm": 1.0000945329666138, "learning_rate": 5.034527059579252e-06, "loss": 0.9266, "step": 28000 }, { "epoch": 4.576842781435683, "grad_norm": 1.0536987781524658, "learning_rate": 4.231572185643167e-06, "loss": 0.9269, "step": 28500 }, { "epoch": 4.657138268829292, "grad_norm": 1.1100162267684937, "learning_rate": 3.4286173117070822e-06, "loss": 0.9256, "step": 29000 }, { "epoch": 4.7374337562229005, "grad_norm": 1.1744736433029175, "learning_rate": 2.6256624377709975e-06, "loss": 0.9176, "step": 29500 }, { "epoch": 4.817729243616509, "grad_norm": 1.049423098564148, "learning_rate": 1.8227075638349127e-06, "loss": 0.9355, "step": 30000 }, { "epoch": 4.898024731010118, "grad_norm": 1.227993369102478, "learning_rate": 1.0197526898988277e-06, "loss": 0.9221, "step": 30500 }, { "epoch": 4.978320218403725, "grad_norm": 1.1226952075958252, "learning_rate": 2.167978159627429e-07, "loss": 0.9336, "step": 31000 } ], "logging_steps": 500, "max_steps": 31135, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6463151666049843e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }