{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9897610921501707, "eval_steps": 146, "global_step": 438, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06825938566552901, "grad_norm": 121.90807294112876, "learning_rate": 9.885844748858448e-07, "loss": 2.2874, "step": 10 }, { "epoch": 0.13651877133105803, "grad_norm": 50.17053366859769, "learning_rate": 9.771689497716894e-07, "loss": 0.9472, "step": 20 }, { "epoch": 0.20477815699658702, "grad_norm": 26.596625414659396, "learning_rate": 9.657534246575343e-07, "loss": 0.9087, "step": 30 }, { "epoch": 0.27303754266211605, "grad_norm": 66.69994801604244, "learning_rate": 9.54337899543379e-07, "loss": 0.8543, "step": 40 }, { "epoch": 0.3412969283276451, "grad_norm": 48.76266229603266, "learning_rate": 9.429223744292237e-07, "loss": 0.6994, "step": 50 }, { "epoch": 0.40955631399317405, "grad_norm": 52.043092597792516, "learning_rate": 9.315068493150684e-07, "loss": 0.7693, "step": 60 }, { "epoch": 0.4778156996587031, "grad_norm": 98.55471713268969, "learning_rate": 9.200913242009132e-07, "loss": 0.7138, "step": 70 }, { "epoch": 0.5460750853242321, "grad_norm": 78.48196775888992, "learning_rate": 9.08675799086758e-07, "loss": 0.7903, "step": 80 }, { "epoch": 0.6143344709897611, "grad_norm": 84.85124588773758, "learning_rate": 8.972602739726027e-07, "loss": 0.7586, "step": 90 }, { "epoch": 0.6825938566552902, "grad_norm": 58.99018773695002, "learning_rate": 8.858447488584474e-07, "loss": 0.663, "step": 100 }, { "epoch": 0.7508532423208191, "grad_norm": 39.94579058556075, "learning_rate": 8.744292237442922e-07, "loss": 0.589, "step": 110 }, { "epoch": 0.8191126279863481, "grad_norm": 58.372841216401504, "learning_rate": 8.63013698630137e-07, "loss": 0.699, "step": 120 }, { "epoch": 0.8873720136518771, "grad_norm": 43.360346848406074, "learning_rate": 8.515981735159817e-07, "loss": 0.6512, "step": 130 }, { "epoch": 0.9556313993174061, "grad_norm": 61.008846083100586, "learning_rate": 8.401826484018264e-07, "loss": 0.7701, "step": 140 }, { "epoch": 0.9965870307167235, "eval_loss": 0.619476318359375, "eval_runtime": 10.3711, "eval_samples_per_second": 25.07, "eval_steps_per_second": 3.182, "step": 146 }, { "epoch": 1.023890784982935, "grad_norm": 36.97894806080412, "learning_rate": 8.287671232876712e-07, "loss": 0.5175, "step": 150 }, { "epoch": 1.0921501706484642, "grad_norm": 18.0897726845655, "learning_rate": 8.173515981735159e-07, "loss": 0.3988, "step": 160 }, { "epoch": 1.1604095563139931, "grad_norm": 53.788209383918755, "learning_rate": 8.059360730593608e-07, "loss": 0.3701, "step": 170 }, { "epoch": 1.2286689419795223, "grad_norm": 34.202912199875364, "learning_rate": 7.945205479452054e-07, "loss": 0.4629, "step": 180 }, { "epoch": 1.2969283276450512, "grad_norm": 32.334790445469515, "learning_rate": 7.831050228310501e-07, "loss": 0.4946, "step": 190 }, { "epoch": 1.36518771331058, "grad_norm": 34.57788598902016, "learning_rate": 7.71689497716895e-07, "loss": 0.5149, "step": 200 }, { "epoch": 1.4334470989761092, "grad_norm": 40.59898497248684, "learning_rate": 7.602739726027397e-07, "loss": 0.4565, "step": 210 }, { "epoch": 1.5017064846416384, "grad_norm": 28.000076288707174, "learning_rate": 7.488584474885844e-07, "loss": 0.4383, "step": 220 }, { "epoch": 1.5699658703071673, "grad_norm": 28.11781200722797, "learning_rate": 7.374429223744292e-07, "loss": 0.3446, "step": 230 }, { "epoch": 1.6382252559726962, "grad_norm": 21.457173259928325, "learning_rate": 7.260273972602739e-07, "loss": 0.4206, "step": 240 }, { "epoch": 1.7064846416382253, "grad_norm": 11.954122609487266, "learning_rate": 7.146118721461188e-07, "loss": 0.4117, "step": 250 }, { "epoch": 1.7747440273037542, "grad_norm": 25.454890078823407, "learning_rate": 7.031963470319634e-07, "loss": 0.3557, "step": 260 }, { "epoch": 1.8430034129692832, "grad_norm": 59.00305506329269, "learning_rate": 6.917808219178081e-07, "loss": 0.4312, "step": 270 }, { "epoch": 1.9112627986348123, "grad_norm": 19.693798816454596, "learning_rate": 6.80365296803653e-07, "loss": 0.3798, "step": 280 }, { "epoch": 1.9795221843003414, "grad_norm": 31.09835580820732, "learning_rate": 6.689497716894977e-07, "loss": 0.4721, "step": 290 }, { "epoch": 1.993174061433447, "eval_loss": 0.6094198226928711, "eval_runtime": 10.038, "eval_samples_per_second": 25.901, "eval_steps_per_second": 3.287, "step": 292 }, { "epoch": 2.04778156996587, "grad_norm": 14.148404582312557, "learning_rate": 6.575342465753423e-07, "loss": 0.3175, "step": 300 }, { "epoch": 2.1160409556313993, "grad_norm": 9.302005033170696, "learning_rate": 6.461187214611872e-07, "loss": 0.3385, "step": 310 }, { "epoch": 2.1843003412969284, "grad_norm": 23.055786559150018, "learning_rate": 6.347031963470319e-07, "loss": 0.2497, "step": 320 }, { "epoch": 2.252559726962457, "grad_norm": 15.965787246539906, "learning_rate": 6.232876712328768e-07, "loss": 0.2529, "step": 330 }, { "epoch": 2.3208191126279862, "grad_norm": 19.137778271986814, "learning_rate": 6.118721461187214e-07, "loss": 0.2681, "step": 340 }, { "epoch": 2.3890784982935154, "grad_norm": 43.528989020233965, "learning_rate": 6.004566210045661e-07, "loss": 0.1969, "step": 350 }, { "epoch": 2.4573378839590445, "grad_norm": 38.488063777446804, "learning_rate": 5.89041095890411e-07, "loss": 0.244, "step": 360 }, { "epoch": 2.5255972696245736, "grad_norm": 19.9306913959441, "learning_rate": 5.776255707762557e-07, "loss": 0.3198, "step": 370 }, { "epoch": 2.5938566552901023, "grad_norm": 19.0407162824872, "learning_rate": 5.662100456621004e-07, "loss": 0.2783, "step": 380 }, { "epoch": 2.6621160409556315, "grad_norm": 16.817106035170273, "learning_rate": 5.547945205479452e-07, "loss": 0.2687, "step": 390 }, { "epoch": 2.73037542662116, "grad_norm": 27.440144369018864, "learning_rate": 5.433789954337899e-07, "loss": 0.2999, "step": 400 }, { "epoch": 2.7986348122866893, "grad_norm": 26.842832940417683, "learning_rate": 5.319634703196348e-07, "loss": 0.3246, "step": 410 }, { "epoch": 2.8668941979522184, "grad_norm": 52.24386186460223, "learning_rate": 5.205479452054794e-07, "loss": 0.2099, "step": 420 }, { "epoch": 2.9351535836177476, "grad_norm": 39.20725213578627, "learning_rate": 5.091324200913241e-07, "loss": 0.3482, "step": 430 }, { "epoch": 2.9897610921501707, "eval_loss": 0.6506538391113281, "eval_runtime": 10.0544, "eval_samples_per_second": 25.859, "eval_steps_per_second": 3.282, "step": 438 } ], "logging_steps": 10, "max_steps": 876, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 146, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6392788746240.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }