{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9897610921501707,
  "eval_steps": 146,
  "global_step": 438,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.06825938566552901,
      "grad_norm": 121.90807294112876,
      "learning_rate": 9.885844748858448e-07,
      "loss": 2.2874,
      "step": 10
    },
    {
      "epoch": 0.13651877133105803,
      "grad_norm": 50.17053366859769,
      "learning_rate": 9.771689497716894e-07,
      "loss": 0.9472,
      "step": 20
    },
    {
      "epoch": 0.20477815699658702,
      "grad_norm": 26.596625414659396,
      "learning_rate": 9.657534246575343e-07,
      "loss": 0.9087,
      "step": 30
    },
    {
      "epoch": 0.27303754266211605,
      "grad_norm": 66.69994801604244,
      "learning_rate": 9.54337899543379e-07,
      "loss": 0.8543,
      "step": 40
    },
    {
      "epoch": 0.3412969283276451,
      "grad_norm": 48.76266229603266,
      "learning_rate": 9.429223744292237e-07,
      "loss": 0.6994,
      "step": 50
    },
    {
      "epoch": 0.40955631399317405,
      "grad_norm": 52.043092597792516,
      "learning_rate": 9.315068493150684e-07,
      "loss": 0.7693,
      "step": 60
    },
    {
      "epoch": 0.4778156996587031,
      "grad_norm": 98.55471713268969,
      "learning_rate": 9.200913242009132e-07,
      "loss": 0.7138,
      "step": 70
    },
    {
      "epoch": 0.5460750853242321,
      "grad_norm": 78.48196775888992,
      "learning_rate": 9.08675799086758e-07,
      "loss": 0.7903,
      "step": 80
    },
    {
      "epoch": 0.6143344709897611,
      "grad_norm": 84.85124588773758,
      "learning_rate": 8.972602739726027e-07,
      "loss": 0.7586,
      "step": 90
    },
    {
      "epoch": 0.6825938566552902,
      "grad_norm": 58.99018773695002,
      "learning_rate": 8.858447488584474e-07,
      "loss": 0.663,
      "step": 100
    },
    {
      "epoch": 0.7508532423208191,
      "grad_norm": 39.94579058556075,
      "learning_rate": 8.744292237442922e-07,
      "loss": 0.589,
      "step": 110
    },
    {
      "epoch": 0.8191126279863481,
      "grad_norm": 58.372841216401504,
      "learning_rate": 8.63013698630137e-07,
      "loss": 0.699,
      "step": 120
    },
    {
      "epoch": 0.8873720136518771,
      "grad_norm": 43.360346848406074,
      "learning_rate": 8.515981735159817e-07,
      "loss": 0.6512,
      "step": 130
    },
    {
      "epoch": 0.9556313993174061,
      "grad_norm": 61.008846083100586,
      "learning_rate": 8.401826484018264e-07,
      "loss": 0.7701,
      "step": 140
    },
    {
      "epoch": 0.9965870307167235,
      "eval_loss": 0.619476318359375,
      "eval_runtime": 10.3711,
      "eval_samples_per_second": 25.07,
      "eval_steps_per_second": 3.182,
      "step": 146
    },
    {
      "epoch": 1.023890784982935,
      "grad_norm": 36.97894806080412,
      "learning_rate": 8.287671232876712e-07,
      "loss": 0.5175,
      "step": 150
    },
    {
      "epoch": 1.0921501706484642,
      "grad_norm": 18.0897726845655,
      "learning_rate": 8.173515981735159e-07,
      "loss": 0.3988,
      "step": 160
    },
    {
      "epoch": 1.1604095563139931,
      "grad_norm": 53.788209383918755,
      "learning_rate": 8.059360730593608e-07,
      "loss": 0.3701,
      "step": 170
    },
    {
      "epoch": 1.2286689419795223,
      "grad_norm": 34.202912199875364,
      "learning_rate": 7.945205479452054e-07,
      "loss": 0.4629,
      "step": 180
    },
    {
      "epoch": 1.2969283276450512,
      "grad_norm": 32.334790445469515,
      "learning_rate": 7.831050228310501e-07,
      "loss": 0.4946,
      "step": 190
    },
    {
      "epoch": 1.36518771331058,
      "grad_norm": 34.57788598902016,
      "learning_rate": 7.71689497716895e-07,
      "loss": 0.5149,
      "step": 200
    },
    {
      "epoch": 1.4334470989761092,
      "grad_norm": 40.59898497248684,
      "learning_rate": 7.602739726027397e-07,
      "loss": 0.4565,
      "step": 210
    },
    {
      "epoch": 1.5017064846416384,
      "grad_norm": 28.000076288707174,
      "learning_rate": 7.488584474885844e-07,
      "loss": 0.4383,
      "step": 220
    },
    {
      "epoch": 1.5699658703071673,
      "grad_norm": 28.11781200722797,
      "learning_rate": 7.374429223744292e-07,
      "loss": 0.3446,
      "step": 230
    },
    {
      "epoch": 1.6382252559726962,
      "grad_norm": 21.457173259928325,
      "learning_rate": 7.260273972602739e-07,
      "loss": 0.4206,
      "step": 240
    },
    {
      "epoch": 1.7064846416382253,
      "grad_norm": 11.954122609487266,
      "learning_rate": 7.146118721461188e-07,
      "loss": 0.4117,
      "step": 250
    },
    {
      "epoch": 1.7747440273037542,
      "grad_norm": 25.454890078823407,
      "learning_rate": 7.031963470319634e-07,
      "loss": 0.3557,
      "step": 260
    },
    {
      "epoch": 1.8430034129692832,
      "grad_norm": 59.00305506329269,
      "learning_rate": 6.917808219178081e-07,
      "loss": 0.4312,
      "step": 270
    },
    {
      "epoch": 1.9112627986348123,
      "grad_norm": 19.693798816454596,
      "learning_rate": 6.80365296803653e-07,
      "loss": 0.3798,
      "step": 280
    },
    {
      "epoch": 1.9795221843003414,
      "grad_norm": 31.09835580820732,
      "learning_rate": 6.689497716894977e-07,
      "loss": 0.4721,
      "step": 290
    },
    {
      "epoch": 1.993174061433447,
      "eval_loss": 0.6094198226928711,
      "eval_runtime": 10.038,
      "eval_samples_per_second": 25.901,
      "eval_steps_per_second": 3.287,
      "step": 292
    },
    {
      "epoch": 2.04778156996587,
      "grad_norm": 14.148404582312557,
      "learning_rate": 6.575342465753423e-07,
      "loss": 0.3175,
      "step": 300
    },
    {
      "epoch": 2.1160409556313993,
      "grad_norm": 9.302005033170696,
      "learning_rate": 6.461187214611872e-07,
      "loss": 0.3385,
      "step": 310
    },
    {
      "epoch": 2.1843003412969284,
      "grad_norm": 23.055786559150018,
      "learning_rate": 6.347031963470319e-07,
      "loss": 0.2497,
      "step": 320
    },
    {
      "epoch": 2.252559726962457,
      "grad_norm": 15.965787246539906,
      "learning_rate": 6.232876712328768e-07,
      "loss": 0.2529,
      "step": 330
    },
    {
      "epoch": 2.3208191126279862,
      "grad_norm": 19.137778271986814,
      "learning_rate": 6.118721461187214e-07,
      "loss": 0.2681,
      "step": 340
    },
    {
      "epoch": 2.3890784982935154,
      "grad_norm": 43.528989020233965,
      "learning_rate": 6.004566210045661e-07,
      "loss": 0.1969,
      "step": 350
    },
    {
      "epoch": 2.4573378839590445,
      "grad_norm": 38.488063777446804,
      "learning_rate": 5.89041095890411e-07,
      "loss": 0.244,
      "step": 360
    },
    {
      "epoch": 2.5255972696245736,
      "grad_norm": 19.9306913959441,
      "learning_rate": 5.776255707762557e-07,
      "loss": 0.3198,
      "step": 370
    },
    {
      "epoch": 2.5938566552901023,
      "grad_norm": 19.0407162824872,
      "learning_rate": 5.662100456621004e-07,
      "loss": 0.2783,
      "step": 380
    },
    {
      "epoch": 2.6621160409556315,
      "grad_norm": 16.817106035170273,
      "learning_rate": 5.547945205479452e-07,
      "loss": 0.2687,
      "step": 390
    },
    {
      "epoch": 2.73037542662116,
      "grad_norm": 27.440144369018864,
      "learning_rate": 5.433789954337899e-07,
      "loss": 0.2999,
      "step": 400
    },
    {
      "epoch": 2.7986348122866893,
      "grad_norm": 26.842832940417683,
      "learning_rate": 5.319634703196348e-07,
      "loss": 0.3246,
      "step": 410
    },
    {
      "epoch": 2.8668941979522184,
      "grad_norm": 52.24386186460223,
      "learning_rate": 5.205479452054794e-07,
      "loss": 0.2099,
      "step": 420
    },
    {
      "epoch": 2.9351535836177476,
      "grad_norm": 39.20725213578627,
      "learning_rate": 5.091324200913241e-07,
      "loss": 0.3482,
      "step": 430
    },
    {
      "epoch": 2.9897610921501707,
      "eval_loss": 0.6506538391113281,
      "eval_runtime": 10.0544,
      "eval_samples_per_second": 25.859,
      "eval_steps_per_second": 3.282,
      "step": 438
    }
  ],
  "logging_steps": 10,
  "max_steps": 876,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 146,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6392788746240.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}