{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9897610921501707, "eval_steps": 146, "global_step": 438, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06825938566552901, "grad_norm": 180.72653528520956, "learning_rate": 4.942922374429224e-07, "loss": 3.2191, "step": 10 }, { "epoch": 0.13651877133105803, "grad_norm": 67.78875632967043, "learning_rate": 4.885844748858447e-07, "loss": 1.3006, "step": 20 }, { "epoch": 0.20477815699658702, "grad_norm": 58.498874329827956, "learning_rate": 4.828767123287671e-07, "loss": 1.0222, "step": 30 }, { "epoch": 0.27303754266211605, "grad_norm": 65.81012271745634, "learning_rate": 4.771689497716894e-07, "loss": 1.0702, "step": 40 }, { "epoch": 0.3412969283276451, "grad_norm": 67.6868122468499, "learning_rate": 4.7146118721461187e-07, "loss": 0.8233, "step": 50 }, { "epoch": 0.40955631399317405, "grad_norm": 130.6895452398653, "learning_rate": 4.657534246575342e-07, "loss": 1.0114, "step": 60 }, { "epoch": 0.4778156996587031, "grad_norm": 67.10774835755684, "learning_rate": 4.600456621004566e-07, "loss": 0.7751, "step": 70 }, { "epoch": 0.5460750853242321, "grad_norm": 222.42047329326795, "learning_rate": 4.54337899543379e-07, "loss": 0.9296, "step": 80 }, { "epoch": 0.6143344709897611, "grad_norm": 53.90173100677448, "learning_rate": 4.4863013698630134e-07, "loss": 0.8508, "step": 90 }, { "epoch": 0.6825938566552902, "grad_norm": 73.87543764538118, "learning_rate": 4.429223744292237e-07, "loss": 0.7461, "step": 100 }, { "epoch": 0.7508532423208191, "grad_norm": 51.730677807091766, "learning_rate": 4.372146118721461e-07, "loss": 0.6538, "step": 110 }, { "epoch": 0.8191126279863481, "grad_norm": 58.532049779965085, "learning_rate": 4.315068493150685e-07, "loss": 0.7099, "step": 120 }, { "epoch": 0.8873720136518771, "grad_norm": 69.81205859668461, "learning_rate": 4.2579908675799087e-07, "loss": 0.7059, "step": 130 }, { "epoch": 0.9556313993174061, "grad_norm": 48.161843074686466, "learning_rate": 4.200913242009132e-07, "loss": 0.8547, "step": 140 }, { "epoch": 0.9965870307167235, "eval_loss": 0.6474742889404297, "eval_runtime": 10.4556, "eval_samples_per_second": 24.867, "eval_steps_per_second": 3.156, "step": 146 }, { "epoch": 1.023890784982935, "grad_norm": 27.25525751270437, "learning_rate": 4.143835616438356e-07, "loss": 0.6076, "step": 150 }, { "epoch": 1.0921501706484642, "grad_norm": 25.863617719976116, "learning_rate": 4.0867579908675797e-07, "loss": 0.5054, "step": 160 }, { "epoch": 1.1604095563139931, "grad_norm": 52.70780610444248, "learning_rate": 4.029680365296804e-07, "loss": 0.4987, "step": 170 }, { "epoch": 1.2286689419795223, "grad_norm": 25.718837256960583, "learning_rate": 3.972602739726027e-07, "loss": 0.5697, "step": 180 }, { "epoch": 1.2969283276450512, "grad_norm": 36.98618805648008, "learning_rate": 3.915525114155251e-07, "loss": 0.615, "step": 190 }, { "epoch": 1.36518771331058, "grad_norm": 32.58402570740449, "learning_rate": 3.858447488584475e-07, "loss": 0.5849, "step": 200 }, { "epoch": 1.4334470989761092, "grad_norm": 23.37886970180941, "learning_rate": 3.8013698630136986e-07, "loss": 0.5924, "step": 210 }, { "epoch": 1.5017064846416384, "grad_norm": 27.73682160230221, "learning_rate": 3.744292237442922e-07, "loss": 0.5095, "step": 220 }, { "epoch": 1.5699658703071673, "grad_norm": 58.838640484256175, "learning_rate": 3.687214611872146e-07, "loss": 0.4661, "step": 230 }, { "epoch": 1.6382252559726962, "grad_norm": 29.677456691329837, "learning_rate": 3.6301369863013697e-07, "loss": 0.5448, "step": 240 }, { "epoch": 1.7064846416382253, "grad_norm": 83.73694041166883, "learning_rate": 3.573059360730594e-07, "loss": 0.4715, "step": 250 }, { "epoch": 1.7747440273037542, "grad_norm": 72.88041601319203, "learning_rate": 3.515981735159817e-07, "loss": 0.4408, "step": 260 }, { "epoch": 1.8430034129692832, "grad_norm": 48.86805911410425, "learning_rate": 3.4589041095890407e-07, "loss": 0.5299, "step": 270 }, { "epoch": 1.9112627986348123, "grad_norm": 56.39227585073634, "learning_rate": 3.401826484018265e-07, "loss": 0.462, "step": 280 }, { "epoch": 1.9795221843003414, "grad_norm": 47.10405902113264, "learning_rate": 3.3447488584474886e-07, "loss": 0.5135, "step": 290 }, { "epoch": 1.993174061433447, "eval_loss": 0.6217488646507263, "eval_runtime": 10.0993, "eval_samples_per_second": 25.744, "eval_steps_per_second": 3.268, "step": 292 }, { "epoch": 2.04778156996587, "grad_norm": 47.53720316702887, "learning_rate": 3.287671232876712e-07, "loss": 0.4304, "step": 300 }, { "epoch": 2.1160409556313993, "grad_norm": 29.27611194113683, "learning_rate": 3.230593607305936e-07, "loss": 0.4024, "step": 310 }, { "epoch": 2.1843003412969284, "grad_norm": 22.633855883872762, "learning_rate": 3.1735159817351596e-07, "loss": 0.3316, "step": 320 }, { "epoch": 2.252559726962457, "grad_norm": 33.60546914801217, "learning_rate": 3.116438356164384e-07, "loss": 0.3474, "step": 330 }, { "epoch": 2.3208191126279862, "grad_norm": 38.727812421442835, "learning_rate": 3.059360730593607e-07, "loss": 0.3546, "step": 340 }, { "epoch": 2.3890784982935154, "grad_norm": 41.67978939810039, "learning_rate": 3.0022831050228307e-07, "loss": 0.2542, "step": 350 }, { "epoch": 2.4573378839590445, "grad_norm": 77.57463923271911, "learning_rate": 2.945205479452055e-07, "loss": 0.2766, "step": 360 }, { "epoch": 2.5255972696245736, "grad_norm": 59.43311669158369, "learning_rate": 2.8881278538812786e-07, "loss": 0.44, "step": 370 }, { "epoch": 2.5938566552901023, "grad_norm": 22.434362950673165, "learning_rate": 2.831050228310502e-07, "loss": 0.2935, "step": 380 }, { "epoch": 2.6621160409556315, "grad_norm": 30.626819579935223, "learning_rate": 2.773972602739726e-07, "loss": 0.3208, "step": 390 }, { "epoch": 2.73037542662116, "grad_norm": 64.9106394772347, "learning_rate": 2.7168949771689496e-07, "loss": 0.41, "step": 400 }, { "epoch": 2.7986348122866893, "grad_norm": 112.81858040393827, "learning_rate": 2.659817351598174e-07, "loss": 0.4234, "step": 410 }, { "epoch": 2.8668941979522184, "grad_norm": 25.381208288443705, "learning_rate": 2.602739726027397e-07, "loss": 0.3328, "step": 420 }, { "epoch": 2.9351535836177476, "grad_norm": 45.82561826818154, "learning_rate": 2.5456621004566206e-07, "loss": 0.4019, "step": 430 }, { "epoch": 2.9897610921501707, "eval_loss": 0.64452064037323, "eval_runtime": 10.0372, "eval_samples_per_second": 25.904, "eval_steps_per_second": 3.288, "step": 438 } ], "logging_steps": 10, "max_steps": 876, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 146, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6392788746240.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }