m2m100_rup_ur_to_rur / trainer_state.json
Muhammad Umer Tariq Butt
Upload M2M100 model for ur to rur trained on rup
f09d023
{
"best_metric": 0.024393858388066292,
"best_model_checkpoint": "/netscratch/butt/Transliterate/RUP/finetuning/data/output_models/with_ur_rur_pretraining/m2m100_ur_rur/checkpoint-98957",
"epoch": 4.0,
"eval_steps": 500,
"global_step": 98957,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.16168638903766283,
"grad_norm": 0.7672635912895203,
"learning_rate": 1.7955627413994434e-06,
"loss": 3.2537,
"step": 4000
},
{
"epoch": 0.32337277807532566,
"grad_norm": 0.197320356965065,
"learning_rate": 3.592023713284829e-06,
"loss": 0.1013,
"step": 8000
},
{
"epoch": 0.48505916711298847,
"grad_norm": 0.1217622384428978,
"learning_rate": 5.388035569927243e-06,
"loss": 0.0622,
"step": 12000
},
{
"epoch": 0.6467455561506513,
"grad_norm": 0.13456104695796967,
"learning_rate": 7.184047426569658e-06,
"loss": 0.0481,
"step": 16000
},
{
"epoch": 0.8084319451883141,
"grad_norm": 0.15323583781719208,
"learning_rate": 8.980059283212073e-06,
"loss": 0.0411,
"step": 20000
},
{
"epoch": 0.9701183342259769,
"grad_norm": 0.10961552709341049,
"learning_rate": 9.998163154071579e-06,
"loss": 0.0368,
"step": 24000
},
{
"epoch": 0.9999898946006851,
"eval_bleu_score": 89.54460906982422,
"eval_loss": 0.027495749294757843,
"eval_runtime": 95.4142,
"eval_samples_per_second": 47.163,
"eval_steps_per_second": 0.744,
"step": 24739
},
{
"epoch": 1.1318047232636397,
"grad_norm": 0.0804838314652443,
"learning_rate": 9.979866450572228e-06,
"loss": 0.0339,
"step": 28000
},
{
"epoch": 1.2934911123013026,
"grad_norm": 0.08419705927371979,
"learning_rate": 9.941985543338884e-06,
"loss": 0.0321,
"step": 32000
},
{
"epoch": 1.4551775013389654,
"grad_norm": 0.06364738196134567,
"learning_rate": 9.88469246388591e-06,
"loss": 0.0311,
"step": 36000
},
{
"epoch": 1.6168638903766284,
"grad_norm": 0.19403564929962158,
"learning_rate": 9.808226815651367e-06,
"loss": 0.0301,
"step": 40000
},
{
"epoch": 1.778550279414291,
"grad_norm": 0.08095328509807587,
"learning_rate": 9.712850825850488e-06,
"loss": 0.0295,
"step": 44000
},
{
"epoch": 1.9402366684519539,
"grad_norm": 0.05232414975762367,
"learning_rate": 9.598956005105698e-06,
"loss": 0.029,
"step": 48000
},
{
"epoch": 1.9999797892013702,
"eval_bleu_score": 89.95458984375,
"eval_loss": 0.02493358589708805,
"eval_runtime": 92.9895,
"eval_samples_per_second": 48.393,
"eval_steps_per_second": 0.764,
"step": 49478
},
{
"epoch": 2.101923057489617,
"grad_norm": 0.1178918406367302,
"learning_rate": 9.467060328243924e-06,
"loss": 0.0284,
"step": 52000
},
{
"epoch": 2.2636094465272794,
"grad_norm": 0.06169985607266426,
"learning_rate": 9.317550119060927e-06,
"loss": 0.028,
"step": 56000
},
{
"epoch": 2.4252958355649423,
"grad_norm": 0.05251774191856384,
"learning_rate": 9.151118083407196e-06,
"loss": 0.0283,
"step": 60000
},
{
"epoch": 2.5869822246026053,
"grad_norm": 0.1015240028500557,
"learning_rate": 8.968287285850323e-06,
"loss": 0.0275,
"step": 64000
},
{
"epoch": 2.748668613640268,
"grad_norm": 0.06474316865205765,
"learning_rate": 8.76990706667961e-06,
"loss": 0.0273,
"step": 68000
},
{
"epoch": 2.910355002677931,
"grad_norm": 0.07183582335710526,
"learning_rate": 8.556823439595787e-06,
"loss": 0.0272,
"step": 72000
},
{
"epoch": 2.9999696838020555,
"eval_bleu_score": 90.0082015991211,
"eval_loss": 0.02451913431286812,
"eval_runtime": 92.6435,
"eval_samples_per_second": 48.573,
"eval_steps_per_second": 0.766,
"step": 74217
},
{
"epoch": 3.0720413917155938,
"grad_norm": 0.04460311308503151,
"learning_rate": 8.329660528736868e-06,
"loss": 0.0268,
"step": 76000
},
{
"epoch": 3.2337277807532563,
"grad_norm": 0.047645051032304764,
"learning_rate": 8.08941394035222e-06,
"loss": 0.0266,
"step": 80000
},
{
"epoch": 3.3954141697909193,
"grad_norm": 0.04565703496336937,
"learning_rate": 7.83709225538658e-06,
"loss": 0.0264,
"step": 84000
},
{
"epoch": 3.5571005588285822,
"grad_norm": 0.06948993355035782,
"learning_rate": 7.573493564289276e-06,
"loss": 0.0264,
"step": 88000
},
{
"epoch": 3.7187869478662448,
"grad_norm": 0.041806410998106,
"learning_rate": 7.2998470510947005e-06,
"loss": 0.0263,
"step": 92000
},
{
"epoch": 3.8804733369039077,
"grad_norm": 0.04544525593519211,
"learning_rate": 7.017306895492585e-06,
"loss": 0.0262,
"step": 96000
},
{
"epoch": 4.0,
"eval_bleu_score": 90.16093444824219,
"eval_loss": 0.024393858388066292,
"eval_runtime": 93.7037,
"eval_samples_per_second": 48.024,
"eval_steps_per_second": 0.758,
"step": 98957
}
],
"logging_steps": 4000,
"max_steps": 222651,
"num_input_tokens_seen": 0,
"num_train_epochs": 9,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.352425921214939e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}