Veronica-Polymorphic 551M — Pretrained v1

Browse files

Files changed (8) hide show

added_tokens.json +16 -0
config.json +32 -0
generation_config.json +9 -0
merges.txt +0 -0
pytorch_model.bin +3 -0
trainer_state.json +2657 -0
training_args.bin +3 -0
vocab.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "<|assistant|>": 50261,
+  "<|bos|>": 50257,
+  "<|code|>": 50268,
+  "<|document|>": 50264,
+  "<|endofchunk|>": 50270,
+  "<|observation|>": 50263,
+  "<|pad|>": 50258,
+  "<|retrieval|>": 50265,
+  "<|scratchpad|>": 50266,
+  "<|sep|>": 50269,
+  "<|system|>": 50259,
+  "<|thought|>": 50267,
+  "<|tool|>": 50262,
+  "<|user|>": 50260
+}

config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_comment": "Ultra-deep reasoning config: 24L/12H/768d, mlp_mult=4. Maximum reasoning depth for RAG + multi-step inference. ~11-12GB VRAM.",
+  "architectures": [
+    "VeronicaForCausalLM"
+  ],
+  "bos_token_id": 50257,
+  "dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": 50256,
+  "gradient_checkpointing": true,
+  "hidden_size": 768,
+  "layer_norm_epsilon": 1e-05,
+  "max_position_embeddings": 2048,
+  "mlp_mult": 4,
+  "model_type": "veronica",
+  "n_embd": 768,
+  "n_head": 12,
+  "n_layer": 24,
+  "num_attention_heads": 12,
+  "num_funcs": 3,
+  "num_hidden_layers": 24,
+  "pad_token_id": 50258,
+  "rope_theta": 10000.0,
+  "router_aux_weight": 0.015999763124389305,
+  "router_dim": 128,
+  "router_tau": 1.400028804954452,
+  "transformers_version": "4.57.0.dev0",
+  "use_cache": false,
+  "use_channel_attention": false,
+  "use_flash_attn": true,
+  "vocab_size": 50271
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50257,
+  "eos_token_id": [
+    50256
+  ],
+  "pad_token_id": 50258,
+  "transformers_version": "4.57.0.dev0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:809ed2aff4a95bff277106992f002be0e789ad07b108eb62ca33dcd9e6952879
+size 1102824111

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2657 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9999925976919604,
+  "eval_steps": 1000,
+  "global_step": 33773,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.002960923215858705,
+      "grad_norm": 41.32149887084961,
+      "learning_rate": 1.98e-06,
+      "loss": 87.7468,
+      "step": 100
+    },
+    {
+      "epoch": 0.00592184643171741,
+      "grad_norm": 39.51910400390625,
+      "learning_rate": 3.98e-06,
+      "loss": 85.766,
+      "step": 200
+    },
+    {
+      "epoch": 0.008882769647576115,
+      "grad_norm": 26.557823181152344,
+      "learning_rate": 5.98e-06,
+      "loss": 80.1376,
+      "step": 300
+    },
+    {
+      "epoch": 0.01184369286343482,
+      "grad_norm": 21.80652618408203,
+      "learning_rate": 7.98e-06,
+      "loss": 74.3306,
+      "step": 400
+    },
+    {
+      "epoch": 0.014804616079293524,
+      "grad_norm": 16.2612247467041,
+      "learning_rate": 9.980000000000001e-06,
+      "loss": 72.3247,
+      "step": 500
+    },
+    {
+      "epoch": 0.01776553929515223,
+      "grad_norm": 17.281190872192383,
+      "learning_rate": 1.198e-05,
+      "loss": 71.0703,
+      "step": 600
+    },
+    {
+      "epoch": 0.020726462511010933,
+      "grad_norm": 16.140579223632812,
+      "learning_rate": 1.3980000000000002e-05,
+      "loss": 69.5824,
+      "step": 700
+    },
+    {
+      "epoch": 0.02368738572686964,
+      "grad_norm": 13.456184387207031,
+      "learning_rate": 1.598e-05,
+      "loss": 67.2135,
+      "step": 800
+    },
+    {
+      "epoch": 0.026648308942728342,
+      "grad_norm": 11.778711318969727,
+      "learning_rate": 1.798e-05,
+      "loss": 65.1397,
+      "step": 900
+    },
+    {
+      "epoch": 0.02960923215858705,
+      "grad_norm": 11.965922355651855,
+      "learning_rate": 1.9980000000000002e-05,
+      "loss": 63.204,
+      "step": 1000
+    },
+    {
+      "epoch": 0.02960923215858705,
+      "eval_loss": 7.691287040710449,
+      "eval_runtime": 37.9485,
+      "eval_samples_per_second": 28.486,
+      "eval_steps_per_second": 7.141,
+      "step": 1000
+    },
+    {
+      "epoch": 0.032570155374445756,
+      "grad_norm": 8.482980728149414,
+      "learning_rate": 2.198e-05,
+      "loss": 60.9402,
+      "step": 1100
+    },
+    {
+      "epoch": 0.03553107859030446,
+      "grad_norm": 49.948341369628906,
+      "learning_rate": 2.398e-05,
+      "loss": 58.9346,
+      "step": 1200
+    },
+    {
+      "epoch": 0.03849200180616316,
+      "grad_norm": 10.039616584777832,
+      "learning_rate": 2.5980000000000002e-05,
+      "loss": 57.309,
+      "step": 1300
+    },
+    {
+      "epoch": 0.041452925022021865,
+      "grad_norm": 8.930785179138184,
+      "learning_rate": 2.798e-05,
+      "loss": 56.3741,
+      "step": 1400
+    },
+    {
+      "epoch": 0.04441384823788057,
+      "grad_norm": 8.14844036102295,
+      "learning_rate": 2.998e-05,
+      "loss": 55.8969,
+      "step": 1500
+    },
+    {
+      "epoch": 0.04737477145373928,
+      "grad_norm": 16.170246124267578,
+      "learning_rate": 3.198e-05,
+      "loss": 55.12,
+      "step": 1600
+    },
+    {
+      "epoch": 0.05033569466959798,
+      "grad_norm": 10.525145530700684,
+      "learning_rate": 3.398e-05,
+      "loss": 54.5077,
+      "step": 1700
+    },
+    {
+      "epoch": 0.053296617885456685,
+      "grad_norm": 29.427160263061523,
+      "learning_rate": 3.5980000000000004e-05,
+      "loss": 54.2087,
+      "step": 1800
+    },
+    {
+      "epoch": 0.05625754110131539,
+      "grad_norm": 47.125083923339844,
+      "learning_rate": 3.798e-05,
+      "loss": 53.4388,
+      "step": 1900
+    },
+    {
+      "epoch": 0.0592184643171741,
+      "grad_norm": 10.07633113861084,
+      "learning_rate": 3.998e-05,
+      "loss": 53.5798,
+      "step": 2000
+    },
+    {
+      "epoch": 0.0592184643171741,
+      "eval_loss": 6.5744218826293945,
+      "eval_runtime": 38.8407,
+      "eval_samples_per_second": 27.832,
+      "eval_steps_per_second": 6.977,
+      "step": 2000
+    },
+    {
+      "epoch": 0.0621793875330328,
+      "grad_norm": 10.918025970458984,
+      "learning_rate": 4.198e-05,
+      "loss": 52.5816,
+      "step": 2100
+    },
+    {
+      "epoch": 0.06514031074889151,
+      "grad_norm": 12.68106460571289,
+      "learning_rate": 4.398e-05,
+      "loss": 52.6681,
+      "step": 2200
+    },
+    {
+      "epoch": 0.06810123396475021,
+      "grad_norm": 15.503605842590332,
+      "learning_rate": 4.598e-05,
+      "loss": 52.5443,
+      "step": 2300
+    },
+    {
+      "epoch": 0.07106215718060892,
+      "grad_norm": 10.995290756225586,
+      "learning_rate": 4.798e-05,
+      "loss": 51.4841,
+      "step": 2400
+    },
+    {
+      "epoch": 0.07402308039646761,
+      "grad_norm": 22.71038055419922,
+      "learning_rate": 4.998e-05,
+      "loss": 51.05,
+      "step": 2500
+    },
+    {
+      "epoch": 0.07698400361232632,
+      "grad_norm": 11.035310745239258,
+      "learning_rate": 5.198e-05,
+      "loss": 50.8273,
+      "step": 2600
+    },
+    {
+      "epoch": 0.07994492682818503,
+      "grad_norm": 14.107246398925781,
+      "learning_rate": 5.398e-05,
+      "loss": 50.9043,
+      "step": 2700
+    },
+    {
+      "epoch": 0.08290585004404373,
+      "grad_norm": 11.570377349853516,
+      "learning_rate": 5.598e-05,
+      "loss": 50.4862,
+      "step": 2800
+    },
+    {
+      "epoch": 0.08586677325990244,
+      "grad_norm": 16.12681770324707,
+      "learning_rate": 5.7980000000000004e-05,
+      "loss": 50.0248,
+      "step": 2900
+    },
+    {
+      "epoch": 0.08882769647576114,
+      "grad_norm": 14.925129890441895,
+      "learning_rate": 5.9980000000000005e-05,
+      "loss": 49.789,
+      "step": 3000
+    },
+    {
+      "epoch": 0.08882769647576114,
+      "eval_loss": 6.141844272613525,
+      "eval_runtime": 38.2773,
+      "eval_samples_per_second": 28.241,
+      "eval_steps_per_second": 7.08,
+      "step": 3000
+    },
+    {
+      "epoch": 0.09178861969161985,
+      "grad_norm": 16.748519897460938,
+      "learning_rate": 6.198e-05,
+      "loss": 49.4085,
+      "step": 3100
+    },
+    {
+      "epoch": 0.09474954290747856,
+      "grad_norm": 12.9891939163208,
+      "learning_rate": 6.398000000000001e-05,
+      "loss": 49.1003,
+      "step": 3200
+    },
+    {
+      "epoch": 0.09771046612333725,
+      "grad_norm": 11.456365585327148,
+      "learning_rate": 6.598e-05,
+      "loss": 48.6685,
+      "step": 3300
+    },
+    {
+      "epoch": 0.10067138933919596,
+      "grad_norm": 19.153154373168945,
+      "learning_rate": 6.798e-05,
+      "loss": 48.3342,
+      "step": 3400
+    },
+    {
+      "epoch": 0.10363231255505466,
+      "grad_norm": 13.501580238342285,
+      "learning_rate": 6.998e-05,
+      "loss": 47.7641,
+      "step": 3500
+    },
+    {
+      "epoch": 0.10659323577091337,
+      "grad_norm": 14.034686088562012,
+      "learning_rate": 7.198e-05,
+      "loss": 47.8053,
+      "step": 3600
+    },
+    {
+      "epoch": 0.10955415898677208,
+      "grad_norm": 14.023058891296387,
+      "learning_rate": 7.398e-05,
+      "loss": 47.4925,
+      "step": 3700
+    },
+    {
+      "epoch": 0.11251508220263078,
+      "grad_norm": 16.410221099853516,
+      "learning_rate": 7.598e-05,
+      "loss": 47.1501,
+      "step": 3800
+    },
+    {
+      "epoch": 0.11547600541848949,
+      "grad_norm": 17.547571182250977,
+      "learning_rate": 7.798000000000001e-05,
+      "loss": 47.3132,
+      "step": 3900
+    },
+    {
+      "epoch": 0.1184369286343482,
+      "grad_norm": 11.753161430358887,
+      "learning_rate": 7.998e-05,
+      "loss": 47.071,
+      "step": 4000
+    },
+    {
+      "epoch": 0.1184369286343482,
+      "eval_loss": 5.719655990600586,
+      "eval_runtime": 38.3575,
+      "eval_samples_per_second": 28.182,
+      "eval_steps_per_second": 7.065,
+      "step": 4000
+    },
+    {
+      "epoch": 0.12139785185020689,
+      "grad_norm": 51.37761306762695,
+      "learning_rate": 8.198000000000001e-05,
+      "loss": 46.1728,
+      "step": 4100
+    },
+    {
+      "epoch": 0.1243587750660656,
+      "grad_norm": 12.61581802368164,
+      "learning_rate": 8.398e-05,
+      "loss": 45.9941,
+      "step": 4200
+    },
+    {
+      "epoch": 0.1273196982819243,
+      "grad_norm": 12.667135238647461,
+      "learning_rate": 8.598e-05,
+      "loss": 46.1649,
+      "step": 4300
+    },
+    {
+      "epoch": 0.13028062149778302,
+      "grad_norm": 10.368157386779785,
+      "learning_rate": 8.798e-05,
+      "loss": 45.9172,
+      "step": 4400
+    },
+    {
+      "epoch": 0.13324154471364172,
+      "grad_norm": 12.5702543258667,
+      "learning_rate": 8.998e-05,
+      "loss": 45.5541,
+      "step": 4500
+    },
+    {
+      "epoch": 0.13620246792950041,
+      "grad_norm": 10.535380363464355,
+      "learning_rate": 9.198e-05,
+      "loss": 44.45,
+      "step": 4600
+    },
+    {
+      "epoch": 0.1391633911453591,
+      "grad_norm": 11.035446166992188,
+      "learning_rate": 9.398e-05,
+      "loss": 44.3243,
+      "step": 4700
+    },
+    {
+      "epoch": 0.14212431436121784,
+      "grad_norm": 10.718255043029785,
+      "learning_rate": 9.598e-05,
+      "loss": 43.8677,
+      "step": 4800
+    },
+    {
+      "epoch": 0.14508523757707653,
+      "grad_norm": 9.816108703613281,
+      "learning_rate": 9.798000000000001e-05,
+      "loss": 43.6948,
+      "step": 4900
+    },
+    {
+      "epoch": 0.14804616079293523,
+      "grad_norm": 9.837696075439453,
+      "learning_rate": 9.998000000000002e-05,
+      "loss": 43.6361,
+      "step": 5000
+    },
+    {
+      "epoch": 0.14804616079293523,
+      "eval_loss": 5.329010486602783,
+      "eval_runtime": 38.0693,
+      "eval_samples_per_second": 28.396,
+      "eval_steps_per_second": 7.119,
+      "step": 5000
+    },
+    {
+      "epoch": 0.15100708400879395,
+      "grad_norm": 10.140490531921387,
+      "learning_rate": 0.00010198,
+      "loss": 42.8106,
+      "step": 5100
+    },
+    {
+      "epoch": 0.15396800722465265,
+      "grad_norm": 9.330647468566895,
+      "learning_rate": 0.00010398,
+      "loss": 42.4439,
+      "step": 5200
+    },
+    {
+      "epoch": 0.15692893044051134,
+      "grad_norm": 9.038117408752441,
+      "learning_rate": 0.00010598,
+      "loss": 41.8943,
+      "step": 5300
+    },
+    {
+      "epoch": 0.15988985365637007,
+      "grad_norm": 10.28738021850586,
+      "learning_rate": 0.00010798,
+      "loss": 41.5117,
+      "step": 5400
+    },
+    {
+      "epoch": 0.16285077687222876,
+      "grad_norm": 9.869328498840332,
+      "learning_rate": 0.00010998,
+      "loss": 41.3489,
+      "step": 5500
+    },
+    {
+      "epoch": 0.16581170008808746,
+      "grad_norm": 10.985088348388672,
+      "learning_rate": 0.00011198000000000001,
+      "loss": 40.7585,
+      "step": 5600
+    },
+    {
+      "epoch": 0.16877262330394618,
+      "grad_norm": 11.46516227722168,
+      "learning_rate": 0.00011398,
+      "loss": 40.2893,
+      "step": 5700
+    },
+    {
+      "epoch": 0.17173354651980488,
+      "grad_norm": 9.691688537597656,
+      "learning_rate": 0.00011598000000000001,
+      "loss": 40.0513,
+      "step": 5800
+    },
+    {
+      "epoch": 0.17469446973566358,
+      "grad_norm": 10.819178581237793,
+      "learning_rate": 0.00011798,
+      "loss": 39.986,
+      "step": 5900
+    },
+    {
+      "epoch": 0.17765539295152227,
+      "grad_norm": 9.694029808044434,
+      "learning_rate": 0.00011998,
+      "loss": 39.3918,
+      "step": 6000
+    },
+    {
+      "epoch": 0.17765539295152227,
+      "eval_loss": 4.900777339935303,
+      "eval_runtime": 38.052,
+      "eval_samples_per_second": 28.409,
+      "eval_steps_per_second": 7.122,
+      "step": 6000
+    },
+    {
+      "epoch": 0.180616316167381,
+      "grad_norm": 9.988055229187012,
+      "learning_rate": 0.00011999900481764066,
+      "loss": 39.336,
+      "step": 6100
+    },
+    {
+      "epoch": 0.1835772393832397,
+      "grad_norm": 13.242379188537598,
+      "learning_rate": 0.00011999597899343296,
+      "loss": 39.0612,
+      "step": 6200
+    },
+    {
+      "epoch": 0.1865381625990984,
+      "grad_norm": 13.935639381408691,
+      "learning_rate": 0.00011999092252825071,
+      "loss": 38.585,
+      "step": 6300
+    },
+    {
+      "epoch": 0.1894990858149571,
+      "grad_norm": 9.50368881225586,
+      "learning_rate": 0.00011998383559323646,
+      "loss": 38.3112,
+      "step": 6400
+    },
+    {
+      "epoch": 0.1924600090308158,
+      "grad_norm": 10.878887176513672,
+      "learning_rate": 0.00011997471842825661,
+      "loss": 38.3219,
+      "step": 6500
+    },
+    {
+      "epoch": 0.1954209322466745,
+      "grad_norm": 10.517402648925781,
+      "learning_rate": 0.00011996357134189334,
+      "loss": 37.8246,
+      "step": 6600
+    },
+    {
+      "epoch": 0.19838185546253323,
+      "grad_norm": 10.922290802001953,
+      "learning_rate": 0.0001199503947114341,
+      "loss": 37.6387,
+      "step": 6700
+    },
+    {
+      "epoch": 0.20134277867839193,
+      "grad_norm": 11.845630645751953,
+      "learning_rate": 0.00011993518898285887,
+      "loss": 37.8343,
+      "step": 6800
+    },
+    {
+      "epoch": 0.20430370189425062,
+      "grad_norm": 8.628484725952148,
+      "learning_rate": 0.00011991795467082508,
+      "loss": 37.5011,
+      "step": 6900
+    },
+    {
+      "epoch": 0.20726462511010932,
+      "grad_norm": 9.489052772521973,
+      "learning_rate": 0.00011989869235865012,
+      "loss": 37.132,
+      "step": 7000
+    },
+    {
+      "epoch": 0.20726462511010932,
+      "eval_loss": 4.595886707305908,
+      "eval_runtime": 38.0814,
+      "eval_samples_per_second": 28.387,
+      "eval_steps_per_second": 7.116,
+      "step": 7000
+    },
+    {
+      "epoch": 0.21022554832596804,
+      "grad_norm": 9.687568664550781,
+      "learning_rate": 0.00011987740269829175,
+      "loss": 36.9362,
+      "step": 7100
+    },
+    {
+      "epoch": 0.21318647154182674,
+      "grad_norm": 8.676931381225586,
+      "learning_rate": 0.0001198540864103258,
+      "loss": 37.0267,
+      "step": 7200
+    },
+    {
+      "epoch": 0.21614739475768543,
+      "grad_norm": 9.232645988464355,
+      "learning_rate": 0.00011982874428392204,
+      "loss": 36.5181,
+      "step": 7300
+    },
+    {
+      "epoch": 0.21910831797354416,
+      "grad_norm": 8.917469024658203,
+      "learning_rate": 0.00011980137717681727,
+      "loss": 36.5812,
+      "step": 7400
+    },
+    {
+      "epoch": 0.22206924118940286,
+      "grad_norm": 8.593257904052734,
+      "learning_rate": 0.0001197719860152864,
+      "loss": 36.0672,
+      "step": 7500
+    },
+    {
+      "epoch": 0.22503016440526155,
+      "grad_norm": 10.630696296691895,
+      "learning_rate": 0.00011974057179411103,
+      "loss": 36.2405,
+      "step": 7600
+    },
+    {
+      "epoch": 0.22799108762112028,
+      "grad_norm": 9.975415229797363,
+      "learning_rate": 0.00011970713557654582,
+      "loss": 35.9903,
+      "step": 7700
+    },
+    {
+      "epoch": 0.23095201083697897,
+      "grad_norm": 8.622698783874512,
+      "learning_rate": 0.00011967167849428251,
+      "loss": 35.8196,
+      "step": 7800
+    },
+    {
+      "epoch": 0.23391293405283767,
+      "grad_norm": 14.828067779541016,
+      "learning_rate": 0.00011963420174741161,
+      "loss": 35.7946,
+      "step": 7900
+    },
+    {
+      "epoch": 0.2368738572686964,
+      "grad_norm": 9.303028106689453,
+      "learning_rate": 0.00011959470660438173,
+      "loss": 35.5493,
+      "step": 8000
+    },
+    {
+      "epoch": 0.2368738572686964,
+      "eval_loss": 4.408100128173828,
+      "eval_runtime": 37.807,
+      "eval_samples_per_second": 28.593,
+      "eval_steps_per_second": 7.168,
+      "step": 8000
+    },
+    {
+      "epoch": 0.2398347804845551,
+      "grad_norm": 11.987268447875977,
+      "learning_rate": 0.00011955319440195674,
+      "loss": 35.6014,
+      "step": 8100
+    },
+    {
+      "epoch": 0.24279570370041378,
+      "grad_norm": 10.032620429992676,
+      "learning_rate": 0.00011950966654517043,
+      "loss": 35.5302,
+      "step": 8200
+    },
+    {
+      "epoch": 0.24575662691627248,
+      "grad_norm": 9.362653732299805,
+      "learning_rate": 0.00011946412450727906,
+      "loss": 35.2124,
+      "step": 8300
+    },
+    {
+      "epoch": 0.2487175501321312,
+      "grad_norm": 9.706056594848633,
+      "learning_rate": 0.00011941656982971138,
+      "loss": 34.9229,
+      "step": 8400
+    },
+    {
+      "epoch": 0.25167847334798993,
+      "grad_norm": 10.424148559570312,
+      "learning_rate": 0.00011936700412201653,
+      "loss": 35.1602,
+      "step": 8500
+    },
+    {
+      "epoch": 0.2546393965638486,
+      "grad_norm": 10.900792121887207,
+      "learning_rate": 0.00011931542906180957,
+      "loss": 34.9212,
+      "step": 8600
+    },
+    {
+      "epoch": 0.2576003197797073,
+      "grad_norm": 10.541563034057617,
+      "learning_rate": 0.00011926184639471465,
+      "loss": 34.8347,
+      "step": 8700
+    },
+    {
+      "epoch": 0.26056124299556604,
+      "grad_norm": 8.576896667480469,
+      "learning_rate": 0.00011920625793430596,
+      "loss": 34.9933,
+      "step": 8800
+    },
+    {
+      "epoch": 0.2635221662114247,
+      "grad_norm": 10.162493705749512,
+      "learning_rate": 0.00011914866556204637,
+      "loss": 34.3925,
+      "step": 8900
+    },
+    {
+      "epoch": 0.26648308942728344,
+      "grad_norm": 11.247607231140137,
+      "learning_rate": 0.0001190890712272237,
+      "loss": 34.4828,
+      "step": 9000
+    },
+    {
+      "epoch": 0.26648308942728344,
+      "eval_loss": 4.2549567222595215,
+      "eval_runtime": 37.961,
+      "eval_samples_per_second": 28.477,
+      "eval_steps_per_second": 7.139,
+      "step": 9000
+    },
+    {
+      "epoch": 0.2694440126431421,
+      "grad_norm": 9.189545631408691,
+      "learning_rate": 0.00011902747694688472,
+      "loss": 34.3655,
+      "step": 9100
+    },
+    {
+      "epoch": 0.27240493585900083,
+      "grad_norm": 11.199912071228027,
+      "learning_rate": 0.000118963884805767,
+      "loss": 34.4358,
+      "step": 9200
+    },
+    {
+      "epoch": 0.27536585907485955,
+      "grad_norm": 9.673705101013184,
+      "learning_rate": 0.00011889829695622823,
+      "loss": 34.3689,
+      "step": 9300
+    },
+    {
+      "epoch": 0.2783267822907182,
+      "grad_norm": 10.03848934173584,
+      "learning_rate": 0.00011883071561817344,
+      "loss": 33.9158,
+      "step": 9400
+    },
+    {
+      "epoch": 0.28128770550657695,
+      "grad_norm": 11.581180572509766,
+      "learning_rate": 0.00011876114307897981,
+      "loss": 33.992,
+      "step": 9500
+    },
+    {
+      "epoch": 0.28424862872243567,
+      "grad_norm": 10.81711483001709,
+      "learning_rate": 0.00011868958169341929,
+      "loss": 34.1195,
+      "step": 9600
+    },
+    {
+      "epoch": 0.28720955193829434,
+      "grad_norm": 9.648648262023926,
+      "learning_rate": 0.00011861603388357893,
+      "loss": 34.1664,
+      "step": 9700
+    },
+    {
+      "epoch": 0.29017047515415306,
+      "grad_norm": 11.37558364868164,
+      "learning_rate": 0.00011854050213877877,
+      "loss": 33.9937,
+      "step": 9800
+    },
+    {
+      "epoch": 0.2931313983700118,
+      "grad_norm": 9.346961975097656,
+      "learning_rate": 0.0001184629890154878,
+      "loss": 33.6917,
+      "step": 9900
+    },
+    {
+      "epoch": 0.29609232158587045,
+      "grad_norm": 11.31644058227539,
+      "learning_rate": 0.0001183834971372372,
+      "loss": 33.7808,
+      "step": 10000
+    },
+    {
+      "epoch": 0.29609232158587045,
+      "eval_loss": 4.157764434814453,
+      "eval_runtime": 39.9196,
+      "eval_samples_per_second": 27.079,
+      "eval_steps_per_second": 6.789,
+      "step": 10000
+    },
+    {
+      "epoch": 0.2990532448017292,
+      "grad_norm": 18.920991897583008,
+      "learning_rate": 0.00011180531798567065,
+      "loss": 44.0571,
+      "step": 10100
+    },
+    {
+      "epoch": 0.3020141680175879,
+      "grad_norm": 15.393646240234375,
+      "learning_rate": 0.0001116461207502148,
+      "loss": 39.8888,
+      "step": 10200
+    },
+    {
+      "epoch": 0.30497509123344657,
+      "grad_norm": 13.2774076461792,
+      "learning_rate": 0.00011148550761026972,
+      "loss": 38.6529,
+      "step": 10300
+    },
+    {
+      "epoch": 0.3079360144493053,
+      "grad_norm": 13.391098976135254,
+      "learning_rate": 0.00011132348296912578,
+      "loss": 37.759,
+      "step": 10400
+    },
+    {
+      "epoch": 0.310896937665164,
+      "grad_norm": 11.917950630187988,
+      "learning_rate": 0.00011116005126877037,
+      "loss": 37.1968,
+      "step": 10500
+    },
+    {
+      "epoch": 0.3138578608810227,
+      "grad_norm": 11.100213050842285,
+      "learning_rate": 0.0001109952169897661,
+      "loss": 37.213,
+      "step": 10600
+    },
+    {
+      "epoch": 0.3168187840968814,
+      "grad_norm": 14.579487800598145,
+      "learning_rate": 0.00011082898465112802,
+      "loss": 36.7415,
+      "step": 10700
+    },
+    {
+      "epoch": 0.31977970731274014,
+      "grad_norm": 11.359614372253418,
+      "learning_rate": 0.00011066135881019965,
+      "loss": 36.361,
+      "step": 10800
+    },
+    {
+      "epoch": 0.3227406305285988,
+      "grad_norm": 12.316486358642578,
+      "learning_rate": 0.00011049234406252809,
+      "loss": 36.0591,
+      "step": 10900
+    },
+    {
+      "epoch": 0.32570155374445753,
+      "grad_norm": 13.45693588256836,
+      "learning_rate": 0.00011032194504173804,
+      "loss": 35.6357,
+      "step": 11000
+    },
+    {
+      "epoch": 0.32570155374445753,
+      "eval_loss": 4.41249418258667,
+      "eval_runtime": 39.8412,
+      "eval_samples_per_second": 27.133,
+      "eval_steps_per_second": 6.802,
+      "step": 11000
+    },
+    {
+      "epoch": 0.32866247696031625,
+      "grad_norm": 10.018808364868164,
+      "learning_rate": 0.00011412090151135696,
+      "loss": 33.7098,
+      "step": 11100
+    },
+    {
+      "epoch": 0.3316234001761749,
+      "grad_norm": 10.30320930480957,
+      "learning_rate": 0.00011397731809339621,
+      "loss": 33.7831,
+      "step": 11200
+    },
+    {
+      "epoch": 0.33458432339203364,
+      "grad_norm": 9.650611877441406,
+      "learning_rate": 0.0001138320949911399,
+      "loss": 33.5415,
+      "step": 11300
+    },
+    {
+      "epoch": 0.33754524660789237,
+      "grad_norm": 8.77065372467041,
+      "learning_rate": 0.0001136852366160714,
+      "loss": 33.2261,
+      "step": 11400
+    },
+    {
+      "epoch": 0.34050616982375104,
+      "grad_norm": 12.062385559082031,
+      "learning_rate": 0.00011353674742934919,
+      "loss": 33.0819,
+      "step": 11500
+    },
+    {
+      "epoch": 0.34346709303960976,
+      "grad_norm": 10.947739601135254,
+      "learning_rate": 0.00011338663194167138,
+      "loss": 33.3451,
+      "step": 11600
+    },
+    {
+      "epoch": 0.34642801625546843,
+      "grad_norm": 9.377535820007324,
+      "learning_rate": 0.00011323489471313875,
+      "loss": 32.8928,
+      "step": 11700
+    },
+    {
+      "epoch": 0.34938893947132715,
+      "grad_norm": 8.902270317077637,
+      "learning_rate": 0.00011308154035311608,
+      "loss": 33.1756,
+      "step": 11800
+    },
+    {
+      "epoch": 0.3523498626871859,
+      "grad_norm": 10.434513092041016,
+      "learning_rate": 0.00011292657352009224,
+      "loss": 33.1595,
+      "step": 11900
+    },
+    {
+      "epoch": 0.35531078590304455,
+      "grad_norm": 11.084539413452148,
+      "learning_rate": 0.00011276999892153867,
+      "loss": 33.359,
+      "step": 12000
+    },
+    {
+      "epoch": 0.35531078590304455,
+      "eval_loss": 4.073917865753174,
+      "eval_runtime": 38.304,
+      "eval_samples_per_second": 28.222,
+      "eval_steps_per_second": 7.075,
+      "step": 12000
+    },
+    {
+      "epoch": 0.35827170911890327,
+      "grad_norm": 7.943862438201904,
+      "learning_rate": 9.143653002276282e-05,
+      "loss": 32.5648,
+      "step": 12100
+    },
+    {
+      "epoch": 0.361232632334762,
+      "grad_norm": 8.098073959350586,
+      "learning_rate": 9.09346201340685e-05,
+      "loss": 32.1551,
+      "step": 12200
+    },
+    {
+      "epoch": 0.36419355555062066,
+      "grad_norm": 7.46992826461792,
+      "learning_rate": 9.042974429385753e-05,
+      "loss": 32.3569,
+      "step": 12300
+    },
+    {
+      "epoch": 0.3671544787664794,
+      "grad_norm": 7.480947971343994,
+      "learning_rate": 8.992195090864853e-05,
+      "loss": 32.4467,
+      "step": 12400
+    },
+    {
+      "epoch": 0.3701154019823381,
+      "grad_norm": 7.488786220550537,
+      "learning_rate": 8.941128866468864e-05,
+      "loss": 32.4447,
+      "step": 12500
+    },
+    {
+      "epoch": 0.3730763251981968,
+      "grad_norm": 8.124217987060547,
+      "learning_rate": 8.889780652328559e-05,
+      "loss": 32.3657,
+      "step": 12600
+    },
+    {
+      "epoch": 0.3760372484140555,
+      "grad_norm": 8.322397232055664,
+      "learning_rate": 8.83815537161135e-05,
+      "loss": 31.9431,
+      "step": 12700
+    },
+    {
+      "epoch": 0.3789981716299142,
+      "grad_norm": 8.59915828704834,
+      "learning_rate": 8.786257974049245e-05,
+      "loss": 31.9211,
+      "step": 12800
+    },
+    {
+      "epoch": 0.3819590948457729,
+      "grad_norm": 8.048558235168457,
+      "learning_rate": 8.734093435464301e-05,
+      "loss": 32.437,
+      "step": 12900
+    },
+    {
+      "epoch": 0.3849200180616316,
+      "grad_norm": 7.816276550292969,
+      "learning_rate": 8.681666757291531e-05,
+      "loss": 32.0396,
+      "step": 13000
+    },
+    {
+      "epoch": 0.3849200180616316,
+      "eval_loss": 3.9447479248046875,
+      "eval_runtime": 112.3499,
+      "eval_samples_per_second": 9.622,
+      "eval_steps_per_second": 2.412,
+      "step": 13000
+    },
+    {
+      "epoch": 0.38788094127749034,
+      "grad_norm": 8.613288879394531,
+      "learning_rate": 8.628982966099388e-05,
+      "loss": 31.874,
+      "step": 13100
+    },
+    {
+      "epoch": 0.390841864493349,
+      "grad_norm": 7.478573799133301,
+      "learning_rate": 8.576047113107821e-05,
+      "loss": 31.7233,
+      "step": 13200
+    },
+    {
+      "epoch": 0.39380278770920774,
+      "grad_norm": 7.845474720001221,
+      "learning_rate": 8.52286427370398e-05,
+      "loss": 31.628,
+      "step": 13300
+    },
+    {
+      "epoch": 0.39676371092506646,
+      "grad_norm": 7.7132158279418945,
+      "learning_rate": 8.469439546955592e-05,
+      "loss": 31.8516,
+      "step": 13400
+    },
+    {
+      "epoch": 0.39972463414092513,
+      "grad_norm": 9.245190620422363,
+      "learning_rate": 8.415778055122073e-05,
+      "loss": 31.8406,
+      "step": 13500
+    },
+    {
+      "epoch": 0.40268555735678385,
+      "grad_norm": 8.426488876342773,
+      "learning_rate": 8.361884943163423e-05,
+      "loss": 31.7148,
+      "step": 13600
+    },
+    {
+      "epoch": 0.4056464805726426,
+      "grad_norm": 7.879675388336182,
+      "learning_rate": 8.307765378246925e-05,
+      "loss": 31.9798,
+      "step": 13700
+    },
+    {
+      "epoch": 0.40860740378850124,
+      "grad_norm": 8.469719886779785,
+      "learning_rate": 8.253424549251735e-05,
+      "loss": 31.6741,
+      "step": 13800
+    },
+    {
+      "epoch": 0.41156832700435997,
+      "grad_norm": 8.198810577392578,
+      "learning_rate": 8.198867666271385e-05,
+      "loss": 31.6722,
+      "step": 13900
+    },
+    {
+      "epoch": 0.41452925022021864,
+      "grad_norm": 7.881684303283691,
+      "learning_rate": 8.144099960114239e-05,
+      "loss": 31.8682,
+      "step": 14000
+    },
+    {
+      "epoch": 0.41452925022021864,
+      "eval_loss": 3.904888153076172,
+      "eval_runtime": 110.4703,
+      "eval_samples_per_second": 9.785,
+      "eval_steps_per_second": 2.453,
+      "step": 14000
+    },
+    {
+      "epoch": 0.41749017343607736,
+      "grad_norm": 7.772391319274902,
+      "learning_rate": 8.089126681801981e-05,
+      "loss": 32.0349,
+      "step": 14100
+    },
+    {
+      "epoch": 0.4204510966519361,
+      "grad_norm": 8.459504127502441,
+      "learning_rate": 8.033953102066161e-05,
+      "loss": 31.5844,
+      "step": 14200
+    },
+    {
+      "epoch": 0.42341201986779475,
+      "grad_norm": 7.765544414520264,
+      "learning_rate": 7.978584510842833e-05,
+      "loss": 31.6879,
+      "step": 14300
+    },
+    {
+      "epoch": 0.4263729430836535,
+      "grad_norm": 8.06749153137207,
+      "learning_rate": 7.923026216765381e-05,
+      "loss": 31.5893,
+      "step": 14400
+    },
+    {
+      "epoch": 0.4293338662995122,
+      "grad_norm": 8.966425895690918,
+      "learning_rate": 7.86728354665553e-05,
+      "loss": 31.392,
+      "step": 14500
+    },
+    {
+      "epoch": 0.43229478951537087,
+      "grad_norm": 8.47319221496582,
+      "learning_rate": 7.81136184501262e-05,
+      "loss": 31.3068,
+      "step": 14600
+    },
+    {
+      "epoch": 0.4352557127312296,
+      "grad_norm": 8.642230033874512,
+      "learning_rate": 7.755266473501193e-05,
+      "loss": 31.5877,
+      "step": 14700
+    },
+    {
+      "epoch": 0.4382166359470883,
+      "grad_norm": 8.412428855895996,
+      "learning_rate": 7.699002810436915e-05,
+      "loss": 31.6239,
+      "step": 14800
+    },
+    {
+      "epoch": 0.441177559162947,
+      "grad_norm": 6.971558094024658,
+      "learning_rate": 7.642576250270929e-05,
+      "loss": 31.7946,
+      "step": 14900
+    },
+    {
+      "epoch": 0.4441384823788057,
+      "grad_norm": 7.922480583190918,
+      "learning_rate": 7.585992203072628e-05,
+      "loss": 31.4474,
+      "step": 15000
+    },
+    {
+      "epoch": 0.4441384823788057,
+      "eval_loss": 3.875948667526245,
+      "eval_runtime": 109.3049,
+      "eval_samples_per_second": 9.89,
+      "eval_steps_per_second": 2.479,
+      "step": 15000
+    },
+    {
+      "epoch": 0.44709940559466443,
+      "grad_norm": 8.747485160827637,
+      "learning_rate": 7.529256094010965e-05,
+      "loss": 31.6016,
+      "step": 15100
+    },
+    {
+      "epoch": 0.4500603288105231,
+      "grad_norm": 8.723346710205078,
+      "learning_rate": 7.472373362834283e-05,
+      "loss": 31.2744,
+      "step": 15200
+    },
+    {
+      "epoch": 0.4530212520263818,
+      "grad_norm": 8.310611724853516,
+      "learning_rate": 7.415349463348775e-05,
+      "loss": 31.7448,
+      "step": 15300
+    },
+    {
+      "epoch": 0.45598217524224055,
+      "grad_norm": 8.236388206481934,
+      "learning_rate": 7.358189862895577e-05,
+      "loss": 30.9859,
+      "step": 15400
+    },
+    {
+      "epoch": 0.4589430984580992,
+      "grad_norm": 8.104386329650879,
+      "learning_rate": 7.300900041826566e-05,
+      "loss": 31.1935,
+      "step": 15500
+    },
+    {
+      "epoch": 0.46190402167395794,
+      "grad_norm": 8.219923973083496,
+      "learning_rate": 7.243485492978928e-05,
+      "loss": 30.9099,
+      "step": 15600
+    },
+    {
+      "epoch": 0.46486494488981667,
+      "grad_norm": 8.872945785522461,
+      "learning_rate": 7.185951721148502e-05,
+      "loss": 31.3423,
+      "step": 15700
+    },
+    {
+      "epoch": 0.46782586810567534,
+      "grad_norm": 8.087647438049316,
+      "learning_rate": 7.128304242561999e-05,
+      "loss": 31.1816,
+      "step": 15800
+    },
+    {
+      "epoch": 0.47078679132153406,
+      "grad_norm": 8.805392265319824,
+      "learning_rate": 7.070548584348108e-05,
+      "loss": 31.0977,
+      "step": 15900
+    },
+    {
+      "epoch": 0.4737477145373928,
+      "grad_norm": 8.469452857971191,
+      "learning_rate": 7.012690284007577e-05,
+      "loss": 31.5828,
+      "step": 16000
+    },
+    {
+      "epoch": 0.4737477145373928,
+      "eval_loss": 3.8530030250549316,
+      "eval_runtime": 109.275,
+      "eval_samples_per_second": 9.892,
+      "eval_steps_per_second": 2.48,
+      "step": 16000
+    },
+    {
+      "epoch": 0.47670863775325145,
+      "grad_norm": 8.871159553527832,
+      "learning_rate": 6.954734888882281e-05,
+      "loss": 30.9753,
+      "step": 16100
+    },
+    {
+      "epoch": 0.4796695609691102,
+      "grad_norm": 8.81116008758545,
+      "learning_rate": 6.896687955623357e-05,
+      "loss": 31.2067,
+      "step": 16200
+    },
+    {
+      "epoch": 0.4826304841849689,
+      "grad_norm": 7.77982759475708,
+      "learning_rate": 6.838555049658432e-05,
+      "loss": 31.089,
+      "step": 16300
+    },
+    {
+      "epoch": 0.48559140740082757,
+      "grad_norm": 8.370245933532715,
+      "learning_rate": 6.780341744658044e-05,
+      "loss": 30.9776,
+      "step": 16400
+    },
+    {
+      "epoch": 0.4885523306166863,
+      "grad_norm": 8.41613483428955,
+      "learning_rate": 6.722053622001221e-05,
+      "loss": 31.1095,
+      "step": 16500
+    },
+    {
+      "epoch": 0.49151325383254496,
+      "grad_norm": 7.951696395874023,
+      "learning_rate": 6.663696270240373e-05,
+      "loss": 31.1532,
+      "step": 16600
+    },
+    {
+      "epoch": 0.4944741770484037,
+      "grad_norm": 9.02717113494873,
+      "learning_rate": 6.60527528456546e-05,
+      "loss": 31.0777,
+      "step": 16700
+    },
+    {
+      "epoch": 0.4974351002642624,
+      "grad_norm": 8.57259750366211,
+      "learning_rate": 6.546796266267535e-05,
+      "loss": 31.3509,
+      "step": 16800
+    },
+    {
+      "epoch": 0.5003960234801211,
+      "grad_norm": 9.129491806030273,
+      "learning_rate": 6.488264822201711e-05,
+      "loss": 30.7844,
+      "step": 16900
+    },
+    {
+      "epoch": 0.5033569466959799,
+      "grad_norm": 8.600064277648926,
+      "learning_rate": 6.429686564249579e-05,
+      "loss": 31.1164,
+      "step": 17000
+    },
+    {
+      "epoch": 0.5033569466959799,
+      "eval_loss": 3.836409091949463,
+      "eval_runtime": 109.0903,
+      "eval_samples_per_second": 9.909,
+      "eval_steps_per_second": 2.484,
+      "step": 17000
+    },
+    {
+      "epoch": 0.5063178699118385,
+      "grad_norm": 8.62096881866455,
+      "learning_rate": 6.371067108781158e-05,
+      "loss": 31.1944,
+      "step": 17100
+    },
+    {
+      "epoch": 0.5092787931276972,
+      "grad_norm": 8.052851676940918,
+      "learning_rate": 6.312412076116401e-05,
+      "loss": 31.0126,
+      "step": 17200
+    },
+    {
+      "epoch": 0.5122397163435559,
+      "grad_norm": 8.32268238067627,
+      "learning_rate": 6.253727089986337e-05,
+      "loss": 31.0692,
+      "step": 17300
+    },
+    {
+      "epoch": 0.5152006395594146,
+      "grad_norm": 8.130902290344238,
+      "learning_rate": 6.195017776993876e-05,
+      "loss": 30.9143,
+      "step": 17400
+    },
+    {
+      "epoch": 0.5181615627752734,
+      "grad_norm": 9.245232582092285,
+      "learning_rate": 6.136289766074334e-05,
+      "loss": 31.0029,
+      "step": 17500
+    },
+    {
+      "epoch": 0.5211224859911321,
+      "grad_norm": 8.296626091003418,
+      "learning_rate": 6.077548687955759e-05,
+      "loss": 31.0624,
+      "step": 17600
+    },
+    {
+      "epoch": 0.5240834092069907,
+      "grad_norm": 8.933104515075684,
+      "learning_rate": 6.018800174619048e-05,
+      "loss": 31.0619,
+      "step": 17700
+    },
+    {
+      "epoch": 0.5270443324228494,
+      "grad_norm": 7.37945032119751,
+      "learning_rate": 5.960049858757974e-05,
+      "loss": 31.3181,
+      "step": 17800
+    },
+    {
+      "epoch": 0.5300052556387082,
+      "grad_norm": 8.817550659179688,
+      "learning_rate": 5.901303373239133e-05,
+      "loss": 30.8424,
+      "step": 17900
+    },
+    {
+      "epoch": 0.5329661788545669,
+      "grad_norm": 7.71854305267334,
+      "learning_rate": 5.842566350561879e-05,
+      "loss": 31.0376,
+      "step": 18000
+    },
+    {
+      "epoch": 0.5329661788545669,
+      "eval_loss": 3.822613477706909,
+      "eval_runtime": 112.0979,
+      "eval_samples_per_second": 9.643,
+      "eval_steps_per_second": 2.418,
+      "step": 18000
+    },
+    {
+      "epoch": 0.5359271020704256,
+      "grad_norm": 8.84870719909668,
+      "learning_rate": 5.7838444223182826e-05,
+      "loss": 30.8901,
+      "step": 18100
+    },
+    {
+      "epoch": 0.5388880252862842,
+      "grad_norm": 7.48129415512085,
+      "learning_rate": 5.725143218653187e-05,
+      "loss": 31.0275,
+      "step": 18200
+    },
+    {
+      "epoch": 0.5418489485021429,
+      "grad_norm": 8.218484878540039,
+      "learning_rate": 5.666468367724412e-05,
+      "loss": 31.1443,
+      "step": 18300
+    },
+    {
+      "epoch": 0.5448098717180017,
+      "grad_norm": 9.589841842651367,
+      "learning_rate": 5.607825495163119e-05,
+      "loss": 30.9756,
+      "step": 18400
+    },
+    {
+      "epoch": 0.5477707949338604,
+      "grad_norm": 8.583683013916016,
+      "learning_rate": 5.549220223534451e-05,
+      "loss": 31.0641,
+      "step": 18500
+    },
+    {
+      "epoch": 0.5507317181497191,
+      "grad_norm": 7.978188991546631,
+      "learning_rate": 5.490658171798439e-05,
+      "loss": 30.8899,
+      "step": 18600
+    },
+    {
+      "epoch": 0.5536926413655778,
+      "grad_norm": 8.130802154541016,
+      "learning_rate": 5.432144954771287e-05,
+      "loss": 31.0812,
+      "step": 18700
+    },
+    {
+      "epoch": 0.5566535645814364,
+      "grad_norm": 8.981709480285645,
+      "learning_rate": 5.37368618258701e-05,
+      "loss": 31.0612,
+      "step": 18800
+    },
+    {
+      "epoch": 0.5596144877972952,
+      "grad_norm": 7.87661075592041,
+      "learning_rate": 5.315287460159561e-05,
+      "loss": 30.8581,
+      "step": 18900
+    },
+    {
+      "epoch": 0.5625754110131539,
+      "grad_norm": 8.329483032226562,
+      "learning_rate": 5.256954386645438e-05,
+      "loss": 31.1805,
+      "step": 19000
+    },
+    {
+      "epoch": 0.5625754110131539,
+      "eval_loss": 3.8131661415100098,
+      "eval_runtime": 111.5683,
+      "eval_samples_per_second": 9.689,
+      "eval_steps_per_second": 2.429,
+      "step": 19000
+    },
+    {
+      "epoch": 0.5655363342290126,
+      "grad_norm": 8.833015441894531,
+      "learning_rate": 5.198692554906851e-05,
+      "loss": 30.9231,
+      "step": 19100
+    },
+    {
+      "epoch": 0.5684972574448713,
+      "grad_norm": 7.966989994049072,
+      "learning_rate": 5.1405075509754834e-05,
+      "loss": 31.0225,
+      "step": 19200
+    },
+    {
+      "epoch": 0.5714581806607301,
+      "grad_norm": 8.791169166564941,
+      "learning_rate": 5.0824049535169166e-05,
+      "loss": 31.1551,
+      "step": 19300
+    },
+    {
+      "epoch": 0.5744191038765887,
+      "grad_norm": 7.9680023193359375,
+      "learning_rate": 5.024390333295761e-05,
+      "loss": 31.0498,
+      "step": 19400
+    },
+    {
+      "epoch": 0.5773800270924474,
+      "grad_norm": 8.603718757629395,
+      "learning_rate": 4.966469252641538e-05,
+      "loss": 30.9017,
+      "step": 19500
+    },
+    {
+      "epoch": 0.5803409503083061,
+      "grad_norm": 12.401627540588379,
+      "learning_rate": 4.908647264915378e-05,
+      "loss": 30.9988,
+      "step": 19600
+    },
+    {
+      "epoch": 0.5833018735241648,
+      "grad_norm": 8.433266639709473,
+      "learning_rate": 4.8509299139775734e-05,
+      "loss": 30.9905,
+      "step": 19700
+    },
+    {
+      "epoch": 0.5862627967400236,
+      "grad_norm": 7.99282693862915,
+      "learning_rate": 4.7933227336560414e-05,
+      "loss": 31.0604,
+      "step": 19800
+    },
+    {
+      "epoch": 0.5892237199558823,
+      "grad_norm": 8.011063575744629,
+      "learning_rate": 4.735831247215753e-05,
+      "loss": 30.7471,
+      "step": 19900
+    },
+    {
+      "epoch": 0.5921846431717409,
+      "grad_norm": 9.603862762451172,
+      "learning_rate": 4.67846096682918e-05,
+      "loss": 30.8428,
+      "step": 20000
+    },
+    {
+      "epoch": 0.5921846431717409,
+      "eval_loss": 3.8060901165008545,
+      "eval_runtime": 112.6154,
+      "eval_samples_per_second": 9.599,
+      "eval_steps_per_second": 2.406,
+      "step": 20000
+    },
+    {
+      "epoch": 0.5951455663875996,
+      "grad_norm": 8.427188873291016,
+      "learning_rate": 4.6212173930477874e-05,
+      "loss": 30.8438,
+      "step": 20100
+    },
+    {
+      "epoch": 0.5981064896034584,
+      "grad_norm": 7.692320346832275,
+      "learning_rate": 4.5641060142746556e-05,
+      "loss": 30.7664,
+      "step": 20200
+    },
+    {
+      "epoch": 0.6010674128193171,
+      "grad_norm": 8.596179962158203,
+      "learning_rate": 4.507132306238262e-05,
+      "loss": 30.9387,
+      "step": 20300
+    },
+    {
+      "epoch": 0.6040283360351758,
+      "grad_norm": 8.076534271240234,
+      "learning_rate": 4.450301731467488e-05,
+      "loss": 30.851,
+      "step": 20400
+    },
+    {
+      "epoch": 0.6069892592510344,
+      "grad_norm": 9.05728816986084,
+      "learning_rate": 4.3936197387678665e-05,
+      "loss": 30.7486,
+      "step": 20500
+    },
+    {
+      "epoch": 0.6099501824668931,
+      "grad_norm": 8.477595329284668,
+      "learning_rate": 4.3370917626991706e-05,
+      "loss": 30.6843,
+      "step": 20600
+    },
+    {
+      "epoch": 0.6129111056827519,
+      "grad_norm": 8.171915054321289,
+      "learning_rate": 4.2807232230543625e-05,
+      "loss": 30.9551,
+      "step": 20700
+    },
+    {
+      "epoch": 0.6158720288986106,
+      "grad_norm": 8.333806991577148,
+      "learning_rate": 4.22451952433994e-05,
+      "loss": 30.8566,
+      "step": 20800
+    },
+    {
+      "epoch": 0.6188329521144693,
+      "grad_norm": 7.9477715492248535,
+      "learning_rate": 4.168486055257777e-05,
+      "loss": 30.8577,
+      "step": 20900
+    },
+    {
+      "epoch": 0.621793875330328,
+      "grad_norm": 8.560218811035156,
+      "learning_rate": 4.112628188188457e-05,
+      "loss": 30.7203,
+      "step": 21000
+    },
+    {
+      "epoch": 0.621793875330328,
+      "eval_loss": 3.7986109256744385,
+      "eval_runtime": 109.4771,
+      "eval_samples_per_second": 9.874,
+      "eval_steps_per_second": 2.475,
+      "step": 21000
+    },
+    {
+      "epoch": 0.6247547985461867,
+      "grad_norm": 8.963776588439941,
+      "learning_rate": 4.056951278676187e-05,
+      "loss": 30.9418,
+      "step": 21100
+    },
+    {
+      "epoch": 0.6277157217620454,
+      "grad_norm": 8.338837623596191,
+      "learning_rate": 4.001460664915308e-05,
+      "loss": 30.756,
+      "step": 21200
+    },
+    {
+      "epoch": 0.6306766449779041,
+      "grad_norm": 8.323155403137207,
+      "learning_rate": 3.946161667238485e-05,
+      "loss": 30.6959,
+      "step": 21300
+    },
+    {
+      "epoch": 0.6336375681937628,
+      "grad_norm": 9.881996154785156,
+      "learning_rate": 3.8910595876066085e-05,
+      "loss": 30.9333,
+      "step": 21400
+    },
+    {
+      "epoch": 0.6365984914096215,
+      "grad_norm": 8.089996337890625,
+      "learning_rate": 3.836159709100446e-05,
+      "loss": 30.6899,
+      "step": 21500
+    },
+    {
+      "epoch": 0.6395594146254803,
+      "grad_norm": 7.9427289962768555,
+      "learning_rate": 3.7814672954141055e-05,
+      "loss": 30.8046,
+      "step": 21600
+    },
+    {
+      "epoch": 0.6425203378413389,
+      "grad_norm": 8.468146324157715,
+      "learning_rate": 3.7269875903503826e-05,
+      "loss": 31.2292,
+      "step": 21700
+    },
+    {
+      "epoch": 0.6454812610571976,
+      "grad_norm": 8.63842487335205,
+      "learning_rate": 3.672725817317973e-05,
+      "loss": 30.7721,
+      "step": 21800
+    },
+    {
+      "epoch": 0.6484421842730563,
+      "grad_norm": 8.145241737365723,
+      "learning_rate": 3.6186871788306674e-05,
+      "loss": 30.5881,
+      "step": 21900
+    },
+    {
+      "epoch": 0.6514031074889151,
+      "grad_norm": 8.194993019104004,
+      "learning_rate": 3.5648768560085604e-05,
+      "loss": 30.9425,
+      "step": 22000
+    },
+    {
+      "epoch": 0.6514031074889151,
+      "eval_loss": 3.7950870990753174,
+      "eval_runtime": 109.4264,
+      "eval_samples_per_second": 9.879,
+      "eval_steps_per_second": 2.477,
+      "step": 22000
+    },
+    {
+      "epoch": 0.6543640307047738,
+      "grad_norm": 9.304323196411133,
+      "learning_rate": 3.511300008081273e-05,
+      "loss": 30.722,
+      "step": 22100
+    },
+    {
+      "epoch": 0.6573249539206325,
+      "grad_norm": 7.82930850982666,
+      "learning_rate": 3.4579617718933054e-05,
+      "loss": 30.7943,
+      "step": 22200
+    },
+    {
+      "epoch": 0.6602858771364911,
+      "grad_norm": 7.912548542022705,
+      "learning_rate": 3.4048672614115294e-05,
+      "loss": 30.8451,
+      "step": 22300
+    },
+    {
+      "epoch": 0.6632468003523498,
+      "grad_norm": 8.46181583404541,
+      "learning_rate": 3.352021567234869e-05,
+      "loss": 30.9009,
+      "step": 22400
+    },
+    {
+      "epoch": 0.6662077235682086,
+      "grad_norm": 7.727646827697754,
+      "learning_rate": 3.299429756106215e-05,
+      "loss": 30.8281,
+      "step": 22500
+    },
+    {
+      "epoch": 0.6691686467840673,
+      "grad_norm": 8.119136810302734,
+      "learning_rate": 3.247096870426649e-05,
+      "loss": 30.7757,
+      "step": 22600
+    },
+    {
+      "epoch": 0.672129569999926,
+      "grad_norm": 8.091607093811035,
+      "learning_rate": 3.195027927771982e-05,
+      "loss": 30.8661,
+      "step": 22700
+    },
+    {
+      "epoch": 0.6750904932157847,
+      "grad_norm": 7.598474979400635,
+      "learning_rate": 3.1432279204116776e-05,
+      "loss": 30.6257,
+      "step": 22800
+    },
+    {
+      "epoch": 0.6780514164316434,
+      "grad_norm": 9.547100067138672,
+      "learning_rate": 3.091701814830198e-05,
+      "loss": 30.8582,
+      "step": 22900
+    },
+    {
+      "epoch": 0.6810123396475021,
+      "grad_norm": 7.637078762054443,
+      "learning_rate": 3.0404545512508415e-05,
+      "loss": 30.9432,
+      "step": 23000
+    },
+    {
+      "epoch": 0.6810123396475021,
+      "eval_loss": 3.791748285293579,
+      "eval_runtime": 109.2867,
+      "eval_samples_per_second": 9.891,
+      "eval_steps_per_second": 2.48,
+      "step": 23000
+    },
+    {
+      "epoch": 0.6839732628633608,
+      "grad_norm": 8.485209465026855,
+      "learning_rate": 2.98949104316207e-05,
+      "loss": 30.921,
+      "step": 23100
+    },
+    {
+      "epoch": 0.6869341860792195,
+      "grad_norm": 7.777042865753174,
+      "learning_rate": 2.938816176846421e-05,
+      "loss": 30.8116,
+      "step": 23200
+    },
+    {
+      "epoch": 0.6898951092950782,
+      "grad_norm": 7.6587138175964355,
+      "learning_rate": 2.8884348109120106e-05,
+      "loss": 30.7965,
+      "step": 23300
+    },
+    {
+      "epoch": 0.6928560325109369,
+      "grad_norm": 8.276775360107422,
+      "learning_rate": 2.8383517758267178e-05,
+      "loss": 30.6582,
+      "step": 23400
+    },
+    {
+      "epoch": 0.6958169557267956,
+      "grad_norm": 7.5494771003723145,
+      "learning_rate": 2.7885718734550257e-05,
+      "loss": 30.6483,
+      "step": 23500
+    },
+    {
+      "epoch": 0.6987778789426543,
+      "grad_norm": 7.938130855560303,
+      "learning_rate": 2.739099876597646e-05,
+      "loss": 30.529,
+      "step": 23600
+    },
+    {
+      "epoch": 0.701738802158513,
+      "grad_norm": 8.202885627746582,
+      "learning_rate": 2.6899405285339026e-05,
+      "loss": 30.825,
+      "step": 23700
+    },
+    {
+      "epoch": 0.7046997253743718,
+      "grad_norm": 8.393240928649902,
+      "learning_rate": 2.6410985425669622e-05,
+      "loss": 30.7867,
+      "step": 23800
+    },
+    {
+      "epoch": 0.7076606485902305,
+      "grad_norm": 8.32459831237793,
+      "learning_rate": 2.5925786015719207e-05,
+      "loss": 30.7898,
+      "step": 23900
+    },
+    {
+      "epoch": 0.7106215718060891,
+      "grad_norm": 9.323598861694336,
+      "learning_rate": 2.544385357546831e-05,
+      "loss": 30.5684,
+      "step": 24000
+    },
+    {
+      "epoch": 0.7106215718060891,
+      "eval_loss": 3.789947271347046,
+      "eval_runtime": 110.7686,
+      "eval_samples_per_second": 9.759,
+      "eval_steps_per_second": 2.447,
+      "step": 24000
+    },
+    {
+      "epoch": 0.7135824950219478,
+      "grad_norm": 8.184738159179688,
+      "learning_rate": 2.4965234311666717e-05,
+      "loss": 30.7187,
+      "step": 24100
+    },
+    {
+      "epoch": 0.7165434182378065,
+      "grad_norm": 7.82784366607666,
+      "learning_rate": 2.4489974113403275e-05,
+      "loss": 30.705,
+      "step": 24200
+    },
+    {
+      "epoch": 0.7195043414536653,
+      "grad_norm": 7.945186614990234,
+      "learning_rate": 2.4018118547706078e-05,
+      "loss": 30.4846,
+      "step": 24300
+    },
+    {
+      "epoch": 0.722465264669524,
+      "grad_norm": 9.277371406555176,
+      "learning_rate": 2.3549712855173688e-05,
+      "loss": 30.6765,
+      "step": 24400
+    },
+    {
+      "epoch": 0.7254261878853827,
+      "grad_norm": 8.619938850402832,
+      "learning_rate": 2.3084801945637512e-05,
+      "loss": 30.6503,
+      "step": 24500
+    },
+    {
+      "epoch": 0.7283871111012413,
+      "grad_norm": 8.467925071716309,
+      "learning_rate": 2.262343039385585e-05,
+      "loss": 30.957,
+      "step": 24600
+    },
+    {
+      "epoch": 0.7313480343171,
+      "grad_norm": 8.035057067871094,
+      "learning_rate": 2.216564243524035e-05,
+      "loss": 30.6764,
+      "step": 24700
+    },
+    {
+      "epoch": 0.7343089575329588,
+      "grad_norm": 7.555221080780029,
+      "learning_rate": 2.1711481961614565e-05,
+      "loss": 30.7666,
+      "step": 24800
+    },
+    {
+      "epoch": 0.7372698807488175,
+      "grad_norm": 7.959348201751709,
+      "learning_rate": 2.1260992517005892e-05,
+      "loss": 30.8212,
+      "step": 24900
+    },
+    {
+      "epoch": 0.7402308039646762,
+      "grad_norm": 7.882981300354004,
+      "learning_rate": 2.0814217293470476e-05,
+      "loss": 30.8312,
+      "step": 25000
+    },
+    {
+      "epoch": 0.7402308039646762,
+      "eval_loss": 3.7874350547790527,
+      "eval_runtime": 107.8316,
+      "eval_samples_per_second": 10.025,
+      "eval_steps_per_second": 2.513,
+      "step": 25000
+    },
+    {
+      "epoch": 0.743191727180535,
+      "grad_norm": 7.499105930328369,
+      "learning_rate": 2.0371199126952268e-05,
+      "loss": 30.9958,
+      "step": 25100
+    },
+    {
+      "epoch": 0.7461526503963936,
+      "grad_norm": 7.973631381988525,
+      "learning_rate": 1.9931980493175735e-05,
+      "loss": 30.6469,
+      "step": 25200
+    },
+    {
+      "epoch": 0.7491135736122523,
+      "grad_norm": 7.996872425079346,
+      "learning_rate": 1.949660350357356e-05,
+      "loss": 30.6363,
+      "step": 25300
+    },
+    {
+      "epoch": 0.752074496828111,
+      "grad_norm": 8.139349937438965,
+      "learning_rate": 1.9065109901249e-05,
+      "loss": 30.924,
+      "step": 25400
+    },
+    {
+      "epoch": 0.7550354200439697,
+      "grad_norm": 8.981887817382812,
+      "learning_rate": 1.863754105697369e-05,
+      "loss": 30.9555,
+      "step": 25500
+    },
+    {
+      "epoch": 0.7579963432598285,
+      "grad_norm": 7.660996913909912,
+      "learning_rate": 1.821393796522096e-05,
+      "loss": 30.8007,
+      "step": 25600
+    },
+    {
+      "epoch": 0.7609572664756871,
+      "grad_norm": 7.750844955444336,
+      "learning_rate": 1.7794341240235615e-05,
+      "loss": 30.7227,
+      "step": 25700
+    },
+    {
+      "epoch": 0.7639181896915458,
+      "grad_norm": 7.581575870513916,
+      "learning_rate": 1.737879111213961e-05,
+      "loss": 30.6509,
+      "step": 25800
+    },
+    {
+      "epoch": 0.7668791129074045,
+      "grad_norm": 8.771635055541992,
+      "learning_rate": 1.6967327423075142e-05,
+      "loss": 30.7893,
+      "step": 25900
+    },
+    {
+      "epoch": 0.7698400361232632,
+      "grad_norm": 8.594512939453125,
+      "learning_rate": 1.6559989623384456e-05,
+      "loss": 30.6874,
+      "step": 26000
+    },
+    {
+      "epoch": 0.7698400361232632,
+      "eval_loss": 3.7861363887786865,
+      "eval_runtime": 112.2096,
+      "eval_samples_per_second": 9.634,
+      "eval_steps_per_second": 2.415,
+      "step": 26000
+    },
+    {
+      "epoch": 0.772800959339122,
+      "grad_norm": 7.919267177581787,
+      "learning_rate": 1.615681676782755e-05,
+      "loss": 30.7685,
+      "step": 26100
+    },
+    {
+      "epoch": 0.7757618825549807,
+      "grad_norm": 7.744143009185791,
+      "learning_rate": 1.5757847511837648e-05,
+      "loss": 30.7558,
+      "step": 26200
+    },
+    {
+      "epoch": 0.7787228057708393,
+      "grad_norm": 7.894962787628174,
+      "learning_rate": 1.5363120107814955e-05,
+      "loss": 30.7543,
+      "step": 26300
+    },
+    {
+      "epoch": 0.781683728986698,
+      "grad_norm": 9.573600769042969,
+      "learning_rate": 1.4972672401459143e-05,
+      "loss": 30.808,
+      "step": 26400
+    },
+    {
+      "epoch": 0.7846446522025567,
+      "grad_norm": 7.708218574523926,
+      "learning_rate": 1.4586541828140706e-05,
+      "loss": 30.6115,
+      "step": 26500
+    },
+    {
+      "epoch": 0.7876055754184155,
+      "grad_norm": 8.170422554016113,
+      "learning_rate": 1.4204765409311852e-05,
+      "loss": 30.8811,
+      "step": 26600
+    },
+    {
+      "epoch": 0.7905664986342742,
+      "grad_norm": 8.293937683105469,
+      "learning_rate": 1.3827379748956783e-05,
+      "loss": 30.8484,
+      "step": 26700
+    },
+    {
+      "epoch": 0.7935274218501329,
+      "grad_norm": 7.64206075668335,
+      "learning_rate": 1.3454421030082402e-05,
+      "loss": 30.7768,
+      "step": 26800
+    },
+    {
+      "epoch": 0.7964883450659915,
+      "grad_norm": 7.780085563659668,
+      "learning_rate": 1.3085925011248902e-05,
+      "loss": 30.6903,
+      "step": 26900
+    },
+    {
+      "epoch": 0.7994492682818503,
+      "grad_norm": 7.651244640350342,
+      "learning_rate": 1.2721927023141509e-05,
+      "loss": 30.8888,
+      "step": 27000
+    },
+    {
+      "epoch": 0.7994492682818503,
+      "eval_loss": 3.7866110801696777,
+      "eval_runtime": 111.3993,
+      "eval_samples_per_second": 9.704,
+      "eval_steps_per_second": 2.433,
+      "step": 27000
+    },
+    {
+      "epoch": 0.802410191497709,
+      "grad_norm": 7.893172740936279,
+      "learning_rate": 1.2362461965182951e-05,
+      "loss": 30.8551,
+      "step": 27100
+    },
+    {
+      "epoch": 0.8053711147135677,
+      "grad_norm": 8.348461151123047,
+      "learning_rate": 1.2007564302187395e-05,
+      "loss": 30.9086,
+      "step": 27200
+    },
+    {
+      "epoch": 0.8083320379294264,
+      "grad_norm": 8.005925178527832,
+      "learning_rate": 1.1657268061055954e-05,
+      "loss": 30.6258,
+      "step": 27300
+    },
+    {
+      "epoch": 0.8112929611452852,
+      "grad_norm": 7.919161319732666,
+      "learning_rate": 1.1311606827514432e-05,
+      "loss": 30.4614,
+      "step": 27400
+    },
+    {
+      "epoch": 0.8142538843611438,
+      "grad_norm": 8.806751251220703,
+      "learning_rate": 1.0970613742892959e-05,
+      "loss": 30.9882,
+      "step": 27500
+    },
+    {
+      "epoch": 0.8172148075770025,
+      "grad_norm": 8.126434326171875,
+      "learning_rate": 1.0634321500948665e-05,
+      "loss": 30.6459,
+      "step": 27600
+    },
+    {
+      "epoch": 0.8201757307928612,
+      "grad_norm": 7.643808364868164,
+      "learning_rate": 1.0302762344730893e-05,
+      "loss": 30.6614,
+      "step": 27700
+    },
+    {
+      "epoch": 0.8231366540087199,
+      "grad_norm": 8.046734809875488,
+      "learning_rate": 9.97596806349001e-06,
+      "loss": 30.6958,
+      "step": 27800
+    },
+    {
+      "epoch": 0.8260975772245787,
+      "grad_norm": 8.094582557678223,
+      "learning_rate": 9.653969989629268e-06,
+      "loss": 30.5807,
+      "step": 27900
+    },
+    {
+      "epoch": 0.8290585004404373,
+      "grad_norm": 8.062453269958496,
+      "learning_rate": 9.336798995700899e-06,
+      "loss": 30.8323,
+      "step": 28000
+    },
+    {
+      "epoch": 0.8290585004404373,
+      "eval_loss": 3.786661386489868,
+      "eval_runtime": 106.9454,
+      "eval_samples_per_second": 10.108,
+      "eval_steps_per_second": 2.534,
+      "step": 28000
+    },
+    {
+      "epoch": 0.832019423656296,
+      "grad_norm": 7.68911075592041,
+      "learning_rate": 9.024485491446045e-06,
+      "loss": 30.9853,
+      "step": 28100
+    },
+    {
+      "epoch": 0.8349803468721547,
+      "grad_norm": 7.82414436340332,
+      "learning_rate": 8.717059420879143e-06,
+      "loss": 30.5061,
+      "step": 28200
+    },
+    {
+      "epoch": 0.8379412700880134,
+      "grad_norm": 7.392062664031982,
+      "learning_rate": 8.414550259416917e-06,
+      "loss": 30.9525,
+      "step": 28300
+    },
+    {
+      "epoch": 0.8409021933038722,
+      "grad_norm": 7.675992965698242,
+      "learning_rate": 8.116987011052387e-06,
+      "loss": 30.8296,
+      "step": 28400
+    },
+    {
+      "epoch": 0.8438631165197309,
+      "grad_norm": 8.038030624389648,
+      "learning_rate": 7.824398205574006e-06,
+      "loss": 30.8155,
+      "step": 28500
+    },
+    {
+      "epoch": 0.8468240397355895,
+      "grad_norm": 7.427101135253906,
+      "learning_rate": 7.536811895830222e-06,
+      "loss": 30.9259,
+      "step": 28600
+    },
+    {
+      "epoch": 0.8497849629514482,
+      "grad_norm": 8.095186233520508,
+      "learning_rate": 7.254255655039919e-06,
+      "loss": 30.824,
+      "step": 28700
+    },
+    {
+      "epoch": 0.852745886167307,
+      "grad_norm": 7.521733283996582,
+      "learning_rate": 6.9767565741486815e-06,
+      "loss": 30.7226,
+      "step": 28800
+    },
+    {
+      "epoch": 0.8557068093831657,
+      "grad_norm": 7.494954586029053,
+      "learning_rate": 6.704341259231415e-06,
+      "loss": 30.7789,
+      "step": 28900
+    },
+    {
+      "epoch": 0.8586677325990244,
+      "grad_norm": 7.641082763671875,
+      "learning_rate": 6.437035828941324e-06,
+      "loss": 30.8001,
+      "step": 29000
+    },
+    {
+      "epoch": 0.8586677325990244,
+      "eval_loss": 3.786005973815918,
+      "eval_runtime": 111.1521,
+      "eval_samples_per_second": 9.725,
+      "eval_steps_per_second": 2.438,
+      "step": 29000
+    },
+    {
+      "epoch": 0.8616286558148831,
+      "grad_norm": 7.96475887298584,
+      "learning_rate": 6.1748659120058386e-06,
+      "loss": 30.8879,
+      "step": 29100
+    },
+    {
+      "epoch": 0.8645895790307417,
+      "grad_norm": 6.990954875946045,
+      "learning_rate": 5.917856644769242e-06,
+      "loss": 30.6077,
+      "step": 29200
+    },
+    {
+      "epoch": 0.8675505022466005,
+      "grad_norm": 7.170067310333252,
+      "learning_rate": 5.666032668782735e-06,
+      "loss": 30.8456,
+      "step": 29300
+    },
+    {
+      "epoch": 0.8705114254624592,
+      "grad_norm": 8.4426851272583,
+      "learning_rate": 5.419418128441846e-06,
+      "loss": 30.9228,
+      "step": 29400
+    },
+    {
+      "epoch": 0.8734723486783179,
+      "grad_norm": 8.034204483032227,
+      "learning_rate": 5.178036668671475e-06,
+      "loss": 30.7785,
+      "step": 29500
+    },
+    {
+      "epoch": 0.8764332718941766,
+      "grad_norm": 7.411805629730225,
+      "learning_rate": 4.941911432658868e-06,
+      "loss": 30.7495,
+      "step": 29600
+    },
+    {
+      "epoch": 0.8793941951100354,
+      "grad_norm": 7.887239456176758,
+      "learning_rate": 4.7110650596347335e-06,
+      "loss": 30.7797,
+      "step": 29700
+    },
+    {
+      "epoch": 0.882355118325894,
+      "grad_norm": 8.600279808044434,
+      "learning_rate": 4.48551968270261e-06,
+      "loss": 30.8267,
+      "step": 29800
+    },
+    {
+      "epoch": 0.8853160415417527,
+      "grad_norm": 8.055954933166504,
+      "learning_rate": 4.26529692671679e-06,
+      "loss": 30.8123,
+      "step": 29900
+    },
+    {
+      "epoch": 0.8882769647576114,
+      "grad_norm": 7.540750503540039,
+      "learning_rate": 4.050417906208945e-06,
+      "loss": 30.8866,
+      "step": 30000
+    },
+    {
+      "epoch": 0.8882769647576114,
+      "eval_loss": 3.7849574089050293,
+      "eval_runtime": 108.0072,
+      "eval_samples_per_second": 10.009,
+      "eval_steps_per_second": 2.509,
+      "step": 30000
+    },
+    {
+      "epoch": 0.8912378879734701,
+      "grad_norm": 7.607705593109131,
+      "learning_rate": 3.840903223363752e-06,
+      "loss": 30.7932,
+      "step": 30100
+    },
+    {
+      "epoch": 0.8941988111893289,
+      "grad_norm": 7.834300518035889,
+      "learning_rate": 3.636772966043571e-06,
+      "loss": 30.6935,
+      "step": 30200
+    },
+    {
+      "epoch": 0.8971597344051876,
+      "grad_norm": 9.865922927856445,
+      "learning_rate": 3.4380467058624585e-06,
+      "loss": 30.5129,
+      "step": 30300
+    },
+    {
+      "epoch": 0.9001206576210462,
+      "grad_norm": 7.9707865715026855,
+      "learning_rate": 3.244743496309701e-06,
+      "loss": 30.8035,
+      "step": 30400
+    },
+    {
+      "epoch": 0.9030815808369049,
+      "grad_norm": 8.035768508911133,
+      "learning_rate": 3.0568818709229364e-06,
+      "loss": 30.4973,
+      "step": 30500
+    },
+    {
+      "epoch": 0.9060425040527637,
+      "grad_norm": 8.816192626953125,
+      "learning_rate": 2.8744798415113015e-06,
+      "loss": 30.5553,
+      "step": 30600
+    },
+    {
+      "epoch": 0.9090034272686224,
+      "grad_norm": 7.411801338195801,
+      "learning_rate": 2.6975548964283823e-06,
+      "loss": 30.6758,
+      "step": 30700
+    },
+    {
+      "epoch": 0.9119643504844811,
+      "grad_norm": 7.46308708190918,
+      "learning_rate": 2.5261239988955733e-06,
+      "loss": 30.8337,
+      "step": 30800
+    },
+    {
+      "epoch": 0.9149252737003397,
+      "grad_norm": 8.57913875579834,
+      "learning_rate": 2.360203585375571e-06,
+      "loss": 31.0671,
+      "step": 30900
+    },
+    {
+      "epoch": 0.9178861969161984,
+      "grad_norm": 7.983087062835693,
+      "learning_rate": 2.1998095639965577e-06,
+      "loss": 30.913,
+      "step": 31000
+    },
+    {
+      "epoch": 0.9178861969161984,
+      "eval_loss": 3.785719394683838,
+      "eval_runtime": 110.9703,
+      "eval_samples_per_second": 9.741,
+      "eval_steps_per_second": 2.442,
+      "step": 31000
+    },
+    {
+      "epoch": 0.9208471201320572,
+      "grad_norm": 8.11637020111084,
+      "learning_rate": 2.044957313026925e-06,
+      "loss": 30.7294,
+      "step": 31100
+    },
+    {
+      "epoch": 0.9238080433479159,
+      "grad_norm": 7.882040977478027,
+      "learning_rate": 1.895661679400842e-06,
+      "loss": 30.7816,
+      "step": 31200
+    },
+    {
+      "epoch": 0.9267689665637746,
+      "grad_norm": 7.475772857666016,
+      "learning_rate": 1.7519369772947525e-06,
+      "loss": 30.5198,
+      "step": 31300
+    },
+    {
+      "epoch": 0.9297298897796333,
+      "grad_norm": 8.094454765319824,
+      "learning_rate": 1.6137969867549674e-06,
+      "loss": 30.8313,
+      "step": 31400
+    },
+    {
+      "epoch": 0.932690812995492,
+      "grad_norm": 8.635899543762207,
+      "learning_rate": 1.4812549523764674e-06,
+      "loss": 30.6539,
+      "step": 31500
+    },
+    {
+      "epoch": 0.9356517362113507,
+      "grad_norm": 7.975414752960205,
+      "learning_rate": 1.354323582033039e-06,
+      "loss": 30.5804,
+      "step": 31600
+    },
+    {
+      "epoch": 0.9386126594272094,
+      "grad_norm": 7.660233020782471,
+      "learning_rate": 1.233015045658823e-06,
+      "loss": 30.6357,
+      "step": 31700
+    },
+    {
+      "epoch": 0.9415735826430681,
+      "grad_norm": 8.09595012664795,
+      "learning_rate": 1.1173409740815532e-06,
+      "loss": 30.7201,
+      "step": 31800
+    },
+    {
+      "epoch": 0.9445345058589268,
+      "grad_norm": 8.44491958618164,
+      "learning_rate": 1.0073124579073701e-06,
+      "loss": 30.7462,
+      "step": 31900
+    },
+    {
+      "epoch": 0.9474954290747856,
+      "grad_norm": 8.275026321411133,
+      "learning_rate": 9.0294004645749e-07,
+      "loss": 30.7256,
+      "step": 32000
+    },
+    {
+      "epoch": 0.9474954290747856,
+      "eval_loss": 3.7850279808044434,
+      "eval_runtime": 109.0824,
+      "eval_samples_per_second": 9.91,
+      "eval_steps_per_second": 2.484,
+      "step": 32000
+    },
+    {
+      "epoch": 0.9504563522906442,
+      "grad_norm": 7.571169376373291,
+      "learning_rate": 8.042337467567484e-07,
+      "loss": 30.7194,
+      "step": 32100
+    },
+    {
+      "epoch": 0.9534172755065029,
+      "grad_norm": 8.020681381225586,
+      "learning_rate": 7.112030225741472e-07,
+      "loss": 30.5828,
+      "step": 32200
+    },
+    {
+      "epoch": 0.9563781987223616,
+      "grad_norm": 7.482342720031738,
+      "learning_rate": 6.238567935155004e-07,
+      "loss": 30.5888,
+      "step": 32300
+    },
+    {
+      "epoch": 0.9593391219382204,
+      "grad_norm": 8.336071014404297,
+      "learning_rate": 5.422034341682314e-07,
+      "loss": 30.858,
+      "step": 32400
+    },
+    {
+      "epoch": 0.9623000451540791,
+      "grad_norm": 7.819650173187256,
+      "learning_rate": 4.6625077329842224e-07,
+      "loss": 30.6983,
+      "step": 32500
+    },
+    {
+      "epoch": 0.9652609683699378,
+      "grad_norm": 8.101078987121582,
+      "learning_rate": 3.960060931002141e-07,
+      "loss": 30.7803,
+      "step": 32600
+    },
+    {
+      "epoch": 0.9682218915857964,
+      "grad_norm": 9.275129318237305,
+      "learning_rate": 3.3147612849762533e-07,
+      "loss": 30.8961,
+      "step": 32700
+    },
+    {
+      "epoch": 0.9711828148016551,
+      "grad_norm": 8.00763988494873,
+      "learning_rate": 2.7266706649877516e-07,
+      "loss": 30.9344,
+      "step": 32800
+    },
+    {
+      "epoch": 0.9741437380175139,
+      "grad_norm": 8.840792655944824,
+      "learning_rate": 2.1958454560274455e-07,
+      "loss": 30.7027,
+      "step": 32900
+    },
+    {
+      "epoch": 0.9771046612333726,
+      "grad_norm": 8.015409469604492,
+      "learning_rate": 1.722336552589021e-07,
+      "loss": 30.7569,
+      "step": 33000
+    },
+    {
+      "epoch": 0.9771046612333726,
+      "eval_loss": 3.785550117492676,
+      "eval_runtime": 109.3256,
+      "eval_samples_per_second": 9.888,
+      "eval_steps_per_second": 2.479,
+      "step": 33000
+    },
+    {
+      "epoch": 0.9800655844492313,
+      "grad_norm": 8.226040840148926,
+      "learning_rate": 1.3061893537898773e-07,
+      "loss": 30.6858,
+      "step": 33100
+    },
+    {
+      "epoch": 0.9830265076650899,
+      "grad_norm": 7.274777889251709,
+      "learning_rate": 9.474437590182072e-08,
+      "loss": 30.701,
+      "step": 33200
+    },
+    {
+      "epoch": 0.9859874308809486,
+      "grad_norm": 7.866406440734863,
+      "learning_rate": 6.46134164107326e-08,
+      "loss": 30.6392,
+      "step": 33300
+    },
+    {
+      "epoch": 0.9889483540968074,
+      "grad_norm": 7.722043514251709,
+      "learning_rate": 4.022894580381742e-08,
+      "loss": 30.8502,
+      "step": 33400
+    },
+    {
+      "epoch": 0.9919092773126661,
+      "grad_norm": 7.612312316894531,
+      "learning_rate": 2.1593302016933437e-08,
+      "loss": 30.7914,
+      "step": 33500
+    },
+    {
+      "epoch": 0.9948702005285248,
+      "grad_norm": 8.195243835449219,
+      "learning_rate": 8.708271799542367e-09,
+      "loss": 30.8885,
+      "step": 33600
+    },
+    {
+      "epoch": 0.9978311237443835,
+      "grad_norm": 8.127638816833496,
+      "learning_rate": 1.5750905434130935e-09,
+      "loss": 30.9894,
+      "step": 33700
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 33773,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.598282561239384e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4c7abecf39843cd0ef9dea5e194fc495c60905578d8e9c6fafa20b2cc00b4dfc
+size 5777

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff